From d41cc702cc4ba3782ebe3b2e189633607d5ccd6a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 30 Jan 2006 08:53:33 +0000 Subject: [GFS2] Export file_ra_state_init Export file_ra_state_init so that its possible to use the already exported functions which require a struct ra_state as an argument from a module. Signed-off-by: Steven Whitehouse --- mm/readahead.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 8d6eeaa..9f0b982 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -38,6 +38,7 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) ra->ra_pages = mapping->backing_dev_info->ra_pages; ra->prev_page = -1; } +EXPORT_SYMBOL_GPL(file_ra_state_init); /* * Return max readahead size for this inode in number-of-pages. -- cgit v1.1 From c25ec8f5684cb3c5dde6a67c1bbc33a449eefbe2 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 30 Jan 2006 08:57:31 +0000 Subject: [GFS2] Export file_read_actor Export file_read_actor so that it can be used from modules since functions which take this function as an argument are already exported. Signed-off-by: Steven Whitehouse --- mm/filemap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 44da3d4..7624c26 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -981,6 +981,7 @@ success: desc->arg.buf += size; return size; } +EXPORT_SYMBOL(file_read_actor); /* * This is the "read()" routine for all filesystems -- cgit v1.1 From bf9f424d9acf461fabcb0e9f47b356e08186d91f Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 21 Jun 2006 11:54:43 -0400 Subject: [GFS2] Make file_read_actor export _GPL Make file_read_actor export a _GPL export. Signed-off-by: Steven Whitehouse --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 82c4488..a02a0b2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1044,7 +1044,7 @@ success: desc->arg.buf += size; return size; } -EXPORT_SYMBOL(file_read_actor); +EXPORT_SYMBOL_GPL(file_read_actor); /* * This is the "read()" routine for all filesystems -- cgit v1.1 From a9e5f4d0780ec9cda7a70b08294d7718431b62a1 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 25 Jul 2006 17:24:12 -0400 Subject: [GFS2] Alter direct I/O path As per comments received, alter the GFS2 direct I/O path so that it uses the standard read functions "out of the box". Needs a small change to one of the VFS functions. This reduces the size of the code quite a lot and also removes the need for one new export. Some more work remains to be done, but this is the bones of the thing. Signed-off-by: Steven Whitehouse --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index b9c91ab..a92d690 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1122,7 +1122,6 @@ success: desc->arg.buf += size; return size; } -EXPORT_SYMBOL_GPL(file_read_actor); /** * __generic_file_aio_read - generic filesystem read routine @@ -1184,7 +1183,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, *ppos = pos + retval; } file_accessed(filp); - goto out; + if (retval != 0) + goto out; } retval = 0; -- cgit v1.1 From 0e0bcae3bfb3c88dbe14735fa69d7d88794dc73a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 27 Sep 2006 14:45:07 -0400 Subject: [GFS2] Fix direct i/o logic in filemap.c We shouldn't mark the file accessed in the case that it wasn't accessed. 
Signed-off-by: Steven Whitehouse --- mm/filemap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index d9bbea1..91a741d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1180,9 +1180,10 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (retval > 0) *ppos = pos + retval; } - file_accessed(filp); - if (retval != 0) + if (likely(retval != 0)) { + file_accessed(flip); goto out; + } } retval = 0; -- cgit v1.1 From 3f1a9aaeffd8d1cbc5ab9776c45cbd66af1c9699 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 27 Sep 2006 14:52:48 -0400 Subject: [GFS2] Fix typo in last patch Signed-off-by: Steven Whitehouse --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 91a741d..3195806 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1181,7 +1181,7 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, *ppos = pos + retval; } if (likely(retval != 0)) { - file_accessed(flip); + file_accessed(filp); goto out; } } -- cgit v1.1 From 038b0a6d8d32db934bba6a24e74e76e4e327a94f Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 4 Oct 2006 03:38:54 -0400 Subject: Remove all inclusions of kbuild explicitly includes this at build time. Signed-off-by: Dave Jones --- mm/filemap.h | 1 - mm/slab.c | 1 - mm/vmstat.c | 1 - 3 files changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.h b/mm/filemap.h index 3f2a343..c2bff04 100644 --- a/mm/filemap.h +++ b/mm/filemap.h @@ -12,7 +12,6 @@ #include #include #include -#include #include size_t diff --git a/mm/slab.c b/mm/slab.c index 3dbd6f4..f351435 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -86,7 +86,6 @@ * All object allocations for a node occur from node specific slab lists. */ -#include #include #include #include diff --git a/mm/vmstat.c b/mm/vmstat.c index a2b6a9f..45b124e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -9,7 +9,6 @@ * Christoph Lameter */ -#include #include #include #include -- cgit v1.1 From b2abacf3a2699a8020829c85c16f358ba85cecaf Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Wed, 4 Oct 2006 02:15:22 -0700 Subject: [PATCH] mm: fix in kerneldoc Fixes an kerneldoc error. Signed-off-by: Henrik Kretzschmar Cc: "Randy.Dunlap" Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index ec4692359..f789500 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1139,11 +1139,11 @@ success: } /** - * __generic_file_aio_read - generic filesystem read routine + * generic_file_aio_read - generic filesystem read routine * @iocb: kernel I/O control block * @iov: io vector request * @nr_segs: number of segments in the iovec - * @ppos: current file position + * @pos: current file position * * This is the "read()" routine for all filesystems * that can use the page cache directly. -- cgit v1.1 From e80ee884ae0e3794ef2b65a18a767d502ad712ee Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 4 Oct 2006 02:15:23 -0700 Subject: [PATCH] mm: micro optimise zone_watermark_ok Having min be a signed quantity means gcc can't turn high latency divides into shifts. There happen to be two such divides for GFP_ATOMIC (ie. networking, ie. important) allocations, one of which depends on the other. Fixing this makes code smaller as a bonus. Shame on somebody (probably me). 
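A short illustration of the point above (the helper names below are made up and are not part of the patch): C requires signed integer division to truncate toward zero, so a signed divide by a power of two cannot be compiled to a bare shift; the compiler has to emit a fix-up for negative values or fall back to a real divide instruction. With an unsigned operand the same expression collapses to a single logical shift, which is why changing the type of "min" also makes the code smaller. Compiling the sketch below with gcc -O2 -S and comparing the two functions shows the difference.

/* Hypothetical demonstration helpers, not kernel code. */
long lower_min_signed(long min)
{
	/* Signed: truncation toward zero forces a sign fix-up or a divide. */
	return min - min / 4;
}

unsigned long lower_min_unsigned(unsigned long min)
{
	/* Unsigned: min / 4 is simply min >> 2, a single shift. */
	return min - min / 4;
}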
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f59d90..b5468de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -900,7 +900,8 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags) { /* free_pages my go negative - that's OK */ - long min = mark, free_pages = z->free_pages - (1 << order) + 1; + unsigned long min = mark; + long free_pages = z->free_pages - (1 << order) + 1; int o; if (alloc_flags & ALLOC_HIGH) -- cgit v1.1 From fe1668ae5bf0145014c71797febd9ad5670d5d05 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Wed, 4 Oct 2006 02:15:24 -0700 Subject: [PATCH] enforce proper tlb flush in unmap_hugepage_range Spotted by Hugh that hugetlb page is free'ed back to global pool before performing any TLB flush in unmap_hugepage_range(). This potentially allow threads to abuse free-alloc race condition. The generic tlb gather code is unsuitable to use by hugetlb, I just open coded a page gathering list and delayed put_page until tlb flush is performed. Cc: Hugh Dickins Signed-off-by: Ken Chen Acked-by: William Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c7d03d..1d709ff 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -364,6 +364,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, pte_t *ptep; pte_t pte; struct page *page; + struct page *tmp; + LIST_HEAD(page_list); WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~HPAGE_MASK); @@ -384,12 +386,16 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, continue; page = pte_page(pte); - put_page(page); + list_add(&page->lru, &page_list); add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); } spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + list_for_each_entry_safe(page, tmp, &page_list, lru) { + list_del(&page->lru); + put_page(page); + } } static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.1 From 88ca3b94e82e763ef90c8e57cacd51a3c143ea62 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 4 Oct 2006 02:15:25 -0700 Subject: [PATCH] page_alloc: fix kernel-doc and func. declaration Fix kernel-doc and function declaration (missing "void") in mm/page_alloc.c. Add mm/page_alloc.c to kernel-api.tmpl in DocBook. mm/page_alloc.c:2589:38: warning: non-ANSI function declaration of function 'remove_all_active_ranges' Signed-off-by: Randy Dunlap Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5468de..a8c003e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2051,8 +2051,8 @@ int __init early_pfn_to_nid(unsigned long pfn) /** * free_bootmem_with_active_regions - Call free_bootmem_node for each active range - * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed - * @max_low_pfn: The highest PFN that till be passed to free_bootmem_node + * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 
+ * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node * * If an architecture guarantees that all ranges registered with * add_active_ranges() contain no holes and may be freed, this @@ -2082,11 +2082,11 @@ void __init free_bootmem_with_active_regions(int nid, /** * sparse_memory_present_with_active_regions - Call memory_present for each active range - * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used + * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. * * If an architecture guarantees that all ranges registered with * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling memory_present() manually. + * function may be used instead of calling memory_present() manually. */ void __init sparse_memory_present_with_active_regions(int nid) { @@ -2156,14 +2156,14 @@ static void __init account_node_boundary(unsigned int nid, /** * get_pfn_range_for_nid - Return the start and end page frames for a node - * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned - * @start_pfn: Passed by reference. On return, it will have the node start_pfn - * @end_pfn: Passed by reference. On return, it will have the node end_pfn + * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. + * @start_pfn: Passed by reference. On return, it will have the node start_pfn. + * @end_pfn: Passed by reference. On return, it will have the node end_pfn. * * It returns the start and end page frame of a node based on information * provided by an arch calling add_active_range(). If called for a node * with no available memory, a warning is printed and the start and end - * PFNs will be 0 + * PFNs will be 0. */ void __init get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) @@ -2216,7 +2216,7 @@ unsigned long __init zone_spanned_pages_in_node(int nid, /* * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, - * then all holes in the requested range will be accounted for + * then all holes in the requested range will be accounted for. */ unsigned long __init __absent_pages_in_range(int nid, unsigned long range_start_pfn, @@ -2269,7 +2269,7 @@ unsigned long __init __absent_pages_in_range(int nid, * @start_pfn: The start PFN to start searching for holes * @end_pfn: The end PFN to stop searching for holes * - * It returns the number of pages frames in memory holes within a range + * It returns the number of pages frames in memory holes within a range. */ unsigned long __init absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn) @@ -2583,11 +2583,12 @@ void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, /** * remove_all_active_ranges - Remove all currently registered regions + * * During discovery, it may be found that a table like SRAT is invalid * and an alternative discovery method must be used. This function removes * all currently registered regions. */ -void __init remove_all_active_ranges() +void __init remove_all_active_ranges(void) { memset(early_node_map, 0, sizeof(early_node_map)); nr_nodemap_entries = 0; @@ -2637,7 +2638,7 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid) * find_min_pfn_with_active_regions - Find the minimum PFN registered * * It returns the minimum PFN based on information provided via - * add_active_range() + * add_active_range(). 
*/ unsigned long __init find_min_pfn_with_active_regions(void) { @@ -2648,7 +2649,7 @@ unsigned long __init find_min_pfn_with_active_regions(void) * find_max_pfn_with_active_regions - Find the maximum PFN registered * * It returns the maximum PFN based on information provided via - * add_active_range() + * add_active_range(). */ unsigned long __init find_max_pfn_with_active_regions(void) { @@ -2663,10 +2664,7 @@ unsigned long __init find_max_pfn_with_active_regions(void) /** * free_area_init_nodes - Initialise all pg_data_t and zone data - * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA - * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32 - * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL - * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM + * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. * Using the page ranges provided by add_active_range(), the size of each @@ -2724,14 +2722,15 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ /** - * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA - * @new_dma_reserve - The number of pages to mark reserved + * set_dma_reserve - set the specified number of pages reserved in the first zone + * @new_dma_reserve: The number of pages to mark reserved * * The per-cpu batchsize and zone watermarks are determined by present_pages. * In the DMA zone, a significant percentage may be consumed by kernel image * and other unfreeable allocations which can skew the watermarks badly. This - * function may optionally be used to account for unfreeable pages in - * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize + * function may optionally be used to account for unfreeable pages in the + * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and + * smaller per-cpu batchsize. */ void __init set_dma_reserve(unsigned long new_dma_reserve) { @@ -2844,10 +2843,11 @@ static void setup_per_zone_lowmem_reserve(void) calculate_totalreserve_pages(); } -/* - * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures - * that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. +/** + * setup_per_zone_pages_min - called when min_free_kbytes changes. + * + * Ensures that the pages_{min,low,high} values for each zone are set correctly + * with respect to min_free_kbytes. */ void setup_per_zone_pages_min(void) { -- cgit v1.1 From 1d2c8eea698514cfaa53fc991b960791d09508e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Oct 2006 02:15:25 -0700 Subject: [PATCH] slab: clean up leak tracking ifdefs a little bit - rename ____kmalloc to kmalloc_track_caller so that people have a chance to guess what it does just from it's name. Add a comment describing it for those who don't. Also move it after kmalloc in slab.h so people get less confused when they are just looking for kmalloc - move things around in slab.c a little to reduce the ifdef mess. 
[penberg@cs.helsinki.fi: Fix up reversed #ifdef] Signed-off-by: Christoph Hellwig Signed-off-by: Pekka Enberg Cc: Christoph Lameter Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 13 ++++++++----- mm/util.c | 6 +++--- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3dbd6f4..c23b992 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3488,22 +3488,25 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, } +#ifdef CONFIG_DEBUG_SLAB void *__kmalloc(size_t size, gfp_t flags) { -#ifndef CONFIG_DEBUG_SLAB - return __do_kmalloc(size, flags, NULL); -#else return __do_kmalloc(size, flags, __builtin_return_address(0)); -#endif } EXPORT_SYMBOL(__kmalloc); -#ifdef CONFIG_DEBUG_SLAB void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) { return __do_kmalloc(size, flags, caller); } EXPORT_SYMBOL(__kmalloc_track_caller); + +#else +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc(size, flags, NULL); +} +EXPORT_SYMBOL(__kmalloc); #endif /** diff --git a/mm/util.c b/mm/util.c index e14fa84..ace2aea 100644 --- a/mm/util.c +++ b/mm/util.c @@ -11,7 +11,7 @@ */ void *__kzalloc(size_t size, gfp_t flags) { - void *ret = ____kmalloc(size, flags); + void *ret = kmalloc_track_caller(size, flags); if (ret) memset(ret, 0, size); return ret; @@ -33,7 +33,7 @@ char *kstrdup(const char *s, gfp_t gfp) return NULL; len = strlen(s) + 1; - buf = ____kmalloc(len, gfp); + buf = kmalloc_track_caller(len, gfp); if (buf) memcpy(buf, s, len); return buf; @@ -51,7 +51,7 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) { void *p; - p = ____kmalloc(len, gfp); + p = kmalloc_track_caller(len, gfp); if (p) memcpy(p, src, len); return p; -- cgit v1.1 From 1ca4cb2418c04914e4661c059cf5b7b9262c645a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 6 Oct 2006 00:43:52 -0700 Subject: [PATCH] slab: reduce numa text size Reduce the NUMA text size of mm/slab.o a little on x86 by using a local variable to store the result of numa_node_id(). text data bss dec hex filename 16858 2584 16 19458 4c02 mm/slab.o (before) 16804 2584 16 19404 4bcc mm/slab.o (after) [akpm@osdl.org: use better names] [pbadari@us.ibm.com: fix that] Cc: Christoph Lameter Signed-off-by: Pekka Enberg Signed-off-by: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e9a63b5..64fb0d7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1106,15 +1106,18 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) int nodeid = slabp->nodeid; struct kmem_list3 *l3; struct array_cache *alien = NULL; + int node; + + node = numa_node_id(); /* * Make sure we are not freeing a object from another node to the array * cache on this cpu. */ - if (likely(slabp->nodeid == numa_node_id())) + if (likely(slabp->nodeid == node)) return 0; - l3 = cachep->nodelists[numa_node_id()]; + l3 = cachep->nodelists[node]; STATS_INC_NODEFREES(cachep); if (l3->alien && l3->alien[nodeid]) { alien = l3->alien[nodeid]; @@ -1352,6 +1355,7 @@ void __init kmem_cache_init(void) struct cache_names *names; int i; int order; + int node; for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); @@ -1386,12 +1390,14 @@ void __init kmem_cache_init(void) * 6) Resize the head arrays of the kmalloc caches to their final sizes. 
*/ + node = numa_node_id(); + /* 1) create the cache_cache */ INIT_LIST_HEAD(&cache_chain); list_add(&cache_cache.next, &cache_chain); cache_cache.colour_off = cache_line_size(); cache_cache.array[smp_processor_id()] = &initarray_cache.cache; - cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; + cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE]; cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); @@ -1496,19 +1502,18 @@ void __init kmem_cache_init(void) } /* 5) Replace the bootstrap kmem_list3's */ { - int node; + int nid; + /* Replace the static kmem_list3 structures for the boot cpu */ - init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], - numa_node_id()); + init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); - for_each_online_node(node) { + for_each_online_node(nid) { init_list(malloc_sizes[INDEX_AC].cs_cachep, - &initkmem_list3[SIZE_AC + node], node); + &initkmem_list3[SIZE_AC + nid], nid); if (INDEX_AC != INDEX_L3) { init_list(malloc_sizes[INDEX_L3].cs_cachep, - &initkmem_list3[SIZE_L3 + node], - node); + &initkmem_list3[SIZE_L3 + nid], nid); } } } @@ -2918,6 +2923,9 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) int batchcount; struct kmem_list3 *l3; struct array_cache *ac; + int node; + + node = numa_node_id(); check_irq_off(); ac = cpu_cache_get(cachep); @@ -2931,7 +2939,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - l3 = cachep->nodelists[numa_node_id()]; + l3 = cachep->nodelists[node]; BUG_ON(ac->avail > 0 || !l3); spin_lock(&l3->list_lock); @@ -2961,7 +2969,7 @@ retry: STATS_SET_HIGH(cachep); ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, - numa_node_id()); + node); } check_slabp(cachep, slabp); @@ -2980,7 +2988,7 @@ alloc_done: if (unlikely(!ac->avail)) { int x; - x = cache_grow(cachep, flags, numa_node_id()); + x = cache_grow(cachep, flags, node); /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); -- cgit v1.1 From 7f7bbbe50b8a28f4dfaa4cea939ddb50198c4a99 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 6 Oct 2006 00:43:53 -0700 Subject: [PATCH] page fault retry with NOPAGE_REFAULT Add a way for a no_page() handler to request a retry of the faulting instruction. It goes back to userland on page faults and just tries again in get_user_pages(). I added a cond_resched() in the loop in that later case. The problem I have with signal and spufs is an actual bug affecting apps and I don't see other ways of fixing it. In addition, we are having issues with infiniband and 64k pages (related to the way the hypervisor deals with some HV cards) that will require us to muck around with the MMU from within the IB driver's no_page() (it's a pSeries specific driver) and return to the caller the same way using NOPAGE_REFAULT. And to add to this, the graphics folks have been following a new approach of memory management that involves transparently swapping objects between video ram and main meory. To do that, they need installing PTEs from a no_page() handler as well and that also requires returning with NOPAGE_REFAULT. (For the later, they are currently using io_remap_pfn_range to install one PTE from no_page() which is a bit racy, we need to add a check for the PTE having already been installed afer taking the lock, but that's ok, they are only at the proof-of-concept stage. I'll send a patch adding a "clean" function to do that, we can use that from spufs too and get rid of the sparsemem hacks we do to create struct page for SPEs. 
Basically, that provides a generic solution for being able to have no_page() map hardware devices, which is something that I think sound driver folks have been asking for some time too). All of these things depend on having the NOPAGE_REFAULT exit path from no_page() handlers. Signed-off-by: Benjamin Herrenchmidt Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 9cf3f34..b5a4aad 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1086,6 +1086,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, default: BUG(); } + cond_resched(); } if (pages) { pages[i] = page; @@ -2169,11 +2170,13 @@ retry: * after the next truncate_count read. */ - /* no page was available -- either SIGBUS or OOM */ - if (new_page == NOPAGE_SIGBUS) + /* no page was available -- either SIGBUS, OOM or REFAULT */ + if (unlikely(new_page == NOPAGE_SIGBUS)) return VM_FAULT_SIGBUS; - if (new_page == NOPAGE_OOM) + else if (unlikely(new_page == NOPAGE_OOM)) return VM_FAULT_OOM; + else if (unlikely(new_page == NOPAGE_REFAULT)) + return VM_FAULT_MINOR; /* * Should we do an early C-O-W break? -- cgit v1.1 From dcbd4ec4c258f88d4cfd3c309c8a56faff061340 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Oct 2006 22:19:44 -0700 Subject: [PATCH] slab: remove wrongly placed BUG_ON Init list is called with a list parameter that is not equal to the cachep->nodelists entry under NUMA if more than one node exists. This is fully legitimatei. One may want to populate the list fields before switching nodelist pointers. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 64fb0d7..266449d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1328,7 +1328,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, { struct kmem_list3 *ptr; - BUG_ON(cachep->nodelists[nodeid] != list); ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); BUG_ON(!ptr); -- cgit v1.1 From b888132b0ff29ca0733589a594c243ed612438eb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 11 Oct 2006 01:20:40 -0700 Subject: [PATCH] mm: remove memmap_zone_idx() memmap_zone_idx() is not used anymore. It was required by an earlier version of account-for-memmap-and-optionally-the-kernel-image-as-holes.patch but not any more. 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a8c003e..8246e83 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2294,19 +2294,6 @@ unsigned long __init zone_absent_pages_in_node(int nid, return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); } -/* Return the zone index a PFN is in */ -int memmap_zone_idx(struct page *lmem_map) -{ - int i; - unsigned long phys_addr = virt_to_phys(lmem_map); - unsigned long pfn = phys_addr >> PAGE_SHIFT; - - for (i = 0; i < MAX_NR_ZONES; i++) - if (pfn < arch_zone_highest_possible_pfn[i]) - break; - - return i; -} #else static inline unsigned long zone_spanned_pages_in_node(int nid, unsigned long zone_type, @@ -2325,10 +2312,6 @@ static inline unsigned long zone_absent_pages_in_node(int nid, return zholes_size[zone_type]; } -static inline int memmap_zone_idx(struct page *lmem_map) -{ - return MAX_NR_ZONES; -} #endif static void __init calculate_node_totalpages(struct pglist_data *pgdat, -- cgit v1.1 From 502717f4e112b18d9c37753a32f675bec9f2838b Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Wed, 11 Oct 2006 01:20:46 -0700 Subject: [PATCH] hugetlb: fix linked list corruption in unmap_hugepage_range() commit fe1668ae5bf0145014c71797febd9ad5670d5d05 causes kernel to oops with libhugetlbfs test suite. The problem is that hugetlb pages can be shared by multiple mappings. Multiple threads can fight over page->lru in the unmap path and bad things happen. We now serialize __unmap_hugepage_range to void concurrent linked list manipulation. Such serialization is also needed for shared page table page on hugetlb area. This patch will fixed the bug and also serve as a prepatch for shared page table. Signed-off-by: Ken Chen Cc: Hugh Dickins Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1d709ff..2dbec90 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -356,8 +356,8 @@ nomem: return -ENOMEM; } -void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) +void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -398,6 +398,24 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } } +void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + /* + * It is undesirable to test vma->vm_file as it should be non-null + * for valid hugetlb area. However, vm_file will be NULL in the error + * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails, + * do_mmap_pgoff() nullifies vma->vm_file before calling this function + * to clean up. Since no pte has actually been setup, it is safe to + * do nothing in this case. 
+ */ + if (vma->vm_file) { + spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + __unmap_hugepage_range(vma, start, end); + spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + } +} + static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte) { -- cgit v1.1 From b16bc64d1aed40fb9cff9187061005b2a89b5d5d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 11 Oct 2006 01:21:27 -0700 Subject: [PATCH] move rmap BUG_ON outside DEBUG_VM We have a persistent dribble of reports of this BUG triggering. Its extended diagnostics were recently made conditional on CONFIG_DEBUG_VM, which was a bad idea - we want to know about it. Signed-off-by: Dave Jones Cc: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index e2155d7..a9136d8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -576,15 +576,14 @@ void page_add_file_rmap(struct page *page) void page_remove_rmap(struct page *page) { if (atomic_add_negative(-1, &page->_mapcount)) { -#ifdef CONFIG_DEBUG_VM if (unlikely(page_mapcount(page) < 0)) { printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); printk (KERN_EMERG " page->flags = %lx\n", page->flags); printk (KERN_EMERG " page->count = %x\n", page_count(page)); printk (KERN_EMERG " page->mapping = %p\n", page->mapping); + BUG(); } -#endif - BUG_ON(page_mapcount(page) < 0); + /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap -- cgit v1.1 From 699397499742d1245ea5d677a08fa265df666d2d Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Wed, 11 Oct 2006 01:21:28 -0700 Subject: [PATCH] Fix do_mbind warning with CONFIG_MIGRATION=n With CONFIG_MIGRATION=n mm/mempolicy.c: In function 'do_mbind': mm/mempolicy.c:796: warning: passing argument 2 of 'migrate_pages' from incompatible pointer type Signed-off-by: Keith Owens Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 25788b1..617fb31 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -727,7 +727,7 @@ int do_migrate_pages(struct mm_struct *mm, return -ENOSYS; } -static struct page *new_vma_page(struct page *page, unsigned long private) +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) { return NULL; } -- cgit v1.1 From dafb13673c463bc2aade4a4819704dde0f5fa37f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 11 Oct 2006 01:21:30 -0700 Subject: [PATCH] mm: arch_free_page fix After the PG_reserved check was added, arch_free_page was being called in the wrong place (it could be called for a page we don't actually want to free). Fix that. 
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8246e83..c5caac2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -495,7 +495,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) int i; int reserved = 0; - arch_free_page(page, order); if (!PageHighMem(page)) debug_check_no_locks_freed(page_address(page), PAGE_SIZE<mapping = NULL; if (free_pages_check(page)) return; + arch_free_page(page, 0); kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; -- cgit v1.1 From 9858db504caedb2424b9a32744c23f9a81ec1731 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 11 Oct 2006 01:21:30 -0700 Subject: [PATCH] mm: locks_freed fix Move the lock debug checks below the page reserved checks. Also, having debug_check_no_locks_freed in kernel_map_pages is wrong. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5caac2..40db96a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -495,15 +495,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) int i; int reserved = 0; - if (!PageHighMem(page)) - debug_check_no_locks_freed(page_address(page), - PAGE_SIZE< Date: Wed, 11 Oct 2006 01:21:53 -0700 Subject: [PATCH] invalidate_inode_pages2_range() debug A failure in invalidate_inode_pages2_range() can result in unpleasant things happening in NFS (at least). Stick a WARN_ON_ONCE() in there so we can find out if it happens, and maybe why. (akpm: might be a -mm-only patch, we'll see..) Cc: Chuck Lever Cc: Trond Myklebust Cc: Steve Dickson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index f4edbc1..fca2883 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -396,6 +396,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_release(&pvec); cond_resched(); } + WARN_ON_ONCE(ret); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); -- cgit v1.1 From 887ed2f3aecde2ff24e06666932dc5f144745044 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Oct 2006 01:21:58 -0700 Subject: [PATCH] VM: Fix the gfp_mask in invalidate_complete_page2 If try_to_release_page() is called with a zero gfp mask, then the filesystem is effectively denied the possibility of sleeping while attempting to release the page. There doesn't appear to be any valid reason why this should be banned, given that we're not calling this from a memory allocation context. For this reason, change the gfp_mask argument of the call to GFP_KERNEL. 
Signed-off-by: Trond Myklebust Cc: Steve Dickson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index fca2883..11ca480 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -302,7 +302,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return 0; - if (PagePrivate(page) && !try_to_release_page(page, 0)) + if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; write_lock_irq(&mapping->tree_lock); -- cgit v1.1 From 53bc5b2db16ceefdd972b9ffd1f7bde5c427939e Mon Sep 17 00:00:00 2001 From: Aneesh Kumar Date: Wed, 11 Oct 2006 01:22:03 -0700 Subject: [PATCH] Fix typos in mm/shmem_acl.c Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem_acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index c946bf4..f5664c5 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -35,7 +35,7 @@ shmem_get_acl(struct inode *inode, int type) } /** - * shmem_get_acl - generic_acl_operations->setacl() operation + * shmem_set_acl - generic_acl_operations->setacl() operation */ static void shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) -- cgit v1.1 From 80c5606c3b45e0176c32d3108ade1e1cb0b954f3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 15 Oct 2006 14:09:55 -0700 Subject: Fix VM_MAYEXEC calculation .. and clean up the file mapping code while at it. No point in having a "if (file)" repeated twice, and generally doing similar checks in two different sections of the same code Signed-off-by: Linus Torvalds --- mm/mmap.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index eea8eef..497e502 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -900,17 +900,6 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, int accountable = 1; unsigned long charged = 0, reqprot = prot; - if (file) { - if (is_file_hugepages(file)) - accountable = 0; - - if (!file->f_op || !file->f_op->mmap) - return -ENODEV; - - if ((prot & PROT_EXEC) && - (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)) - return -EPERM; - } /* * Does the application expect PROT_READ to imply PROT_EXEC? * @@ -1000,6 +989,16 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; + if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { + if (vm_flags & VM_EXEC) + return -EPERM; + vm_flags &= ~VM_MAYEXEC; + } + if (is_file_hugepages(file)) + accountable = 0; + + if (!file->f_op || !file->f_op->mmap) + return -ENODEV; break; default: -- cgit v1.1 From a649fd9271773dd0f78e2b9f347bcceecb8827f9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 17 Oct 2006 00:09:36 -0700 Subject: [PATCH] invalidate: remove_mapping() fix If remove_mapping() failed to remove the page from its mapping, don't go and mark it not uptodate! Makes kernel go dead. (Actually, I don't think the ClearPageUptodate is needed there at all). Says Nick Piggin: "Right, it isn't needed because at this point the page is guaranteed by remove_mapping to have no references (except us) and cannot pick up any new ones because it is removed from pagecache. We can delete it." 
Signed-off-by: Andrew Morton Acked-by: Nick Piggin Signed-off-by: Linus Torvalds --- mm/truncate.c | 1 - mm/vmscan.c | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 11ca480..e07b1e6 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -96,7 +96,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return 0; ret = remove_mapping(mapping, page); - ClearPageUptodate(page); return ret; } diff --git a/mm/vmscan.c b/mm/vmscan.c index eca7031..af73c14 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -378,6 +378,12 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } +/* + * Attempt to detach a locked page from its ->mapping. If it is dirty or if + * someone else has a ref on the page, abort and return 0. If it was + * successfully detached, return 1. Assumes the caller has a single ref on + * this page. + */ int remove_mapping(struct address_space *mapping, struct page *page) { BUG_ON(!PageLocked(page)); -- cgit v1.1 From 91828a405ae454a9503c41a7744f6ff877a80714 Mon Sep 17 00:00:00 2001 From: "David M. Grimes" Date: Tue, 17 Oct 2006 00:09:45 -0700 Subject: [PATCH] knfsd: add nfs-export support to tmpfs We need to encode a decode the 'file' part of a handle. We simply use the inode number and generation number to construct the filehandle. The generation number is the time when the file was created. As inode numbers cycle through the full 32 bits before being reused, there is no real chance of the same inum being allocated to different files in the same second so this is suitably unique. Using time-of-day rather than e.g. jiffies makes it less likely that the same filehandle can be created after a reboot. In order to be able to decode a filehandle we need to be able to lookup by inum, which means that the inode needs to be added to the inode hash table (tmpfs doesn't currently hash inodes as there is never a need to lookup by inum). To avoid overhead when not exporting, we only hash an inode when it is first exported. This requires a lock to ensure it isn't hashed twice. This code is separate from the patch posted in June06 from Atal Shargorodsky which provided the same functionality, but does borrow slightly from it. Locking comment: Most filesystems that hash their inodes do so at the point where the 'struct inode' is initialised, and that has suitable locking (I_NEW). Here in shmem, we are hashing the inode later, the first time we need an NFS file handle for it. We no longer have I_NEW to ensure only one thread tries to add it to the hash table. Cc: Atal Shargorodsky Cc: Gilad Ben-Yossef Signed-off-by: David M. 
Grimes Signed-off-by: Neil Brown Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index bb8ca7e..b378f66 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1362,6 +1362,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_mapping->a_ops = &shmem_aops; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_generation = get_seconds(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); @@ -1956,6 +1957,85 @@ static struct xattr_handler *shmem_xattr_handlers[] = { }; #endif +static struct dentry *shmem_get_parent(struct dentry *child) +{ + return ERR_PTR(-ESTALE); +} + +static int shmem_match(struct inode *ino, void *vfh) +{ + __u32 *fh = vfh; + __u64 inum = fh[2]; + inum = (inum << 32) | fh[1]; + return ino->i_ino == inum && fh[0] == ino->i_generation; +} + +static struct dentry *shmem_get_dentry(struct super_block *sb, void *vfh) +{ + struct dentry *de = NULL; + struct inode *inode; + __u32 *fh = vfh; + __u64 inum = fh[2]; + inum = (inum << 32) | fh[1]; + + inode = ilookup5(sb, (unsigned long)(inum+fh[0]), shmem_match, vfh); + if (inode) { + de = d_find_alias(inode); + iput(inode); + } + + return de? de: ERR_PTR(-ESTALE); +} + +static struct dentry *shmem_decode_fh(struct super_block *sb, __u32 *fh, + int len, int type, + int (*acceptable)(void *context, struct dentry *de), + void *context) +{ + if (len < 3) + return ERR_PTR(-ESTALE); + + return sb->s_export_op->find_exported_dentry(sb, fh, NULL, acceptable, + context); +} + +static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + + if (*len < 3) + return 255; + + if (hlist_unhashed(&inode->i_hash)) { + /* Unfortunately insert_inode_hash is not idempotent, + * so as we hash inodes here rather than at creation + * time, we need a lock to ensure we only try + * to do it once + */ + static DEFINE_SPINLOCK(lock); + spin_lock(&lock); + if (hlist_unhashed(&inode->i_hash)) + __insert_inode_hash(inode, + inode->i_ino + inode->i_generation); + spin_unlock(&lock); + } + + fh[0] = inode->i_generation; + fh[1] = inode->i_ino; + fh[2] = ((__u64)inode->i_ino) >> 32; + + *len = 3; + return 1; +} + +static struct export_operations shmem_export_ops = { + .get_parent = shmem_get_parent, + .get_dentry = shmem_get_dentry, + .encode_fh = shmem_encode_fh, + .decode_fh = shmem_decode_fh, +}; + static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes, int *policy, nodemask_t *policy_nodes) @@ -2128,6 +2208,7 @@ static int shmem_fill_super(struct super_block *sb, &inodes, &policy, &policy_nodes)) return -EINVAL; } + sb->s_export_op = &shmem_export_ops; #else sb->s_flags |= MS_NOUSER; #endif -- cgit v1.1 From 286e1ea3ac1ca4f503ebbb3020bdb0cbe6adffac Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 17 Oct 2006 00:09:57 -0700 Subject: [PATCH] vmalloc(): don't pass __GFP_ZERO to slab A recent change to the vmalloc() code accidentally resulted in us passing __GFP_ZERO into the slab allocator. But we only wanted __GFP_ZERO for the actual pages whcih are being vmalloc()ed, and passing __GFP_ZERO into slab is not a rational thing to ask for. 
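For context, a hypothetical caller is sketched below (it is not taken from the patch, and the size and function name are arbitrary): the __GFP_ZERO in such a request is meant for the pages that end up mapped into the vmalloc area, not for the small array of struct page pointers that the allocator kmallocs internally to track them, which is why the fix below masks __GFP_ZERO (together with __GFP_HIGHMEM) out of the flags before the nested kmalloc_node() call.

#include <linux/vmalloc.h>
#include <linux/mm.h>

/* Hypothetical module-style caller, illustration only. */
static void *alloc_zeroed_buffer(void)
{
	/* Ask for 64KB of zeroed, virtually contiguous memory.  The zeroing
	 * should apply to the mapped pages; slab must never see __GFP_ZERO
	 * for the internal pages[] bookkeeping allocation. */
	return __vmalloc(64 * 1024, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
}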
Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 750ab6e..1133dd3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -428,8 +428,11 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); area->flags |= VM_VPAGES; - } else - pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); + } else { + pages = kmalloc_node(array_size, + (gfp_mask & ~(__GFP_HIGHMEM | __GFP_ZERO)), + node); + } area->pages = pages; if (!area->pages) { remove_vm_area(area->addr); -- cgit v1.1 From fb5527e68d495650a7658fec9a7246bf922db212 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 19 Oct 2006 23:28:13 -0700 Subject: [PATCH] direct-io: sync and invalidate file region when falling back to buffered write When direct-io falls back to buffered write, it will just leave the dirty data floating about in pagecache, pending regular writeback. But normal direct-io semantics are that IO is synchronous, and that it leaves no pagecache behind. So change the fallback-to-buffered-write code to sync the file region and to then strip away the pagecache, just as a regular direct-io write would do. Acked-by: Jeff Moyer Cc: Zach Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 3464b68..57faa8d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2222,7 +2222,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; - const struct address_space * mapping = file->f_mapping; + struct address_space * mapping = file->f_mapping; size_t ocount; /* original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; @@ -2275,8 +2275,11 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { - written = generic_file_direct_write(iocb, iov, - &nr_segs, pos, ppos, count, ocount); + loff_t endbyte; + ssize_t written_buffered; + + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, + ppos, count, ocount); if (written < 0 || written == count) goto out; /* @@ -2285,10 +2288,46 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, */ pos += written; count -= written; - } + written_buffered = generic_file_buffered_write(iocb, iov, + nr_segs, pos, ppos, count, + written); + /* + * If generic_file_buffered_write() retuned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (written_buffered < 0) { + err = written_buffered; + goto out; + } - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, ppos, count, written); + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. 
+ */ + endbyte = pos + written_buffered - written - 1; + err = do_sync_file_range(file, pos, endbyte, + SYNC_FILE_RANGE_WAIT_BEFORE| + SYNC_FILE_RANGE_WRITE| + SYNC_FILE_RANGE_WAIT_AFTER); + if (err == 0) { + written = written_buffered; + invalidate_mapping_pages(mapping, + pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } + } else { + written = generic_file_buffered_write(iocb, iov, nr_segs, + pos, ppos, count, written); + } out: current->backing_dev_info = NULL; return written ? written : err; -- cgit v1.1 From 3fcfab16c5b86eaa3db3a9a31adba550c5b67141 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Oct 2006 23:28:16 -0700 Subject: [PATCH] separate bdi congestion functions from queue congestion functions Separate out the concept of "queue congestion" from "backing-dev congestion". Congestion is a backing-dev concept, not a queue concept. The blk_* congestion functions are retained, as wrappers around the core backing-dev congestion functions. This proper layering is needed so that NFS can cleanly use the congestion functions, and so that CONFIG_BLOCK=n actually links. Cc: "Thomas Maier" Cc: "Jens Axboe" Cc: Trond Myklebust Cc: David Howells Cc: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 3 ++- mm/backing-dev.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 17 ++++--------- mm/page_alloc.c | 5 ++-- mm/shmem.c | 3 ++- mm/vmscan.c | 6 ++--- 6 files changed, 83 insertions(+), 20 deletions(-) create mode 100644 mm/backing-dev.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 12b3a4e..f3c077e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -10,7 +10,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ - prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) + prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ + $(mmu-y) ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) obj-y += bounce.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c new file mode 100644 index 0000000..f50a281 --- /dev/null +++ b/mm/backing-dev.c @@ -0,0 +1,69 @@ + +#include +#include +#include +#include +#include + +static wait_queue_head_t congestion_wqh[2] = { + __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), + __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) + }; + + +void clear_bdi_congested(struct backing_dev_info *bdi, int rw) +{ + enum bdi_state bit; + wait_queue_head_t *wqh = &congestion_wqh[rw]; + + bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; + clear_bit(bit, &bdi->state); + smp_mb__after_clear_bit(); + if (waitqueue_active(wqh)) + wake_up(wqh); +} +EXPORT_SYMBOL(clear_bdi_congested); + +void set_bdi_congested(struct backing_dev_info *bdi, int rw) +{ + enum bdi_state bit; + + bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; + set_bit(bit, &bdi->state); +} +EXPORT_SYMBOL(set_bdi_congested); + +/** + * congestion_wait - wait for a backing_dev to become uncongested + * @rw: READ or WRITE + * @timeout: timeout in jiffies + * + * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit + * write congestion. If no backing_devs are congested then just wait for the + * next write to be completed. 
+ */ +long congestion_wait(int rw, long timeout) +{ + long ret; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[rw]; + + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + return ret; +} +EXPORT_SYMBOL(congestion_wait); + +/** + * congestion_end - wake up sleepers on a congested backing_dev_info + * @rw: READ or WRITE + */ +void congestion_end(int rw) +{ + wait_queue_head_t *wqh = &congestion_wqh[rw]; + + if (waitqueue_active(wqh)) + wake_up(wqh); +} +EXPORT_SYMBOL(congestion_end); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a0f3390..8d9b19f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -222,7 +222,7 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ } - blk_congestion_wait(WRITE, HZ/10); + congestion_wait(WRITE, HZ/10); } if (nr_reclaimable + global_page_state(NR_WRITEBACK) @@ -314,7 +314,7 @@ void throttle_vm_writeout(void) if (global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK) <= dirty_thresh) break; - blk_congestion_wait(WRITE, HZ/10); + congestion_wait(WRITE, HZ/10); } } @@ -351,7 +351,7 @@ static void background_writeout(unsigned long _min_pages) min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { /* Wrote less than expected */ - blk_congestion_wait(WRITE, HZ/10); + congestion_wait(WRITE, HZ/10); if (!wbc.encountered_congestion) break; } @@ -422,7 +422,7 @@ static void wb_kupdate(unsigned long arg) writeback_inodes(&wbc); if (wbc.nr_to_write > 0) { if (wbc.encountered_congestion) - blk_congestion_wait(WRITE, HZ/10); + congestion_wait(WRITE, HZ/10); else break; /* All the old data is written */ } @@ -956,15 +956,6 @@ int test_set_page_writeback(struct page *page) EXPORT_SYMBOL(test_set_page_writeback); /* - * Wakes up tasks that are being throttled due to writeback congestion - */ -void writeback_congestion_end(void) -{ - blk_congestion_end(WRITE); -} -EXPORT_SYMBOL(writeback_congestion_end); - -/* * Return true if any of the pages in the mapping are marged with the * passed tag. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40db96a..afee38f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -1050,7 +1051,7 @@ nofail_alloc: if (page) goto got_pg; if (gfp_mask & __GFP_NOFAIL) { - blk_congestion_wait(WRITE, HZ/50); + congestion_wait(WRITE, HZ/50); goto nofail_alloc; } } @@ -1113,7 +1114,7 @@ rebalance: do_retry = 1; } if (do_retry) { - blk_congestion_wait(WRITE, HZ/50); + congestion_wait(WRITE, HZ/50); goto rebalance; } diff --git a/mm/shmem.c b/mm/shmem.c index b378f66..4959535 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -1131,7 +1132,7 @@ repeat: page_cache_release(swappage); if (error == -ENOMEM) { /* let kswapd refresh zone for GFP_ATOMICs */ - blk_congestion_wait(WRITE, HZ/50); + congestion_wait(WRITE, HZ/50); } goto repeat; } diff --git a/mm/vmscan.c b/mm/vmscan.c index af73c14..f05527b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1059,7 +1059,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) /* Take a nap, wait for some writeback to complete */ if (sc.nr_scanned && priority < DEF_PRIORITY - 2) - blk_congestion_wait(WRITE, HZ/10); + congestion_wait(WRITE, HZ/10); } /* top priority shrink_caches still had more to do? 
don't OOM, then */ if (!sc.all_unreclaimable) @@ -1214,7 +1214,7 @@ scan: * another pass across the zones. */ if (total_scanned && priority < DEF_PRIORITY - 2) - blk_congestion_wait(WRITE, HZ/10); + congestion_wait(WRITE, HZ/10); /* * We do this so kswapd doesn't build up large priorities for @@ -1458,7 +1458,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - blk_congestion_wait(WRITE, HZ / 10); + congestion_wait(WRITE, HZ / 10); } lru_pages = 0; -- cgit v1.1 From 8ac773b4f73afa6fd66695131103944b975d5d5c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 19 Oct 2006 23:28:32 -0700 Subject: [PATCH] OOM killer meets userspace headers Despite mm.h is not being exported header, it does contain one thing which is part of userspace ABI -- value disabling OOM killer for given process. So, a) create and export include/linux/oom.h b) move OOM_DISABLE define there. c) turn bounding values of /proc/$PID/oom_adj into defines and export them too. Note: mass __KERNEL__ removal will be done later. Signed-off-by: Alexey Dobriyan Cc: Nick Piggin Cc: David Woodhouse Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 20f41b0..2e3ce3a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -15,6 +15,7 @@ * kernel subsystems and hints as to where to find out what things do. */ +#include #include #include #include -- cgit v1.1 From 6220ec7844fda2686496013a66b5b9169976b991 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Oct 2006 23:29:05 -0700 Subject: [PATCH] highest_possible_node_id() linkage fix Qooting Adrian: - net/sunrpc/svc.c uses highest_possible_node_id() - include/linux/nodemask.h says highest_possible_node_id() is out-of-line #if MAX_NUMNODES > 1 - the out-of-line highest_possible_node_id() is in lib/cpumask.c - lib/Makefile: lib-$(CONFIG_SMP) += cpumask.o CONFIG_ARCH_DISCONTIGMEM_ENABLE=y, CONFIG_SMP=n, CONFIG_SUNRPC=y -> highest_possible_node_id() is used in net/sunrpc/svc.c CONFIG_NODES_SHIFT defined and > 0 -> include/linux/numa.h: MAX_NUMNODES > 1 -> compile error The bug is not present on architectures where ARCH_DISCONTIGMEM_ENABLE depends on NUMA (but m32r isn't the only affected architecture). So move the function into page_alloc.c Cc: Adrian Bunk Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index afee38f..ebd425c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3120,3 +3120,19 @@ unsigned long page_to_pfn(struct page *page) EXPORT_SYMBOL(pfn_to_page); EXPORT_SYMBOL(page_to_pfn); #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ + +#if MAX_NUMNODES > 1 +/* + * Find the highest possible node id. 
+ */ +int highest_possible_node_id(void) +{ + unsigned int node; + unsigned int highest = 0; + + for_each_node_mask(node, node_possible_map) + highest = node; + return highest; +} +EXPORT_SYMBOL(highest_possible_node_id); +#endif -- cgit v1.1 From c4ec7b0de4bc18ccb4380de638550984d9a65c25 Mon Sep 17 00:00:00 2001 From: Dmitriy Monakhov Date: Thu, 19 Oct 2006 23:29:08 -0700 Subject: [PATCH] mm: D-cache aliasing issue in cow_user_page --=-=-= from mm/memory.c: 1434 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) 1435 { 1436 /* 1437 * If the source page was a PFN mapping, we don't have 1438 * a "struct page" for it. We do a best-effort copy by 1439 * just copying from the original user address. If that 1440 * fails, we just zero-fill it. Live with it. 1441 */ 1442 if (unlikely(!src)) { 1443 void *kaddr = kmap_atomic(dst, KM_USER0); 1444 void __user *uaddr = (void __user *)(va & PAGE_MASK); 1445 1446 /* 1447 * This really shouldn't fail, because the page is there 1448 * in the page tables. But it might just be unreadable, 1449 * in which case we just give up and fill the result with 1450 * zeroes. 1451 */ 1452 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 1453 memset(kaddr, 0, PAGE_SIZE); 1454 kunmap_atomic(kaddr, KM_USER0); #### D-cache have to be flushed here. #### It seems it is just forgotten. 1455 return; 1456 1457 } 1458 copy_user_highpage(dst, src, va); #### Ok here. flush_dcache_page() called from this func if arch need it 1459 } Following is the patch fix this issue: Signed-off-by: Dmitriy Monakhov Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index b5a4aad..156861f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1452,6 +1452,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) memset(kaddr, 0, PAGE_SIZE); kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(dst); return; } -- cgit v1.1 From 82591e6ea234762eeaa8b2337fe060ed438c18dc Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 19 Oct 2006 23:29:10 -0700 Subject: [PATCH] mm: more commenting on lock ordering Clarify lockorder comments now that sys_msync dropps mmap_sem before calling do_fsync. 
Signed-off-by: Nick Piggin Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- mm/rmap.c | 36 +++++++++++++++--------------------- 2 files changed, 17 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 57faa8d..8558732 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -75,8 +75,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->mmap_sem * ->lock_page (access_process_vm) * - * ->mmap_sem - * ->i_mutex (msync) + * ->i_mutex (generic_file_buffered_write) + * ->mmap_sem (fault_in_pages_readable->do_page_fault) * * ->i_mutex * ->i_alloc_sem (various) diff --git a/mm/rmap.c b/mm/rmap.c index a9136d8..d8a842a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -21,27 +21,21 @@ * Lock ordering in mm: * * inode->i_mutex (while writing or truncating, not reading or faulting) - * inode->i_alloc_sem - * - * When a page fault occurs in writing from user to file, down_read - * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within - * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never - * taken together; in truncation, i_mutex is taken outermost. - * - * mm->mmap_sem - * page->flags PG_locked (lock_page) - * mapping->i_mmap_lock - * anon_vma->lock - * mm->page_table_lock or pte_lock - * zone->lru_lock (in mark_page_accessed, isolate_lru_page) - * swap_lock (in swap_duplicate, swap_info_get) - * mmlist_lock (in mmput, drain_mmlist and others) - * mapping->private_lock (in __set_page_dirty_buffers) - * inode_lock (in set_page_dirty's __mark_inode_dirty) - * sb_lock (within inode_lock in fs/fs-writeback.c) - * mapping->tree_lock (widely used, in set_page_dirty, - * in arch-dependent flush_dcache_mmap_lock, - * within inode_lock in __sync_single_inode) + * inode->i_alloc_sem (vmtruncate_range) + * mm->mmap_sem + * page->flags PG_locked (lock_page) + * mapping->i_mmap_lock + * anon_vma->lock + * mm->page_table_lock or pte_lock + * zone->lru_lock (in mark_page_accessed, isolate_lru_page) + * swap_lock (in swap_duplicate, swap_info_get) + * mmlist_lock (in mmput, drain_mmlist and others) + * mapping->private_lock (in __set_page_dirty_buffers) + * inode_lock (in set_page_dirty's __mark_inode_dirty) + * sb_lock (within inode_lock in fs/fs-writeback.c) + * mapping->tree_lock (widely used, in set_page_dirty, + * in arch-dependent flush_dcache_mmap_lock, + * within inode_lock in __sync_single_inode) */ #include -- cgit v1.1