author     Jeff Garzik <jeff@garzik.org>    2006-03-24 12:29:39 -0500
committer  Jeff Garzik <jeff@garzik.org>    2006-03-24 12:29:39 -0500
commit     4bbf7bc4c7bf1c80ec3c942fa5f1b6e6fa67dd99 (patch)
tree       c4ff89dc09abe69d58db1e14da22ecda9fdd3ce8 /mm
parent     84ac69e8bf9f36eb0166817373336d14fa58f5cc (diff)
parent     aec5c3c1a929d7d79a420e943285cf3ba26a7c0d (diff)
Merge branch 'upstream'
Conflicts: drivers/scsi/libata-core.c
Diffstat (limited to 'mm')
-rw-r--r--  mm/fadvise.c        |  46
-rw-r--r--  mm/filemap.c        |  41
-rw-r--r--  mm/highmem.c        |   3
-rw-r--r--  mm/mempolicy.c      |  32
-rw-r--r--  mm/msync.c          | 137
-rw-r--r--  mm/page-writeback.c |  64
-rw-r--r--  mm/page_alloc.c     |   3
-rw-r--r--  mm/readahead.c      |   1
-rw-r--r--  mm/slab.c           |  37
-rw-r--r--  mm/swapfile.c       |  57
-rw-r--r--  mm/util.c           |  37
11 files changed, 351 insertions(+), 107 deletions(-)
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d257c89..907c392 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -15,6 +15,7 @@
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/fadvise.h>
+#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <asm/unistd.h>
@@ -22,13 +23,36 @@
/*
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
+ *
+ * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
+ * offsets `offset' and `offset+len' inclusive. Any pages which are currently
+ * under writeout are skipped, whether or not they are dirty.
+ *
+ * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
+ * offsets `offset' and `offset+len'.
+ *
+ * By combining these two operations the application may do several things:
+ *
+ * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages to the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently
+ * dirty pages to the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push
+ * all of the currently dirty pages to the disk, wait until they have been
+ * written.
+ *
+ * It should be noted that none of these operations write out the file's
+ * metadata. So unless the application is strictly performing overwrites of
+ * already-instantiated disk blocks, there are no guarantees here that the data
+ * will be available after a crash.
*/
asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
struct file *file = fget(fd);
struct address_space *mapping;
struct backing_dev_info *bdi;
- loff_t endbyte;
+ loff_t endbyte; /* inclusive */
pgoff_t start_index;
pgoff_t end_index;
unsigned long nrpages;
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
endbyte = offset + len;
if (!len || endbyte < len)
endbyte = -1;
+ else
+ endbyte--; /* inclusive */
bdi = mapping->backing_dev_info;
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
/* First and last PARTIAL page! */
start_index = offset >> PAGE_CACHE_SHIFT;
- end_index = (endbyte-1) >> PAGE_CACHE_SHIFT;
+ end_index = endbyte >> PAGE_CACHE_SHIFT;
/* Careful about overflow on the "+1" */
nrpages = end_index - start_index + 1;
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
filemap_flush(mapping);
/* First and last FULL page! */
- start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+ start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
end_index = (endbyte >> PAGE_CACHE_SHIFT);
- if (end_index > start_index)
- invalidate_mapping_pages(mapping, start_index, end_index-1);
+ if (end_index >= start_index)
+ invalidate_mapping_pages(mapping, start_index,
+ end_index);
+ break;
+ case LINUX_FADV_ASYNC_WRITE:
+ ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
+ break;
+ case LINUX_FADV_WRITE_WAIT:
+ ret = wait_on_page_writeback_range(mapping,
+ offset >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
break;
default:
ret = -EINVAL;
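
A sketch of the WRITE_WAIT / ASYNC_WRITE / WRITE_WAIT recipe from the comment
above, as seen from userspace. This is illustrative only: it assumes the new
advice constants are visible through <linux/fadvise.h> and that the C
library's posix_fadvise() wrapper passes unrecognised advice values straight
through to sys_fadvise64_64(); a stricter libc would need syscall(2) instead.

	#include <fcntl.h>		/* posix_fadvise() */
	#include <linux/fadvise.h>	/* LINUX_FADV_* advice codes */

	/* Data-only flush of [off, off+len): start writeout, then wait. */
	static int flush_file_range(int fd, off_t off, off_t len)
	{
		int ret;

		/* Wait out any writeback already in flight; ASYNC_WRITE
		 * would skip those pages whether or not they are dirty. */
		ret = posix_fadvise(fd, off, len, LINUX_FADV_WRITE_WAIT);
		if (ret)
			return ret;
		/* Start async writeout of everything dirty right now... */
		ret = posix_fadvise(fd, off, len, LINUX_FADV_ASYNC_WRITE);
		if (ret)
			return ret;
		/* ...and wait until those pages have been written. */
		return posix_fadvise(fd, off, len, LINUX_FADV_WRITE_WAIT);
	}

As the comment warns, file metadata is not written, so this is only
crash-safe for overwrites of already-instantiated disk blocks.
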
diff --git a/mm/filemap.c b/mm/filemap.c
index e8f58f7..3ef2073 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,6 +29,7 @@
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/cpuset.h>
#include "filemap.h"
#include "internal.h"
@@ -174,7 +175,7 @@ static int sync_page(void *word)
* dirty pages that lie within the byte offsets <start, end>
* @mapping: address space structure to write
* @start: offset in bytes where the range starts
- * @end: offset in bytes where the range ends
+ * @end: offset in bytes where the range ends (inclusive)
* @sync_mode: enable synchronous operation
*
* If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
@@ -182,8 +183,8 @@ static int sync_page(void *word)
* these two operations is that if a dirty page/buffer is encountered, it must
* be waited upon, and not just skipped over.
*/
-static int __filemap_fdatawrite_range(struct address_space *mapping,
- loff_t start, loff_t end, int sync_mode)
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end, int sync_mode)
{
int ret;
struct writeback_control wbc = {
@@ -212,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_fdatawrite);
-static int filemap_fdatawrite_range(struct address_space *mapping,
- loff_t start, loff_t end)
+static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end)
{
return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
@@ -232,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush);
* Wait for writeback to complete against pages indexed by start->end
* inclusive
*/
-static int wait_on_page_writeback_range(struct address_space *mapping,
+int wait_on_page_writeback_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
@@ -367,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_write_and_wait);
+/*
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that `lend' is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (end = -1).
+ */
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
@@ -427,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
return ret;
}
+#ifdef CONFIG_NUMA
+struct page *page_cache_alloc(struct address_space *x)
+{
+ if (cpuset_do_page_mem_spread()) {
+ int n = cpuset_mem_spread_node();
+ return alloc_pages_node(n, mapping_gfp_mask(x), 0);
+ }
+ return alloc_pages(mapping_gfp_mask(x), 0);
+}
+EXPORT_SYMBOL(page_cache_alloc);
+
+struct page *page_cache_alloc_cold(struct address_space *x)
+{
+ if (cpuset_do_page_mem_spread()) {
+ int n = cpuset_mem_spread_node();
+ return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
+ }
+ return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
+}
+EXPORT_SYMBOL(page_cache_alloc_cold);
+#endif
+
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
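
The inclusive-`lend' convention documented at filemap_write_and_wait_range()
above exists so a caller can name "everything to end of file" without
sampling i_size first. A minimal sketch (the helper name is hypothetical,
not part of this patch):

	/*
	 * Write out and wait upon every byte of the file. Per the comment
	 * above, lend == -1 denotes the very last representable byte, so
	 * pages instantiated beyond the current EOF are still covered.
	 */
	static int sync_whole_file(struct address_space *mapping)
	{
		return filemap_write_and_wait_range(mapping, 0, -1);
	}
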
diff --git a/mm/highmem.c b/mm/highmem.c
index ce2e7e8..d0ea1ee 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,6 +26,7 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
+#include <linux/blktrace_api.h>
#include <asm/tlbflush.h>
static mempool_t *page_pool, *isa_page_pool;
@@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
pool = isa_page_pool;
}
+ blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
/*
* slow path
*/
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e93cc74..4f71cfd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
return mpol_check_policy(mode, nodes);
}
+
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy. Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+ if (p->mempolicy)
+ p->flags |= PF_MEMPOLICY;
+ else
+ p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+ mpol_fix_fork_child_flag(current);
+}
+
/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
@@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
return PTR_ERR(new);
mpol_free(current->mempolicy);
current->mempolicy = new;
+ mpol_set_task_struct_flag();
if (new && new->policy == MPOL_INTERLEAVE)
current->il_next = first_node(new->v.nodes);
return 0;
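
The payoff of caching the policy in a task flag appears later in this same
diff: the slab hot path tests current->flags once, together with
PF_SPREAD_SLAB, instead of chasing the current->mempolicy pointer on every
allocation. Quoting the mm/slab.c hunk below:

	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
		objp = alternate_node_alloc(cachep, flags);
		if (objp != NULL)
			return objp;
	}
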
diff --git a/mm/msync.c b/mm/msync.c
index 3563a56..2672b8d 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -9,20 +9,24 @@
*/
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
+#include <linux/writeback.h>
+#include <linux/file.h>
#include <linux/syscalls.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
-static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end)
{
pte_t *pte;
spinlock_t *ptl;
int progress = 0;
+ unsigned long ret = 0;
again:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -43,58 +47,64 @@ again:
if (!page)
continue;
if (ptep_clear_flush_dirty(vma, addr, pte) ||
- page_test_and_clear_dirty(page))
- set_page_dirty(page);
+ page_test_and_clear_dirty(page))
+ ret += set_page_dirty(page);
progress += 3;
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
if (addr != end)
goto again;
+ return ret;
}
-static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end)
+static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
+ pud_t *pud, unsigned long addr, unsigned long end)
{
pmd_t *pmd;
unsigned long next;
+ unsigned long ret = 0;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- msync_pte_range(vma, pmd, addr, next);
+ ret += msync_pte_range(vma, pmd, addr, next);
} while (pmd++, addr = next, addr != end);
+ return ret;
}
-static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end)
+static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
+ pgd_t *pgd, unsigned long addr, unsigned long end)
{
pud_t *pud;
unsigned long next;
+ unsigned long ret = 0;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- msync_pmd_range(vma, pud, addr, next);
+ ret += msync_pmd_range(vma, pud, addr, next);
} while (pud++, addr = next, addr != end);
+ return ret;
}
-static void msync_page_range(struct vm_area_struct *vma,
+static unsigned long msync_page_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
pgd_t *pgd;
unsigned long next;
+ unsigned long ret = 0;
/* For hugepages we can't go walking the page table normally,
* but that's ok, hugetlbfs is memory based, so we don't need
* to do anything more on an msync().
*/
if (vma->vm_flags & VM_HUGETLB)
- return;
+ return 0;
BUG_ON(addr >= end);
pgd = pgd_offset(vma->vm_mm, addr);
@@ -103,8 +113,9 @@ static void msync_page_range(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- msync_pud_range(vma, pgd, addr, next);
+ ret += msync_pud_range(vma, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
+ return ret;
}
/*
@@ -118,50 +129,28 @@ static void msync_page_range(struct vm_area_struct *vma,
* So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
* applications.
*/
-static int msync_interval(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end, int flags)
+static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long end, int flags,
+ unsigned long *nr_pages_dirtied)
{
- int ret = 0;
struct file *file = vma->vm_file;
if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
return -EBUSY;
- if (file && (vma->vm_flags & VM_SHARED)) {
- msync_page_range(vma, addr, end);
-
- if (flags & MS_SYNC) {
- struct address_space *mapping = file->f_mapping;
- int err;
-
- ret = filemap_fdatawrite(mapping);
- if (file->f_op && file->f_op->fsync) {
- /*
- * We don't take i_mutex here because mmap_sem
- * is already held.
- */
- err = file->f_op->fsync(file,file->f_dentry,1);
- if (err && !ret)
- ret = err;
- }
- err = filemap_fdatawait(mapping);
- if (!ret)
- ret = err;
- }
- }
- return ret;
+ if (file && (vma->vm_flags & VM_SHARED))
+ *nr_pages_dirtied = msync_page_range(vma, addr, end);
+ return 0;
}
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
unsigned long end;
struct vm_area_struct *vma;
- int unmapped_error, error = -EINVAL;
-
- if (flags & MS_SYNC)
- current->flags |= PF_SYNCWRITE;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+ int done = 0;
- down_read(&current->mm->mmap_sem);
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
goto out;
if (start & ~PAGE_MASK)
@@ -180,13 +169,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
* If the interval [start,end) covers some unmapped address ranges,
* just ignore them, but return -ENOMEM at the end.
*/
+ down_read(&current->mm->mmap_sem);
+ if (flags & MS_SYNC)
+ current->flags |= PF_SYNCWRITE;
vma = find_vma(current->mm, start);
- unmapped_error = 0;
- for (;;) {
- /* Still start < end. */
+ if (!vma) {
error = -ENOMEM;
- if (!vma)
- goto out;
+ goto out_unlock;
+ }
+ do {
+ unsigned long nr_pages_dirtied = 0;
+ struct file *file;
+
/* Here start < vma->vm_end. */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
@@ -195,22 +189,47 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
/* Here vma->vm_start <= start < vma->vm_end. */
if (end <= vma->vm_end) {
if (start < end) {
- error = msync_interval(vma, start, end, flags);
+ error = msync_interval(vma, start, end, flags,
+ &nr_pages_dirtied);
if (error)
- goto out;
+ goto out_unlock;
}
error = unmapped_error;
- goto out;
+ done = 1;
+ } else {
+ /* Here vma->vm_start <= start < vma->vm_end < end. */
+ error = msync_interval(vma, start, vma->vm_end, flags,
+ &nr_pages_dirtied);
+ if (error)
+ goto out_unlock;
}
- /* Here vma->vm_start <= start < vma->vm_end < end. */
- error = msync_interval(vma, start, vma->vm_end, flags);
- if (error)
- goto out;
+ file = vma->vm_file;
start = vma->vm_end;
- vma = vma->vm_next;
- }
-out:
- up_read(&current->mm->mmap_sem);
+ if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
+ get_file(file);
+ up_read(&current->mm->mmap_sem);
+ balance_dirty_pages_ratelimited_nr(file->f_mapping,
+ nr_pages_dirtied);
+ fput(file);
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, start);
+ } else if ((flags & MS_SYNC) && file &&
+ (vma->vm_flags & VM_SHARED)) {
+ get_file(file);
+ up_read(&current->mm->mmap_sem);
+ error = do_fsync(file, 0);
+ fput(file);
+ down_read(&current->mm->mmap_sem);
+ if (error)
+ goto out_unlock;
+ vma = find_vma(current->mm, start);
+ } else {
+ vma = vma->vm_next;
+ }
+ } while (vma && !done);
+out_unlock:
current->flags &= ~PF_SYNCWRITE;
+ up_read(&current->mm->mmap_sem);
+out:
return error;
}
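
The userspace contract after this rework: MS_ASYNC only propagates pte dirty
bits into the page cache and throttles the caller through
balance_dirty_pages_ratelimited_nr(), leaving the writeout itself to pdflush,
while MS_SYNC writes and waits per file via do_fsync(). A sketch of standard
msync(2) usage (the helper is illustrative):

	#include <sys/mman.h>

	/*
	 * Flush a shared file mapping. With 'durable' set, do not return
	 * until the data has reached the file; otherwise just mark the
	 * pages dirty for eventual writeout.
	 */
	static int flush_mapping(void *addr, size_t len, int durable)
	{
		return msync(addr, len, durable ? MS_SYNC : MS_ASYNC);
	}
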
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 945559f..893d767 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -75,12 +75,12 @@ int vm_dirty_ratio = 40;
* The interval between `kupdate'-style writebacks, in centiseconds
* (hundredths of a second)
*/
-int dirty_writeback_centisecs = 5 * 100;
+int dirty_writeback_interval = 5 * HZ;
/*
* The longest number of centiseconds for which data is allowed to remain dirty
*/
-int dirty_expire_centisecs = 30 * 100;
+int dirty_expire_interval = 30 * HZ;
/*
* Flag that makes the machine dump writes/reads and block dirtyings.
@@ -88,7 +88,8 @@ int dirty_expire_centisecs = 30 * 100;
int block_dump;
/*
- * Flag that puts the machine in "laptop mode".
+ * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
+ * a full sync is triggered after this time elapses without any disk activity.
*/
int laptop_mode;
@@ -255,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping)
}
/**
- * balance_dirty_pages_ratelimited - balance dirty memory state
+ * balance_dirty_pages_ratelimited_nr - balance dirty memory state
* @mapping: address_space which was dirtied
+ * @nr_pages: number of pages which the caller has just dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
@@ -267,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping)
* limit we decrease the ratelimiting by a lot, to prevent individual processes
* from overshooting the limit by (ratelimit_pages) each.
*/
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
+ unsigned long nr_pages_dirtied)
{
- static DEFINE_PER_CPU(int, ratelimits) = 0;
- long ratelimit;
+ static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
+ unsigned long ratelimit;
+ unsigned long *p;
ratelimit = ratelimit_pages;
if (dirty_exceeded)
@@ -280,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
* Check the rate limiting. Also, we do not want to throttle real-time
* tasks in balance_dirty_pages(). Period.
*/
- if (get_cpu_var(ratelimits)++ >= ratelimit) {
- __get_cpu_var(ratelimits) = 0;
- put_cpu_var(ratelimits);
+ preempt_disable();
+ p = &__get_cpu_var(ratelimits);
+ *p += nr_pages_dirtied;
+ if (unlikely(*p >= ratelimit)) {
+ *p = 0;
+ preempt_enable();
balance_dirty_pages(mapping);
return;
}
- put_cpu_var(ratelimits);
+ preempt_enable();
}
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
void throttle_vm_writeout(void)
{
@@ -380,8 +387,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
- * Try to run once per dirty_writeback_centisecs. But if a writeback event
- * takes longer than a dirty_writeback_centisecs interval, then leave a
+ * Try to run once per dirty_writeback_interval. But if a writeback event
+ * takes longer than one dirty_writeback_interval, then leave a
* one-second gap.
*
* older_than_this takes precedence over nr_to_write. So we'll only write back
@@ -406,9 +413,9 @@ static void wb_kupdate(unsigned long arg)
sync_supers();
get_writeback_state(&wbs);
- oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
+ oldest_jif = jiffies - dirty_expire_interval;
start_jif = jiffies;
- next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
+ next_jif = start_jif + dirty_writeback_interval;
nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
while (nr_to_write > 0) {
@@ -425,7 +432,7 @@ static void wb_kupdate(unsigned long arg)
}
if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ;
- if (dirty_writeback_centisecs)
+ if (dirty_writeback_interval)
mod_timer(&wb_timer, next_jif);
}
@@ -435,11 +442,11 @@ static void wb_kupdate(unsigned long arg)
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, file, buffer, length, ppos);
- if (dirty_writeback_centisecs) {
+ proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+ if (dirty_writeback_interval) {
mod_timer(&wb_timer,
- jiffies + (dirty_writeback_centisecs * HZ) / 100);
- } else {
+ jiffies + dirty_writeback_interval);
+ } else {
del_timer(&wb_timer);
}
return 0;
@@ -468,7 +475,7 @@ static void laptop_timer_fn(unsigned long unused)
*/
void laptop_io_completion(void)
{
- mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+ mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
}
/*
@@ -544,7 +551,7 @@ void __init page_writeback_init(void)
if (vm_dirty_ratio <= 0)
vm_dirty_ratio = 1;
}
- mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
+ mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
}
@@ -621,8 +628,6 @@ EXPORT_SYMBOL(write_one_page);
*/
int __set_page_dirty_nobuffers(struct page *page)
{
- int ret = 0;
-
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
@@ -644,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page)
I_DIRTY_PAGES);
}
}
+ return 1;
}
- return ret;
+ return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -675,8 +681,10 @@ int fastcall set_page_dirty(struct page *page)
return (*spd)(page);
return __set_page_dirty_buffers(page);
}
- if (!PageDirty(page))
- SetPageDirty(page);
+ if (!PageDirty(page)) {
+ if (!TestSetPageDirty(page))
+ return 1;
+ }
return 0;
}
EXPORT_SYMBOL(set_page_dirty);
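
Existing callers of balance_dirty_pages_ratelimited() keep working: the old
name survives as a wrapper over the new _nr() entry point. That wrapper lives
in a header outside mm/ and so is not shown in this diffstat, but presumably
it reduces to:

	static inline void
	balance_dirty_pages_ratelimited(struct address_space *mapping)
	{
		balance_dirty_pages_ratelimited_nr(mapping, 1);
	}
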
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b7f14a4..a5c3f8b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -943,7 +943,8 @@ restart:
goto got_pg;
do {
- wakeup_kswapd(*z, order);
+ if (cpuset_zone_allowed(*z, gfp_mask))
+ wakeup_kswapd(*z, order);
} while (*(++z));
/*
diff --git a/mm/readahead.c b/mm/readahead.c
index 301b36c..0f142a4 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -555,6 +555,7 @@ recheck:
out:
return ra->prev_page + 1;
}
+EXPORT_SYMBOL_GPL(page_cache_readahead);
/*
* handle_ra_miss() is called when it is known that a page which should have
diff --git a/mm/slab.c b/mm/slab.c
index 1c8f5ee..26138c9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -94,6 +94,7 @@
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compiler.h>
+#include <linux/cpuset.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
@@ -173,12 +174,12 @@
SLAB_CACHE_DMA | \
SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU)
+ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
#else
# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU)
+ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
#endif
/*
@@ -898,6 +899,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
#ifdef CONFIG_NUMA
static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
static struct array_cache **alloc_alien_cache(int node, int limit)
{
@@ -2807,11 +2809,10 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
struct array_cache *ac;
#ifdef CONFIG_NUMA
- if (unlikely(current->mempolicy && !in_interrupt())) {
- int nid = slab_node(current->mempolicy);
-
- if (nid != numa_node_id())
- return __cache_alloc_node(cachep, flags, nid);
+ if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
+ objp = alternate_node_alloc(cachep, flags);
+ if (objp != NULL)
+ return objp;
}
#endif
@@ -2847,6 +2848,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
#ifdef CONFIG_NUMA
/*
+ * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
+ *
+ * If we are in_interrupt, then process context, including cpusets and
+ * mempolicy, may not apply and should not be used for allocation policy.
+ */
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+ int nid_alloc, nid_here;
+
+ if (in_interrupt())
+ return NULL;
+ nid_alloc = nid_here = numa_node_id();
+ if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
+ nid_alloc = cpuset_mem_spread_node();
+ else if (current->mempolicy)
+ nid_alloc = slab_node(current->mempolicy);
+ if (nid_alloc != nid_here)
+ return __cache_alloc_node(cachep, flags, nid_alloc);
+ return NULL;
+}
+
+/*
* An interface to enable slab creation on nodeid
*/
static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 365ed6f..39aa9d1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry ";
struct swap_list_t swap_list = {-1, -1};
-struct swap_info_struct swap_info[MAX_SWAPFILES];
+static struct swap_info_struct swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
@@ -417,6 +417,61 @@ void free_swap_and_cache(swp_entry_t entry)
}
}
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Find the swap type that corresponds to given device (if any)
+ *
+ * This is needed for software suspend and is done in such a way that inode
+ * aliasing is allowed.
+ */
+int swap_type_of(dev_t device)
+{
+ int i;
+
+ spin_lock(&swap_lock);
+ for (i = 0; i < nr_swapfiles; i++) {
+ struct inode *inode;
+
+ if (!(swap_info[i].flags & SWP_WRITEOK))
+ continue;
+ if (!device) {
+ spin_unlock(&swap_lock);
+ return i;
+ }
+		inode = swap_info[i].swap_file->f_dentry->d_inode;
+ if (S_ISBLK(inode->i_mode) &&
+ device == MKDEV(imajor(inode), iminor(inode))) {
+ spin_unlock(&swap_lock);
+ return i;
+ }
+ }
+ spin_unlock(&swap_lock);
+ return -ENODEV;
+}
+
+/*
+ * Return either the total number of swap pages of given type, or the number
+ * of free pages of that type (depending on @free)
+ *
+ * This is needed for software suspend
+ */
+unsigned int count_swap_pages(int type, int free)
+{
+ unsigned int n = 0;
+
+ if (type < nr_swapfiles) {
+ spin_lock(&swap_lock);
+ if (swap_info[type].flags & SWP_WRITEOK) {
+ n = swap_info[type].pages;
+ if (free)
+ n -= swap_info[type].inuse_pages;
+ }
+ spin_unlock(&swap_lock);
+ }
+ return n;
+}
+#endif
+
/*
* No need to decide whether this PTE shares the swap entry with others,
* just let do_wp_page work it out if a write is requested later - to
diff --git a/mm/util.c b/mm/util.c
index 5f4bb59..49e29f7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,6 +1,8 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
+#include <linux/err.h>
+#include <asm/uaccess.h>
/**
* kzalloc - allocate memory. The memory is set to zero.
@@ -37,3 +39,38 @@ char *kstrdup(const char *s, gfp_t gfp)
return buf;
}
EXPORT_SYMBOL(kstrdup);
+
+/*
+ * strndup_user - duplicate an existing string from user space
+ *
+ * @s: The string to duplicate
+ * @n: Maximum number of bytes to copy, including the trailing NUL.
+ */
+char *strndup_user(const char __user *s, long n)
+{
+ char *p;
+ long length;
+
+ length = strnlen_user(s, n);
+
+ if (!length)
+ return ERR_PTR(-EFAULT);
+
+ if (length > n)
+ return ERR_PTR(-EINVAL);
+
+ p = kmalloc(length, GFP_KERNEL);
+
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(p, s, length)) {
+ kfree(p);
+ return ERR_PTR(-EFAULT);
+ }
+
+ p[length - 1] = '\0';
+
+ return p;
+}
+EXPORT_SYMBOL(strndup_user);
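
Because strndup_user() encodes failure as an ERR_PTR() value rather than
returning NULL, callers must test the result with IS_ERR(). A typical call
site might look like this (the function and the PAGE_SIZE bound are
illustrative):

	static long example_set_label(const char __user *ulabel)
	{
		char *label = strndup_user(ulabel, PAGE_SIZE);

		if (IS_ERR(label))
			return PTR_ERR(label);	/* -EFAULT, -EINVAL or -ENOMEM */
		/* ... use the NUL-terminated kernel copy of the string ... */
		kfree(label);
		return 0;
	}
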