Diffstat (limited to 'mm')
-rw-r--r--   mm/Makefile          |    3
-rw-r--r--   mm/backing-dev.c     |   69
-rw-r--r--   mm/filemap.c         |   65
-rw-r--r--   mm/filemap.h         |    1
-rw-r--r--   mm/hugetlb.c         |   30
-rw-r--r--   mm/memory.c          |   10
-rw-r--r--   mm/mempolicy.c       |    2
-rw-r--r--   mm/mmap.c            |   21
-rw-r--r--   mm/oom_kill.c        |    1
-rw-r--r--   mm/page-writeback.c  |   17
-rw-r--r--   mm/page_alloc.c      |  105
-rw-r--r--   mm/readahead.c       |    1
-rw-r--r--   mm/rmap.c            |   41
-rw-r--r--   mm/shmem.c           |   84
-rw-r--r--   mm/shmem_acl.c       |    2
-rw-r--r--   mm/slab.c            |   49
-rw-r--r--   mm/truncate.c        |    4
-rw-r--r--   mm/util.c            |    6
-rw-r--r--   mm/vmalloc.c         |    7
-rw-r--r--   mm/vmscan.c          |   12
-rw-r--r--   mm/vmstat.c          |    1
21 files changed, 377 insertions(+), 154 deletions(-)
diff --git a/mm/Makefile b/mm/Makefile
index 12b3a4e..f3c077e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,8 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
-			   prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
+			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+			   $(mmu-y)
 
 ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
 obj-y += bounce.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
new file mode 100644
index 0000000..f50a281
--- /dev/null
+++ b/mm/backing-dev.c
@@ -0,0 +1,69 @@
+
+#include <linux/wait.h>
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static wait_queue_head_t congestion_wqh[2] = {
+		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
+		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
+	};
+
+
+void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+	enum bdi_state bit;
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+	clear_bit(bit, &bdi->state);
+	smp_mb__after_clear_bit();
+	if (waitqueue_active(wqh))
+		wake_up(wqh);
+}
+EXPORT_SYMBOL(clear_bdi_congested);
+
+void set_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+	enum bdi_state bit;
+
+	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+	set_bit(bit, &bdi->state);
+}
+EXPORT_SYMBOL(set_bdi_congested);
+
+/**
+ * congestion_wait - wait for a backing_dev to become uncongested
+ * @rw: READ or WRITE
+ * @timeout: timeout in jiffies
+ *
+ * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
+ * write congestion.  If no backing_devs are congested then just wait for the
+ * next write to be completed.
+ */
+long congestion_wait(int rw, long timeout)
+{
+	long ret;
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	ret = io_schedule_timeout(timeout);
+	finish_wait(wqh, &wait);
+	return ret;
+}
+EXPORT_SYMBOL(congestion_wait);
+
+/**
+ * congestion_end - wake up sleepers on a congested backing_dev_info
+ * @rw: READ or WRITE
+ */
+void congestion_end(int rw)
+{
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	if (waitqueue_active(wqh))
+		wake_up(wqh);
+}
+EXPORT_SYMBOL(congestion_end);
diff --git a/mm/filemap.c b/mm/filemap.c
index ec4692359..8558732 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -75,8 +75,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *  ->mmap_sem
  *    ->lock_page		(access_process_vm)
  *
- *  ->mmap_sem
- *    ->i_mutex			(msync)
+ *  ->i_mutex			(generic_file_buffered_write)
+ *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
  *
  *  ->i_mutex
  *    ->i_alloc_sem             (various)
@@ -1139,11 +1139,11 @@ success:
 }
 
 /**
- * __generic_file_aio_read - generic filesystem read routine
+ * generic_file_aio_read - generic filesystem read routine
  * @iocb:	kernel I/O control block
  * @iov:	io vector request
  * @nr_segs:	number of segments in the iovec
- * @ppos:	current file position
+ * @pos:	current file position
  *
  * This is the "read()" routine for all filesystems
  * that can use the page cache directly.
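
An aside before the remaining filemap.c hunks: mm/backing-dev.c above decouples congestion waiting from the block layer, so any backing_dev_info can throttle and wake writers. A minimal sketch of the intended caller pattern, assuming a hypothetical driver; my_bdi and my_queue_is_full() are assumed names, not part of this diff:

#include <linux/backing-dev.h>
#include <linux/fs.h>	/* WRITE */

static void my_throttle_writer(struct backing_dev_info *my_bdi)
{
	while (my_queue_is_full()) {
		/* Advertise congestion so balance_dirty_pages() et al back off. */
		set_bdi_congested(my_bdi, WRITE);
		/* Sleep up to 100ms, or less if a wake-up arrives sooner. */
		congestion_wait(WRITE, HZ / 10);
	}
	/* Queue drained: clear the bit and wake anyone in congestion_wait(). */
	clear_bdi_congested(my_bdi, WRITE);
}

Note that congestion_wait() sleeps on a global per-direction wait queue, so a wake-up may come from any device; a caller must re-check its own state in a loop, as the sketch does.
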
@@ -1198,8 +1198,10 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			if (retval > 0)
 				*ppos = pos + retval;
 		}
-		file_accessed(filp);
-		goto out;
+		if (likely(retval != 0)) {
+			file_accessed(filp);
+			goto out;
+		}
 	}
 
 	retval = 0;
@@ -2220,7 +2222,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t *ppos)
 {
 	struct file *file = iocb->ki_filp;
-	const struct address_space * mapping = file->f_mapping;
+	struct address_space * mapping = file->f_mapping;
 	size_t ocount;		/* original count */
 	size_t count;		/* after file limit checks */
 	struct inode *inode = mapping->host;
@@ -2273,8 +2275,11 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (unlikely(file->f_flags & O_DIRECT)) {
-		written = generic_file_direct_write(iocb, iov,
-				&nr_segs, pos, ppos, count, ocount);
+		loff_t endbyte;
+		ssize_t written_buffered;
+
+		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
+							ppos, count, ocount);
 		if (written < 0 || written == count)
 			goto out;
 		/*
@@ -2283,10 +2288,46 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 		 */
 		pos += written;
 		count -= written;
-	}
+		written_buffered = generic_file_buffered_write(iocb, iov,
+						nr_segs, pos, ppos, count,
+						written);
+		/*
+		 * If generic_file_buffered_write() retuned a synchronous error
+		 * then we want to return the number of bytes which were
+		 * direct-written, or the error code if that was zero.  Note
+		 * that this differs from normal direct-io semantics, which
+		 * will return -EFOO even if some bytes were written.
+		 */
+		if (written_buffered < 0) {
+			err = written_buffered;
+			goto out;
+		}
 
-	written = generic_file_buffered_write(iocb, iov, nr_segs,
-			pos, ppos, count, written);
+		/*
+		 * We need to ensure that the page cache pages are written to
+		 * disk and invalidated to preserve the expected O_DIRECT
+		 * semantics.
+		 */
+		endbyte = pos + written_buffered - written - 1;
+		err = do_sync_file_range(file, pos, endbyte,
+					 SYNC_FILE_RANGE_WAIT_BEFORE|
+					 SYNC_FILE_RANGE_WRITE|
+					 SYNC_FILE_RANGE_WAIT_AFTER);
+		if (err == 0) {
+			written = written_buffered;
+			invalidate_mapping_pages(mapping,
+						 pos >> PAGE_CACHE_SHIFT,
+						 endbyte >> PAGE_CACHE_SHIFT);
+		} else {
+			/*
+			 * We don't know how much we wrote, so just return
+			 * the number of bytes which were direct-written
+			 */
+		}
+	} else {
+		written = generic_file_buffered_write(iocb, iov, nr_segs,
+				pos, ppos, count, written);
+	}
 out:
 	current->backing_dev_info = NULL;
 	return written ? written : err;
diff --git a/mm/filemap.h b/mm/filemap.h
index 3f2a343..c2bff04 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -12,7 +12,6 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/uio.h>
-#include <linux/config.h>
 #include <linux/uaccess.h>
 
 size_t
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c7d03d..2dbec90 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -356,14 +356,16 @@ nomem:
 	return -ENOMEM;
 }
 
-void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			  unsigned long end)
+void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+			    unsigned long end)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
 	struct page *page;
+	struct page *tmp;
+	LIST_HEAD(page_list);
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~HPAGE_MASK);
@@ -384,12 +386,34 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			continue;
 
 		page = pte_page(pte);
-		put_page(page);
+		list_add(&page->lru, &page_list);
 		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 	}
 
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	list_for_each_entry_safe(page, tmp, &page_list, lru) {
+		list_del(&page->lru);
+		put_page(page);
+	}
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+			  unsigned long end)
+{
+	/*
+	 * It is undesirable to test vma->vm_file as it should be non-null
+	 * for valid hugetlb area. However, vm_file will be NULL in the error
+	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
+	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
+	 * to clean up. Since no pte has actually been setup, it is safe to
+	 * do nothing in this case.
+	 */
+	if (vma->vm_file) {
+		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+		__unmap_hugepage_range(vma, start, end);
+		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+	}
 }
 
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 9cf3f34..156861f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1086,6 +1086,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			default:
 				BUG();
 			}
+			cond_resched();
 		}
 		if (pages) {
 			pages[i] = page;
@@ -1451,6 +1452,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
 			memset(kaddr, 0, PAGE_SIZE);
 		kunmap_atomic(kaddr, KM_USER0);
+		flush_dcache_page(dst);
 		return;
 	}
 
@@ -2169,11 +2171,13 @@ retry:
 	 * after the next truncate_count read.
 	 */
 
-	/* no page was available -- either SIGBUS or OOM */
-	if (new_page == NOPAGE_SIGBUS)
+	/* no page was available -- either SIGBUS, OOM or REFAULT */
+	if (unlikely(new_page == NOPAGE_SIGBUS))
 		return VM_FAULT_SIGBUS;
-	if (new_page == NOPAGE_OOM)
+	else if (unlikely(new_page == NOPAGE_OOM))
 		return VM_FAULT_OOM;
+	else if (unlikely(new_page == NOPAGE_REFAULT))
+		return VM_FAULT_MINOR;
 
 	/*
 	 * Should we do an early C-O-W break?
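
The hugetlb change above collects huge pages on a local list while page_table_lock is held and only calls put_page() after the lock is dropped and the TLB flushed. The same defer-the-expensive-release shape in isolation, as a toy sketch; struct my_obj and my_release() are assumed names, not from this diff:

#include <linux/list.h>
#include <linux/spinlock.h>

struct my_obj {
	struct list_head lru;
};

static void my_reap(struct list_head *busy, spinlock_t *lock)
{
	struct my_obj *obj, *tmp;
	LIST_HEAD(doomed);

	spin_lock(lock);
	/* Cheap part under the lock: just move entries to a private list. */
	list_for_each_entry_safe(obj, tmp, busy, lru)
		list_move(&obj->lru, &doomed);
	spin_unlock(lock);

	/* Expensive part outside the lock, where it cannot stall other CPUs. */
	list_for_each_entry_safe(obj, tmp, &doomed, lru) {
		list_del(&obj->lru);
		my_release(obj);	/* assumed helper */
	}
}
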
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 25788b1..617fb31 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -727,7 +727,7 @@ int do_migrate_pages(struct mm_struct *mm,
 	return -ENOSYS;
 }
 
-static struct page *new_vma_page(struct page *page, unsigned long private)
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 {
 	return NULL;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -900,17 +900,6 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 	int accountable = 1;
 	unsigned long charged = 0, reqprot = prot;
 
-	if (file) {
-		if (is_file_hugepages(file))
-			accountable = 0;
-
-		if (!file->f_op || !file->f_op->mmap)
-			return -ENODEV;
-
-		if ((prot & PROT_EXEC) &&
-		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
-			return -EPERM;
-	}
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
 	 *
@@ -1000,6 +989,16 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
+			if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+				if (vm_flags & VM_EXEC)
+					return -EPERM;
+				vm_flags &= ~VM_MAYEXEC;
+			}
+			if (is_file_hugepages(file))
+				accountable = 0;
+
+			if (!file->f_op || !file->f_op->mmap)
+				return -ENODEV;
 			break;
 
 		default:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 20f41b0..2e3ce3a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -15,6 +15,7 @@
  *  kernel subsystems and hints as to where to find out what things do.
  */
 
+#include <linux/oom.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/swap.h>
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a0f3390..8d9b19f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -222,7 +222,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 			if (pages_written >= write_chunk)
 				break;		/* We've done our duty */
 		}
-		blk_congestion_wait(WRITE, HZ/10);
+		congestion_wait(WRITE, HZ/10);
 	}
 
 	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
@@ -314,7 +314,7 @@ void throttle_vm_writeout(void)
 		if (global_page_state(NR_UNSTABLE_NFS) +
 			global_page_state(NR_WRITEBACK) <= dirty_thresh)
 				break;
-		blk_congestion_wait(WRITE, HZ/10);
+		congestion_wait(WRITE, HZ/10);
 	}
 }
 
@@ -351,7 +351,7 @@ static void background_writeout(unsigned long _min_pages)
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
 			/* Wrote less than expected */
-			blk_congestion_wait(WRITE, HZ/10);
+			congestion_wait(WRITE, HZ/10);
 			if (!wbc.encountered_congestion)
 				break;
 		}
@@ -422,7 +422,7 @@ static void wb_kupdate(unsigned long arg)
 		writeback_inodes(&wbc);
 		if (wbc.nr_to_write > 0) {
 			if (wbc.encountered_congestion)
-				blk_congestion_wait(WRITE, HZ/10);
+				congestion_wait(WRITE, HZ/10);
 			else
 				break;	/* All the old data is written */
 		}
@@ -956,15 +956,6 @@ int test_set_page_writeback(struct page *page)
 EXPORT_SYMBOL(test_set_page_writeback);
 
 /*
- * Wakes up tasks that are being throttled due to writeback congestion
- */
-void writeback_congestion_end(void)
-{
-	blk_congestion_end(WRITE);
-}
-EXPORT_SYMBOL(writeback_congestion_end);
-
-/*
  * Return true if any of the pages in the mapping are marged with the
  * passed tag.
 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4f59d90..ebd425c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -39,6 +39,7 @@
 #include <linux/stop_machine.h>
 #include <linux/sort.h>
 #include <linux/pfn.h>
+#include <linux/backing-dev.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -495,17 +496,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	int i;
 	int reserved = 0;
 
-	arch_free_page(page, order);
-	if (!PageHighMem(page))
-		debug_check_no_locks_freed(page_address(page),
-					   PAGE_SIZE<<order);
-
 	for (i = 0 ; i < (1 << order) ; ++i)
 		reserved += free_pages_check(page + i);
 	if (reserved)
 		return;
 
+	if (!PageHighMem(page))
+		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
+	arch_free_page(page, order);
 	kernel_map_pages(page, 1 << order, 0);
+
 	local_irq_save(flags);
 	__count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, order);
@@ -781,13 +781,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
 
-	arch_free_page(page, 0);
-
 	if (PageAnon(page))
 		page->mapping = NULL;
 	if (free_pages_check(page))
 		return;
 
+	if (!PageHighMem(page))
+		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
 	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
@@ -900,7 +901,8 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		      int classzone_idx, int alloc_flags)
 {
 	/* free_pages my go negative - that's OK */
-	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
+	unsigned long min = mark;
+	long free_pages = z->free_pages - (1 << order) + 1;
 	int o;
 
 	if (alloc_flags & ALLOC_HIGH)
@@ -1049,7 +1051,7 @@ nofail_alloc:
 			if (page)
 				goto got_pg;
 			if (gfp_mask & __GFP_NOFAIL) {
-				blk_congestion_wait(WRITE, HZ/50);
+				congestion_wait(WRITE, HZ/50);
 				goto nofail_alloc;
 			}
 		}
@@ -1112,7 +1114,7 @@ rebalance:
 			do_retry = 1;
 	}
 	if (do_retry) {
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 		goto rebalance;
 	}
 
@@ -2050,8 +2052,8 @@ int __init early_pfn_to_nid(unsigned long pfn)
 
 /**
  * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed
- * @max_low_pfn: The highest PFN that till be passed to free_bootmem_node
+ * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
+ * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
 *
 * If an architecture guarantees that all ranges registered with
 * add_active_ranges() contain no holes and may be freed, this
@@ -2081,11 +2083,11 @@ void __init free_bootmem_with_active_regions(int nid,
 
 /**
  * sparse_memory_present_with_active_regions - Call memory_present for each active range
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used
+ * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
 *
 * If an architecture guarantees that all ranges registered with
 * add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling memory_present() manually.
+ * function may be used instead of calling memory_present() manually.
 */
 void __init sparse_memory_present_with_active_regions(int nid)
 {
@@ -2155,14 +2157,14 @@ static void __init account_node_boundary(unsigned int nid,
 
 /**
  * get_pfn_range_for_nid - Return the start and end page frames for a node
- * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned
- * @start_pfn: Passed by reference. On return, it will have the node start_pfn
- * @end_pfn: Passed by reference. On return, it will have the node end_pfn
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
 *
 * It returns the start and end page frame of a node based on information
 * provided by an arch calling add_active_range(). If called for a node
 * with no available memory, a warning is printed and the start and end
- * PFNs will be 0
+ * PFNs will be 0.
 */
 void __init get_pfn_range_for_nid(unsigned int nid,
 			unsigned long *start_pfn, unsigned long *end_pfn)
@@ -2215,7 +2217,7 @@ unsigned long __init zone_spanned_pages_in_node(int nid,
 
 /*
  * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
- * then all holes in the requested range will be accounted for
+ * then all holes in the requested range will be accounted for.
 */
 unsigned long __init __absent_pages_in_range(int nid,
 				unsigned long range_start_pfn,
@@ -2268,7 +2270,7 @@ unsigned long __init __absent_pages_in_range(int nid,
 * @start_pfn: The start PFN to start searching for holes
 * @end_pfn: The end PFN to stop searching for holes
 *
- * It returns the number of pages frames in memory holes within a range
+ * It returns the number of pages frames in memory holes within a range.
 */
 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
 					unsigned long end_pfn)
@@ -2293,19 +2295,6 @@ unsigned long __init zone_absent_pages_in_node(int nid,
 	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
 }
 
-/* Return the zone index a PFN is in */
-int memmap_zone_idx(struct page *lmem_map)
-{
-	int i;
-	unsigned long phys_addr = virt_to_phys(lmem_map);
-	unsigned long pfn = phys_addr >> PAGE_SHIFT;
-
-	for (i = 0; i < MAX_NR_ZONES; i++)
-		if (pfn < arch_zone_highest_possible_pfn[i])
-			break;
-
-	return i;
-}
 #else
 static inline unsigned long zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
@@ -2324,10 +2313,6 @@ static inline unsigned long zone_absent_pages_in_node(int nid,
 	return zholes_size[zone_type];
 }
 
-static inline int memmap_zone_idx(struct page *lmem_map)
-{
-	return MAX_NR_ZONES;
-}
 #endif
 
 static void __init calculate_node_totalpages(struct pglist_data *pgdat,
@@ -2582,11 +2567,12 @@ void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
 
 /**
  * remove_all_active_ranges - Remove all currently registered regions
+ *
 * During discovery, it may be found that a table like SRAT is invalid
 * and an alternative discovery method must be used. This function removes
 * all currently registered regions.
 */
-void __init remove_all_active_ranges()
+void __init remove_all_active_ranges(void)
 {
 	memset(early_node_map, 0, sizeof(early_node_map));
 	nr_nodemap_entries = 0;
@@ -2636,7 +2622,7 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid)
 * find_min_pfn_with_active_regions - Find the minimum PFN registered
 *
 * It returns the minimum PFN based on information provided via
- * add_active_range()
+ * add_active_range().
 */
 unsigned long __init find_min_pfn_with_active_regions(void)
 {
@@ -2647,7 +2633,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
 * find_max_pfn_with_active_regions - Find the maximum PFN registered
 *
 * It returns the maximum PFN based on information provided via
- * add_active_range()
+ * add_active_range().
 */
 unsigned long __init find_max_pfn_with_active_regions(void)
 {
@@ -2662,10 +2648,7 @@ unsigned long __init find_max_pfn_with_active_regions(void)
 
 /**
  * free_area_init_nodes - Initialise all pg_data_t and zone data
- * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA
- * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32
- * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL
- * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM
+ * @max_zone_pfn: an array of max PFNs for each zone
 *
 * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by add_active_range(), the size of each
@@ -2723,14 +2706,15 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
 /**
- * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA
- * @new_dma_reserve - The number of pages to mark reserved
+ * set_dma_reserve - set the specified number of pages reserved in the first zone
+ * @new_dma_reserve: The number of pages to mark reserved
 *
 * The per-cpu batchsize and zone watermarks are determined by present_pages.
 * In the DMA zone, a significant percentage may be consumed by kernel image
 * and other unfreeable allocations which can skew the watermarks badly. This
- * function may optionally be used to account for unfreeable pages in
- * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize
+ * function may optionally be used to account for unfreeable pages in the
+ * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
+ * smaller per-cpu batchsize.
 */
 void __init set_dma_reserve(unsigned long new_dma_reserve)
 {
@@ -2843,10 +2827,11 @@ static void setup_per_zone_lowmem_reserve(void)
 	calculate_totalreserve_pages();
 }
 
-/*
- * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
- *	that the pages_{min,low,high} values for each zone are set correctly
- *	with respect to min_free_kbytes.
+/**
+ * setup_per_zone_pages_min - called when min_free_kbytes changes.
+ *
+ * Ensures that the pages_{min,low,high} values for each zone are set correctly
+ * with respect to min_free_kbytes.
 */
 void setup_per_zone_pages_min(void)
 {
@@ -3135,3 +3120,19 @@ unsigned long page_to_pfn(struct page *page)
 EXPORT_SYMBOL(pfn_to_page);
 EXPORT_SYMBOL(page_to_pfn);
 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
+
+#if MAX_NUMNODES > 1
+/*
+ * Find the highest possible node id.
+ */
+int highest_possible_node_id(void)
+{
+	unsigned int node;
+	unsigned int highest = 0;
+
+	for_each_node_mask(node, node_possible_map)
+		highest = node;
+	return highest;
+}
+EXPORT_SYMBOL(highest_possible_node_id);
+#endif
diff --git a/mm/readahead.c b/mm/readahead.c
index aa7ec42..1ba736a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -38,6 +38,7 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 	ra->ra_pages = mapping->backing_dev_info->ra_pages;
 	ra->prev_page = -1;
 }
+EXPORT_SYMBOL_GPL(file_ra_state_init);
 
 /*
  * Return max readahead size for this inode in number-of-pages.
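
In the zone_watermark_ok() hunk above, one declaration ("long min = mark, free_pages = ...") becomes two, presumably so that min keeps the full range of the unsigned long mark argument while free_pages stays signed ("free_pages my go negative - that's OK"). A standalone userspace C illustration of the two conversion hazards such code has to dodge; this is a demo, not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long mark = (unsigned long)-42;	/* a huge unsigned watermark */
	long min = mark;	/* old style: out-of-range conversion to signed */
	long free_pages = -1;	/* legitimately negative */

	printf("min as signed long: %ld\n", min);	/* typically -42 */

	/* In a mixed comparison, -1 is promoted to ULONG_MAX: */
	if (free_pages <= mark)
		printf("below watermark\n");
	else
		printf("above watermark, despite free_pages == -1\n");
	return 0;
}

The second branch fires: the signed -1 silently becomes the largest unsigned value, so a depleted zone would look flush. Keeping free_pages signed and comparing it only against signed quantities avoids that trap.
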
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,27 +21,21 @@
  * Lock ordering in mm:
  *
  * inode->i_mutex	(while writing or truncating, not reading or faulting)
- *   inode->i_alloc_sem
- *
- * When a page fault occurs in writing from user to file, down_read
- * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
- * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
- * taken together; in truncation, i_mutex is taken outermost.
- *
- * mm->mmap_sem
- *   page->flags PG_locked (lock_page)
- *     mapping->i_mmap_lock
- *       anon_vma->lock
- *         mm->page_table_lock or pte_lock
- *           zone->lru_lock (in mark_page_accessed, isolate_lru_page)
- *           swap_lock (in swap_duplicate, swap_info_get)
- *             mmlist_lock (in mmput, drain_mmlist and others)
- *             mapping->private_lock (in __set_page_dirty_buffers)
- *             inode_lock (in set_page_dirty's __mark_inode_dirty)
- *               sb_lock (within inode_lock in fs/fs-writeback.c)
- *               mapping->tree_lock (widely used, in set_page_dirty,
- *                 in arch-dependent flush_dcache_mmap_lock,
- *                 within inode_lock in __sync_single_inode)
+ *   inode->i_alloc_sem (vmtruncate_range)
+ *   mm->mmap_sem
+ *     page->flags PG_locked (lock_page)
+ *       mapping->i_mmap_lock
+ *         anon_vma->lock
+ *           mm->page_table_lock or pte_lock
+ *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ *             swap_lock (in swap_duplicate, swap_info_get)
+ *               mmlist_lock (in mmput, drain_mmlist and others)
+ *               mapping->private_lock (in __set_page_dirty_buffers)
+ *               inode_lock (in set_page_dirty's __mark_inode_dirty)
+ *                 sb_lock (within inode_lock in fs/fs-writeback.c)
+ *                 mapping->tree_lock (widely used, in set_page_dirty,
+ *                   in arch-dependent flush_dcache_mmap_lock,
+ *                   within inode_lock in __sync_single_inode)
  */
 
 #include <linux/mm.h>
@@ -576,15 +570,14 @@ void page_add_file_rmap(struct page *page)
 void page_remove_rmap(struct page *page)
 {
 	if (atomic_add_negative(-1, &page->_mapcount)) {
-#ifdef CONFIG_DEBUG_VM
 		if (unlikely(page_mapcount(page) < 0)) {
 			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
 			printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
 			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
 			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
+			BUG();
 		}
-#endif
-		BUG_ON(page_mapcount(page) < 0);
+
 		/*
 		 * It would be tidy to reset the PageAnon mapping here,
 		 * but that might overwrite a racing page_add_anon_rmap
diff --git a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -48,6 +48,7 @@
 #include <linux/ctype.h>
 #include <linux/migrate.h>
 #include <linux/highmem.h>
+#include <linux/backing-dev.h>
 
 #include <asm/uaccess.h>
 #include <asm/div64.h>
@@ -1131,7 +1132,7 @@ repeat:
 			page_cache_release(swappage);
 			if (error == -ENOMEM) {
 				/* let kswapd refresh zone for GFP_ATOMICs */
-				blk_congestion_wait(WRITE, HZ/50);
+				congestion_wait(WRITE, HZ/50);
 			}
 			goto repeat;
 		}
@@ -1362,6 +1363,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mapping->a_ops = &shmem_aops;
 		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_generation = get_seconds();
 		info = SHMEM_I(inode);
 		memset(info, 0, (char *)inode - (char *)info);
 		spin_lock_init(&info->lock);
@@ -1956,6 +1958,85 @@ static struct xattr_handler *shmem_xattr_handlers[] = {
 };
 #endif
 
+static struct dentry *shmem_get_parent(struct dentry *child)
+{
+	return ERR_PTR(-ESTALE);
+}
+
+static int shmem_match(struct inode *ino, void *vfh)
+{
+	__u32 *fh = vfh;
+	__u64 inum = fh[2];
+	inum = (inum << 32) | fh[1];
+	return ino->i_ino == inum && fh[0] == ino->i_generation;
+}
+
+static struct dentry *shmem_get_dentry(struct super_block *sb, void *vfh)
+{
+	struct dentry *de = NULL;
+	struct inode *inode;
+	__u32 *fh = vfh;
+	__u64 inum = fh[2];
+	inum = (inum << 32) | fh[1];
+
+	inode = ilookup5(sb, (unsigned long)(inum+fh[0]), shmem_match, vfh);
+	if (inode) {
+		de = d_find_alias(inode);
+		iput(inode);
+	}
+
+	return de? de: ERR_PTR(-ESTALE);
+}
+
+static struct dentry *shmem_decode_fh(struct super_block *sb, __u32 *fh,
+		int len, int type,
+		int (*acceptable)(void *context, struct dentry *de),
+		void *context)
+{
+	if (len < 3)
+		return ERR_PTR(-ESTALE);
+
+	return sb->s_export_op->find_exported_dentry(sb, fh, NULL, acceptable,
+							context);
+}
+
+static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
+				int connectable)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (*len < 3)
+		return 255;
+
+	if (hlist_unhashed(&inode->i_hash)) {
+		/* Unfortunately insert_inode_hash is not idempotent,
+		 * so as we hash inodes here rather than at creation
+		 * time, we need a lock to ensure we only try
+		 * to do it once
+		 */
+		static DEFINE_SPINLOCK(lock);
+		spin_lock(&lock);
+		if (hlist_unhashed(&inode->i_hash))
+			__insert_inode_hash(inode,
+					    inode->i_ino + inode->i_generation);
+		spin_unlock(&lock);
+	}
+
+	fh[0] = inode->i_generation;
+	fh[1] = inode->i_ino;
+	fh[2] = ((__u64)inode->i_ino) >> 32;
+
+	*len = 3;
+	return 1;
+}
+
+static struct export_operations shmem_export_ops = {
+	.get_parent	= shmem_get_parent,
+	.get_dentry	= shmem_get_dentry,
+	.encode_fh	= shmem_encode_fh,
+	.decode_fh	= shmem_decode_fh,
+};
+
 static int shmem_parse_options(char *options, int *mode, uid_t *uid,
 	gid_t *gid, unsigned long *blocks, unsigned long *inodes,
 	int *policy, nodemask_t *policy_nodes)
@@ -2128,6 +2209,7 @@ static int shmem_fill_super(struct super_block *sb,
 				&inodes, &policy, &policy_nodes))
 			return -EINVAL;
 	}
+	sb->s_export_op = &shmem_export_ops;
 #else
 	sb->s_flags |= MS_NOUSER;
 #endif
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index c946bf4..f5664c5 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -35,7 +35,7 @@ shmem_get_acl(struct inode *inode, int type)
 }
 
 /**
- * shmem_get_acl  -  generic_acl_operations->setacl() operation
+ * shmem_set_acl  -  generic_acl_operations->setacl() operation
 */
 static void
 shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -86,7 +86,6 @@
  *	All object allocations for a node occur from node specific slab lists.
  */
 
-#include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/poison.h>
@@ -1107,15 +1106,18 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 	int nodeid = slabp->nodeid;
 	struct kmem_list3 *l3;
 	struct array_cache *alien = NULL;
+	int node;
+
+	node = numa_node_id();
 
 	/*
 	 * Make sure we are not freeing a object from another node to the array
 	 * cache on this cpu.
 	 */
-	if (likely(slabp->nodeid == numa_node_id()))
+	if (likely(slabp->nodeid == node))
 		return 0;
 
-	l3 = cachep->nodelists[numa_node_id()];
+	l3 = cachep->nodelists[node];
 	STATS_INC_NODEFREES(cachep);
 	if (l3->alien && l3->alien[nodeid]) {
 		alien = l3->alien[nodeid];
@@ -1326,7 +1328,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
 {
 	struct kmem_list3 *ptr;
 
-	BUG_ON(cachep->nodelists[nodeid] != list);
 	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
 	BUG_ON(!ptr);
 
@@ -1353,6 +1354,7 @@ void __init kmem_cache_init(void)
 	struct cache_names *names;
 	int i;
 	int order;
+	int node;
 
 	for (i = 0; i < NUM_INIT_LISTS; i++) {
 		kmem_list3_init(&initkmem_list3[i]);
@@ -1387,12 +1389,14 @@ void __init kmem_cache_init(void)
 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
 	 */
 
+	node = numa_node_id();
+
 	/* 1) create the cache_cache */
 	INIT_LIST_HEAD(&cache_chain);
 	list_add(&cache_cache.next, &cache_chain);
 	cache_cache.colour_off = cache_line_size();
 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
-	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
+	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
 
 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
 					cache_line_size());
@@ -1497,19 +1501,18 @@ void __init kmem_cache_init(void)
 	}
 	/* 5) Replace the bootstrap kmem_list3's */
 	{
-		int node;
+		int nid;
+
 		/* Replace the static kmem_list3 structures for the boot cpu */
-		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
-			  numa_node_id());
+		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
 
-		for_each_online_node(node) {
+		for_each_online_node(nid) {
 			init_list(malloc_sizes[INDEX_AC].cs_cachep,
-				  &initkmem_list3[SIZE_AC + node], node);
+				  &initkmem_list3[SIZE_AC + nid], nid);
 
 			if (INDEX_AC != INDEX_L3) {
 				init_list(malloc_sizes[INDEX_L3].cs_cachep,
-					  &initkmem_list3[SIZE_L3 + node],
-					  node);
+					  &initkmem_list3[SIZE_L3 + nid], nid);
 			}
 		}
 	}
@@ -2919,6 +2922,9 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 	int batchcount;
 	struct kmem_list3 *l3;
 	struct array_cache *ac;
+	int node;
+
+	node = numa_node_id();
 
 	check_irq_off();
 	ac = cpu_cache_get(cachep);
@@ -2932,7 +2938,7 @@ retry:
 		 */
 		batchcount = BATCHREFILL_LIMIT;
 	}
-	l3 = cachep->nodelists[numa_node_id()];
+	l3 = cachep->nodelists[node];
 
 	BUG_ON(ac->avail > 0 || !l3);
 	spin_lock(&l3->list_lock);
@@ -2962,7 +2968,7 @@ retry:
 			STATS_SET_HIGH(cachep);
 
 			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
-							numa_node_id());
+							node);
 		}
 		check_slabp(cachep, slabp);
@@ -2981,7 +2987,7 @@ alloc_done:
 
 	if (unlikely(!ac->avail)) {
 		int x;
-		x = cache_grow(cachep, flags, numa_node_id());
+		x = cache_grow(cachep, flags, node);
 
 		/* cache_grow can reenable interrupts, then ac could change.
 		 */
 		ac = cpu_cache_get(cachep);
@@ -3488,22 +3494,25 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 }
 
+#ifdef CONFIG_DEBUG_SLAB
 void *__kmalloc(size_t size, gfp_t flags)
 {
-#ifndef CONFIG_DEBUG_SLAB
-	return __do_kmalloc(size, flags, NULL);
-#else
 	return __do_kmalloc(size, flags, __builtin_return_address(0));
-#endif
 }
 EXPORT_SYMBOL(__kmalloc);
 
-#ifdef CONFIG_DEBUG_SLAB
 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
 {
 	return __do_kmalloc(size, flags, caller);
 }
 EXPORT_SYMBOL(__kmalloc_track_caller);
+
+#else
+void *__kmalloc(size_t size, gfp_t flags)
+{
+	return __do_kmalloc(size, flags, NULL);
+}
+EXPORT_SYMBOL(__kmalloc);
 #endif
 
 /**
diff --git a/mm/truncate.c b/mm/truncate.c
index f4edbc1..e07b1e6 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -96,7 +96,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 		return 0;
 
 	ret = remove_mapping(mapping, page);
-	ClearPageUptodate(page);
 
 	return ret;
 }
@@ -302,7 +301,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	if (page->mapping != mapping)
 		return 0;
 
-	if (PagePrivate(page) && !try_to_release_page(page, 0))
+	if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
 		return 0;
 
 	write_lock_irq(&mapping->tree_lock);
@@ -396,6 +395,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		pagevec_release(&pvec);
 		cond_resched();
 	}
+	WARN_ON_ONCE(ret);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/mm/util.c b/mm/util.c
--- a/mm/util.c
+++ b/mm/util.c
@@ -11,7 +11,7 @@
 */
 void *__kzalloc(size_t size, gfp_t flags)
 {
-	void *ret = ____kmalloc(size, flags);
+	void *ret = kmalloc_track_caller(size, flags);
 	if (ret)
 		memset(ret, 0, size);
 	return ret;
@@ -33,7 +33,7 @@ char *kstrdup(const char *s, gfp_t gfp)
 		return NULL;
 
 	len = strlen(s) + 1;
-	buf = ____kmalloc(len, gfp);
+	buf = kmalloc_track_caller(len, gfp);
 	if (buf)
 		memcpy(buf, s, len);
 	return buf;
@@ -51,7 +51,7 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
 {
 	void *p;
 
-	p = ____kmalloc(len, gfp);
+	p = kmalloc_track_caller(len, gfp);
 	if (p)
 		memcpy(p, src, len);
 	return p;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 750ab6e..1133dd3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -428,8 +428,11 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	if (array_size > PAGE_SIZE) {
 		pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
 		area->flags |= VM_VPAGES;
-	} else
-		pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
+	} else {
+		pages = kmalloc_node(array_size,
+				(gfp_mask & ~(__GFP_HIGHMEM | __GFP_ZERO)),
+				node);
+	}
 	area->pages = pages;
 	if (!area->pages) {
 		remove_vm_area(area->addr);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eca7031..f05527b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -378,6 +378,12 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 	return PAGE_CLEAN;
 }
 
+/*
+ * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
+ * someone else has a ref on the page, abort and return 0.  If it was
+ * successfully detached, return 1.  Assumes the caller has a single ref on
+ * this page.
+ */
 int remove_mapping(struct address_space *mapping, struct page *page)
 {
 	BUG_ON(!PageLocked(page));
@@ -1053,7 +1059,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 
 		/* Take a nap, wait for some writeback to complete */
 		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
-			blk_congestion_wait(WRITE, HZ/10);
+			congestion_wait(WRITE, HZ/10);
 	}
 	/* top priority shrink_caches still had more to do? don't OOM, then */
 	if (!sc.all_unreclaimable)
@@ -1208,7 +1214,7 @@ scan:
 		 * another pass across the zones.
 		 */
 		if (total_scanned && priority < DEF_PRIORITY - 2)
-			blk_congestion_wait(WRITE, HZ/10);
+			congestion_wait(WRITE, HZ/10);
 
 		/*
 		 * We do this so kswapd doesn't build up large priorities for
@@ -1452,7 +1458,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 			goto out;
 
 		if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
-			blk_congestion_wait(WRITE, HZ / 10);
+			congestion_wait(WRITE, HZ / 10);
 	}
 
 	lru_pages = 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a2b6a9f..45b124e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -9,7 +9,6 @@
  *  Christoph Lameter <christoph@lameter.com>
  */
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/cpu.h>
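
Finally, the mm/util.c hunks swap ____kmalloc() for kmalloc_track_caller() so that, with slab debugging enabled, an allocation made through a thin wrapper is attributed to the wrapper's caller rather than to the wrapper itself. A sketch of the same pattern for a hypothetical wrapper; my_dup_buffer() is an assumed name, not part of this diff:

#include <linux/slab.h>
#include <linux/string.h>

void *my_dup_buffer(const void *src, size_t len, gfp_t gfp)
{
	/*
	 * kmalloc_track_caller() records __builtin_return_address(0)
	 * evaluated here -- i.e. whoever called my_dup_buffer() -- as the
	 * allocation site.  A plain kmalloc() would blame my_dup_buffer()
	 * itself, hiding the real leak site in slab debug output.
	 */
	void *p = kmalloc_track_caller(len, gfp);

	if (p)
		memcpy(p, src, len);
	return p;
}
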