diff options
-rw-r--r-- | arch/ia64/include/asm/siginfo.h | 1 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 19 | ||||
-rw-r--r-- | fs/hugetlbfs/inode.c | 15 | ||||
-rw-r--r-- | fs/signalfd.c | 10 | ||||
-rw-r--r-- | include/linux/hugetlb.h | 17 | ||||
-rw-r--r-- | include/linux/migrate.h | 16 | ||||
-rw-r--r-- | include/linux/mm.h | 12 | ||||
-rw-r--r-- | include/linux/signalfd.h | 3 | ||||
-rw-r--r-- | mm/hugetlb.c | 233 | ||||
-rw-r--r-- | mm/memory-failure.c | 175 | ||||
-rw-r--r-- | mm/memory.c | 3 | ||||
-rw-r--r-- | mm/migrate.c | 234 | ||||
-rw-r--r-- | mm/rmap.c | 25 |
13 files changed, 596 insertions, 167 deletions
diff --git a/arch/ia64/include/asm/siginfo.h b/arch/ia64/include/asm/siginfo.h index 118d429..c8fcaa2 100644 --- a/arch/ia64/include/asm/siginfo.h +++ b/arch/ia64/include/asm/siginfo.h @@ -62,6 +62,7 @@ typedef struct siginfo { int _imm; /* immediate value for "break" */ unsigned int _flags; /* see below */ unsigned long _isr; /* isr */ + short _addr_lsb; /* lsb of faulting address */ } _sigfault; /* SIGPOLL */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 79b0b37..852b319 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -11,6 +11,7 @@ #include <linux/kprobes.h> /* __kprobes, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ +#include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ @@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk) + struct task_struct *tsk, int fault) { + unsigned lsb = 0; siginfo_t info; info.si_signo = si_signo; info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + info.si_addr_lsb = lsb; force_sig_info(si_signo, &info, tsk); } @@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code | (address >= TASK_SIZE); tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); return; } @@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, tsk->thread.trap_no = 14; #ifdef CONFIG_MEMORY_FAILURE - if (fault & VM_FAULT_HWPOISON) { + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { printk(KERN_ERR "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk); + force_sig_info_fault(SIGBUS, code, address, tsk, fault); } static noinline void @@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else BUG(); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 113eba3..a14328d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -31,6 +31,7 @@ #include <linux/statfs.h> #include <linux/security.h> #include <linux/magic.h> +#include <linux/migrate.h> #include <asm/uaccess.h> @@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page) return 0; } +static int hugetlbfs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + int rc; + + rc = migrate_huge_page_move_mapping(mapping, newpage, page); + if (rc) + return rc; + migrate_page_copy(newpage, page); + + return 0; +} + static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); @@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, .set_page_dirty = hugetlbfs_set_page_dirty, + .migratepage = hugetlbfs_migrate_page, }; diff --git a/fs/signalfd.c b/fs/signalfd.c index 7404730..492465b 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, #ifdef __ARCH_SI_TRAPNO err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); #endif +#ifdef BUS_MCEERR_AO + /* + * Other callers might not initialize the si_lsb field, + * so check explicitly for the right codes here. + */ + if (kinfo->si_code == BUS_MCEERR_AR || + kinfo->si_code == BUS_MCEERR_AO) + err |= __put_user((short) kinfo->si_addr_lsb, + &uinfo->ssi_addr_lsb); +#endif break; case __SI_CHLD: err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f479700..943c76b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -43,7 +43,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, int acctflags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); -void __isolate_hwpoisoned_huge_page(struct page *page); +int dequeue_hwpoisoned_huge_page(struct page *page); +void copy_huge_page(struct page *dst, struct page *src); extern unsigned long hugepages_treat_as_movable; extern const unsigned long hugetlb_zero, hugetlb_infinity; @@ -101,7 +102,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define huge_pte_offset(mm, address) 0 -#define __isolate_hwpoisoned_huge_page(page) 0 +#define dequeue_hwpoisoned_huge_page(page) 0 +static inline void copy_huge_page(struct page *dst, struct page *src) +{ +} #define hugetlb_change_protection(vma, address, end, newprot) @@ -228,6 +232,8 @@ struct huge_bootmem_page { struct hstate *hstate; }; +struct page *alloc_huge_page_node(struct hstate *h, int nid); + /* arch callback */ int __init alloc_bootmem_huge_page(struct hstate *h); @@ -301,8 +307,14 @@ static inline struct hstate *page_hstate(struct page *page) return size_to_hstate(PAGE_SIZE << compound_order(page)); } +static inline unsigned hstate_index_to_shift(unsigned index) +{ + return hstates[index].order + PAGE_SHIFT; +} + #else struct hstate {}; +#define alloc_huge_page_node(h, nid) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_vma(v) NULL @@ -317,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1; } +#define hstate_index_to_shift(index) 0 #endif #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 7238231..085527f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, int offlining); +extern int migrate_huge_pages(struct list_head *l, new_page_t x, + unsigned long private, int offlining); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -23,12 +25,17 @@ extern int migrate_prep_local(void); extern int migrate_vmas(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, unsigned long flags); +extern void migrate_page_copy(struct page *newpage, struct page *page); +extern int migrate_huge_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page); #else #define PAGE_MIGRATION 0 static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, int offlining) { return -ENOSYS; } +static inline int migrate_huge_pages(struct list_head *l, new_page_t x, + unsigned long private, int offlining) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } @@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm, return -ENOSYS; } +static inline void migrate_page_copy(struct page *newpage, + struct page *page) {} + +static inline int migrate_huge_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + return -ENOSYS; +} + /* Possible settings for the migrate_page() method in address_operations */ #define migrate_page NULL #define fail_migrate_page NULL diff --git a/include/linux/mm.h b/include/linux/mm.h index 7687228..a4c66846 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_SIGBUS 0x0002 #define VM_FAULT_MAJOR 0x0004 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ -#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */ +#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ +#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON) +#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ + +#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ + VM_FAULT_HWPOISON_LARGE) + +/* Encode hstate index for a hwpoisoned large page */ +#define VM_FAULT_SET_HINDEX(x) ((x) << 12) +#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) /* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index b363b91..3ff4961 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -33,6 +33,7 @@ struct signalfd_siginfo { __u64 ssi_utime; __u64 ssi_stime; __u64 ssi_addr; + __u16 ssi_addr_lsb; /* * Pad strcture to 128 bytes. Remember to update the @@ -43,7 +44,7 @@ struct signalfd_siginfo { * comes out of a read(2) and we really don't want to have * a compat on read(2). */ - __u8 __pad[48]; + __u8 __pad[46]; }; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c032738..96991de 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page, } } -static void copy_gigantic_page(struct page *dst, struct page *src, +static void copy_user_gigantic_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); struct page *dst_base = dst; struct page *src_base = src; - might_sleep(); + for (i = 0; i < pages_per_huge_page(h); ) { cond_resched(); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); @@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src, src = mem_map_next(src, src_base, i); } } -static void copy_huge_page(struct page *dst, struct page *src, + +static void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_gigantic_page(dst, src, addr, vma); + copy_user_gigantic_page(dst, src, addr, vma); return; } @@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src, } } +static void copy_gigantic_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page(h); ) { + cond_resched(); + copy_highpage(dst, src); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_huge_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { + copy_gigantic_page(dst, src); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page(h); i++) { + cond_resched(); + copy_highpage(dst + i, src + i); + } +} + static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); @@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + if (list_empty(&h->hugepage_freelists[nid])) + return NULL; + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); + set_page_refcounted(page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + return page; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - int nid; struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; @@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - nid = zone_to_nid(zone); - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - - if (!avoid_reserve) - decrement_hugepage_resv_vma(h, vma); - - break; + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + page = dequeue_huge_page_node(h, zone_to_nid(zone)); + if (page) { + if (!avoid_reserve) + decrement_hugepage_resv_vma(h, vma); + break; + } } } err: @@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, return ret; } -static struct page *alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) +static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; - unsigned int nid; + unsigned int r_nid; if (h->order >= MAX_ORDER) return NULL; @@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } spin_unlock(&hugetlb_lock); - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| - __GFP_REPEAT|__GFP_NOWARN, - huge_page_order(h)); + if (nid == NUMA_NO_NODE) + page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + __GFP_REPEAT|__GFP_NOWARN, + huge_page_order(h)); + else + page = alloc_pages_exact_node(nid, + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); @@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, spin_lock(&hugetlb_lock); if (page) { - /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the buddy allocator's reference. - */ - put_page_testzero(page); - VM_BUG_ON(page_count(page)); - nid = page_to_nid(page); + r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); /* * We incremented the global counters already */ - h->nr_huge_pages_node[nid]++; - h->surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[r_nid]++; + h->surplus_huge_pages_node[r_nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { h->nr_huge_pages--; @@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } /* + * This allocation function is useful in the context where vma is irrelevant. + * E.g. soft-offlining uses this function because it only cares physical + * address of error page. + */ +struct page *alloc_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + spin_lock(&hugetlb_lock); + page = dequeue_huge_page_node(h, nid); + spin_unlock(&hugetlb_lock); + + if (!page) + page = alloc_buddy_huge_page(h, nid); + + return page; +} + +/* * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ @@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(h, NULL, 0); - if (!page) { + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + if (!page) /* * We were not able to allocate enough pages to * satisfy the entire reservation so we free what * we've allocated so far. */ - spin_lock(&hugetlb_lock); - needed = 0; goto free; - } list_add(&page->lru, &surplus_list); } @@ -908,31 +964,31 @@ retry: needed += allocated; h->resv_huge_pages += delta; ret = 0; -free: + + spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) break; list_del(&page->lru); + /* + * This page is now managed by the hugetlb allocator and has + * no users -- drop the buddy allocator's reference. + */ + put_page_testzero(page); + VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ +free: if (!list_empty(&surplus_list)) { - spin_unlock(&hugetlb_lock); list_for_each_entry_safe(page, tmp, &surplus_list, lru) { list_del(&page->lru); - /* - * The page has a reference count of zero already, so - * call free_huge_page directly instead of using - * put_page. This must be done with hugetlb_lock - * unlocked which is safe because free_huge_page takes - * hugetlb_lock before deciding how to free the page. - */ - free_huge_page(page); + put_page(page); } - spin_lock(&hugetlb_lock); } + spin_lock(&hugetlb_lock); return ret; } @@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, spin_unlock(&hugetlb_lock); if (!page) { - page = alloc_buddy_huge_page(h, vma, addr); + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_SIGBUS); } } - set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); vma_commit_reservation(h, vma, addr); @@ -2153,6 +2208,19 @@ nomem: return -ENOMEM; } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) { + return 1; + } else + return 0; +} + static int is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; @@ -2383,7 +2451,7 @@ retry_avoidcopy: if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - copy_huge_page(new_page, old_page, address, vma); + copy_user_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); /* @@ -2515,22 +2583,20 @@ retry: hugepage_add_new_anon_rmap(page, vma, address); } } else { + /* + * If memory error occurs between mmap() and fault, some process + * don't have hwpoisoned swap entry for errored virtual address. + * So we need to block hugepage fault by PG_hwpoison bit check. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON | + VM_FAULT_SET_HINDEX(h - hstates); + goto backout_unlocked; + } page_dup_rmap(page); } /* - * Since memory error handler replaces pte into hwpoison swap entry - * at the time of error handling, a process which reserved but not have - * the mapping to the error hugepage does not have hwpoison swap entry. - * So we need to block accesses from such a process by checking - * PG_hwpoison bit here. - */ - if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON; - goto backout_unlocked; - } - - /* * If we are going to COW a private mapping later, we examine the * pending reservations for this page now. This will ensure that * any allocations necessary to record that reservation occur outside @@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, address); if (ptep) { entry = huge_ptep_get(ptep); - if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) - return VM_FAULT_HWPOISON; + if (unlikely(is_hugetlb_entry_migration(entry))) { + migration_entry_wait(mm, (pmd_t *)ptep, address); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(h - hstates); } ptep = huge_pte_alloc(mm, address, huge_page_size(h)); @@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_acct_memory(h, -(chg - freed)); } +#ifdef CONFIG_MEMORY_FAILURE + +/* Should be called in hugetlb_lock */ +static int is_hugepage_on_freelist(struct page *hpage) +{ + struct page *page; + struct page *tmp; + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + + list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) + if (page == hpage) + return 1; + return 0; +} + /* * This function is called from memory failure code. * Assume the caller holds page lock of the head page. */ -void __isolate_hwpoisoned_huge_page(struct page *hpage) +int dequeue_hwpoisoned_huge_page(struct page *hpage) { struct hstate *h = page_hstate(hpage); int nid = page_to_nid(hpage); + int ret = -EBUSY; spin_lock(&hugetlb_lock); - list_del(&hpage->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; + if (is_hugepage_on_freelist(hpage)) { + list_del(&hpage->lru); + set_page_refcounted(hpage); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + ret = 0; + } spin_unlock(&hugetlb_lock); + return ret; } +#endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 757f6b0..44a8cef 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -7,21 +7,26 @@ * Free Software Foundation. * * High level machine check handler. Handles pages reported by the - * hardware as being corrupted usually due to a 2bit ECC memory or cache + * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. + * + * In addition there is a "soft offline" entry point that allows stop using + * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronous to other VM - * users, because memory failures could happen anytime and anywhere, - * possibly violating some of their assumptions. This is why this code - * has to be extremely careful. Generally it tries to use normal locking - * rules, as in get the standard locks, even if that means the - * error handling takes potentially a long time. - * - * The operation to map back from RMAP chains to processes has to walk - * the complete process list and has non linear complexity with the number - * mappings. In short it can be quite slow. But since memory corruptions - * are rare we hope to get away with this. + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means + * the error handling takes potentially a long time. + * + * There are several operations here with exponential complexity because + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and + * has non linear complexity with the number. But since memory corruptions + * are rare we hope to get away with this. This avoids impacting the core + * VM. */ /* @@ -30,7 +35,6 @@ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages * - pass bad pages to kdump next kernel */ -#define DEBUG 1 /* remove me in 2.6.34 */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> @@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p) return 0; /* - * page_mapping() does not accept slab page + * page_mapping() does not accept slab pages. */ if (PageSlab(p)) return -EINVAL; @@ -268,7 +272,7 @@ struct to_kill { struct list_head nd; struct task_struct *tsk; unsigned long addr; - unsigned addr_valid:1; + char addr_valid; }; /* @@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * a SIGKILL because the error is not contained anymore. */ if (tk->addr == -EFAULT) { - pr_debug("MCE: Unable to find user space address %lx in %s\n", + pr_info("MCE: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; } @@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { - pr_debug("MCE %#lx: failed to release buffers\n", pfn); + pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { ret = RECOVERED; } @@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) * Issues: * - Error on hugepage is contained in hugepage unit (not in raw page unit.) * To narrow down kill region to one page, we need to break up pmd. - * - To support soft-offlining for hugepage, we need to support hugepage - * migration. */ static int me_huge_page(struct page *p, unsigned long pfn) { + int res = 0; struct page *hpage = compound_head(p); /* * We can safely recover from error on free or reserved (i.e. @@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) * so there is no race between isolation and mapping/unmapping. */ if (!(page_mapping(hpage) || PageAnon(hpage))) { - __isolate_hwpoisoned_huge_page(hpage); - return RECOVERED; + res = dequeue_hwpoisoned_huge_page(hpage); + if (!res) + return RECOVERED; } return DELAYED; } @@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p, return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } -#define N_UNMAP_TRIES 5 - /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); int ret; - int i; int kill = 1; struct page *hpage = compound_head(p); @@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill); - /* - * try_to_unmap can fail temporarily due to races. - * Try a few times (RED-PEN better strategy?) - */ - for (i = 0; i < N_UNMAP_TRIES; i++) { - ret = try_to_unmap(hpage, ttu); - if (ret == SWAP_SUCCESS) - break; - pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); - } - + ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); @@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: * prep_new_page() will be the gate keeper. - * 2) it's part of a non-compound high order page. + * 2) it's a free hugepage, which is also safe: + * an affected hugepage will be dequeued from hugepage freelist, + * so there's no concern about reusing it ever after. + * 3) it's part of a non-compound high order page. * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been * used and will be freed some time later. @@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) if (is_free_buddy_page(p)) { action_result(pfn, "free buddy", DELAYED); return 0; + } else if (PageHuge(hpage)) { + /* + * Check "just unpoisoned", "filter hit", and + * "race with other subpage." + */ + lock_page_nosync(hpage); + if (!PageHWPoison(hpage) + || (hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &mce_bad_pages); + return 0; + } + set_page_hwpoison_huge_page(hpage); + res = dequeue_hwpoisoned_huge_page(hpage); + action_result(pfn, "free huge", + res ? IGNORED : DELAYED); + unlock_page(hpage); + return res; } else { action_result(pfn, "high order kernel", IGNORED); return -EBUSY; @@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); + pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); return 0; } nr_pages = 1 << compound_order(page); if (!get_page_unless_zero(page)) { + /* + * Since HWPoisoned hugepage should have non-zero refcount, + * race between memory failure and unpoison seems to happen. + * In such case unpoison fails and memory failure runs + * to the end. + */ + if (PageHuge(page)) { + pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + return 0; + } if (TestClearPageHWPoison(p)) atomic_long_sub(nr_pages, &mce_bad_pages); - pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); + pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } @@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn) * the free buddy page pool. */ if (TestClearPageHWPoison(page)) { - pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); + pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); atomic_long_sub(nr_pages, &mce_bad_pages); freeit = 1; + if (PageHuge(page)) + clear_page_hwpoison_huge_page(page); } - if (PageHuge(p)) - clear_page_hwpoison_huge_page(page); unlock_page(page); put_page(page); @@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { int nid = page_to_nid(p); - return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + nid); + else + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) * was free. */ set_migratetype_isolate(p); + /* + * When the target page is a free hugepage, just remove it + * from free hugepage list. + */ if (!get_page_unless_zero(compound_head(p))) { - if (is_free_buddy_page(p)) { - pr_debug("get_any_page: %#lx free buddy page\n", pfn); + if (PageHuge(p)) { + pr_info("get_any_page: %#lx free huge page\n", pfn); + ret = dequeue_hwpoisoned_huge_page(compound_head(p)); + } else if (is_free_buddy_page(p)) { + pr_info("get_any_page: %#lx free buddy page\n", pfn); /* Set hwpoison bit while page is still isolated */ SetPageHWPoison(p); ret = 0; } else { - pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", + pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", pfn, p->flags); ret = -EIO; } @@ -1235,6 +1268,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) return ret; } +static int soft_offline_huge_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + + if (PageHWPoison(hpage)) { + put_page(hpage); + pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); + return -EBUSY; + } + + /* Keep page count to indicate a given hugepage is isolated. */ + + list_add(&hpage->lru, &pagelist); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + if (ret) { + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + return ret; + } +done: + if (!PageHWPoison(hpage)) + atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + /* keep elevated page count for bad page */ + return ret; +} + /** * soft_offline_page - Soft offline a page. * @page: page to offline @@ -1262,6 +1334,9 @@ int soft_offline_page(struct page *page, int flags) int ret; unsigned long pfn = page_to_pfn(page); + if (PageHuge(page)) + return soft_offline_huge_page(page, flags); + ret = get_any_page(page, pfn, flags); if (ret < 0) return ret; @@ -1288,7 +1363,7 @@ int soft_offline_page(struct page *page, int flags) goto done; } if (!PageLRU(page)) { - pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", + pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags); return -EIO; } @@ -1302,7 +1377,7 @@ int soft_offline_page(struct page *page, int flags) if (PageHWPoison(page)) { unlock_page(page); put_page(page); - pr_debug("soft offline: %#lx page already poisoned\n", pfn); + pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } @@ -1323,7 +1398,7 @@ int soft_offline_page(struct page *page, int flags) put_page(page); if (ret == 1) { ret = 0; - pr_debug("soft_offline: %#lx: invalidated\n", pfn); + pr_info("soft_offline: %#lx: invalidated\n", pfn); goto done; } @@ -1339,13 +1414,13 @@ int soft_offline_page(struct page *page, int flags) list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); if (ret) { - pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) ret = -EIO; } } else { - pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pfn, ret, page_count(page), page->flags); } if (ret) diff --git a/mm/memory.c b/mm/memory.c index 98b58fe..af82741 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; if (ret & - (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) + (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| + VM_FAULT_SIGBUS)) return i ? i : -EFAULT; BUG(); } diff --git a/mm/migrate.c b/mm/migrate.c index 38e7cad..f8c9bcc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -32,6 +32,7 @@ #include <linux/security.h> #include <linux/memcontrol.h> #include <linux/syscalls.h> +#include <linux/hugetlb.h> #include <linux/gfp.h> #include "internal.h" @@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte_t *ptep, pte; spinlock_t *ptl; - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; + if (unlikely(PageHuge(new))) { + ptep = huge_pte_offset(mm, addr); + if (!ptep) + goto out; + ptl = &mm->page_table_lock; + } else { + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; - ptep = pte_offset_map(pmd, addr); + ptep = pte_offset_map(pmd, addr); - if (!is_swap_pte(*ptep)) { - pte_unmap(ptep); - goto out; - } + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + goto out; + } + + ptl = pte_lockptr(mm, pmd); + } - ptl = pte_lockptr(mm, pmd); spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) @@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); +#ifdef CONFIG_HUGETLB_PAGE + if (PageHuge(new)) + pte = pte_mkhuge(pte); +#endif flush_cache_page(vma, addr, pte_pfn(pte)); set_pte_at(mm, addr, ptep, pte); - if (PageAnon(new)) + if (PageHuge(new)) { + if (PageAnon(new)) + hugepage_add_anon_rmap(new, vma, addr); + else + page_dup_rmap(new); + } else if (PageAnon(new)) page_add_anon_rmap(new, vma, addr); else page_add_file_rmap(new); @@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping, } /* + * The expected number of remaining references is the same as that + * of migrate_page_move_mapping(). + */ +int migrate_huge_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + int expected_count; + void **pslot; + + if (!mapping) { + if (page_count(page) != 1) + return -EAGAIN; + return 0; + } + + spin_lock_irq(&mapping->tree_lock); + + pslot = radix_tree_lookup_slot(&mapping->page_tree, + page_index(page)); + + expected_count = 2 + page_has_private(page); + if (page_count(page) != expected_count || + (struct page *)radix_tree_deref_slot(pslot) != page) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + if (!page_freeze_refs(page, expected_count)) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + get_page(newpage); + + radix_tree_replace_slot(pslot, newpage); + + page_unfreeze_refs(page, expected_count); + + __put_page(page); + + spin_unlock_irq(&mapping->tree_lock); + return 0; +} + +/* * Copy the page to its new location */ -static void migrate_page_copy(struct page *newpage, struct page *page) +void migrate_page_copy(struct page *newpage, struct page *page) { - copy_highpage(newpage, page); + if (PageHuge(page)) + copy_huge_page(newpage, page); + else + copy_highpage(newpage, page); if (PageError(page)) SetPageError(newpage); @@ -724,6 +790,92 @@ move_newpage: } /* + * Counterpart of unmap_and_move_page() for hugepage migration. + * + * This function doesn't wait the completion of hugepage I/O + * because there is no race between I/O and migration for hugepage. + * Note that currently hugepage I/O occurs only in direct I/O + * where no lock is held and PG_writeback is irrelevant, + * and writeback status of all subpages are counted in the reference + * count of the head page (i.e. if all subpages of a 2MB hugepage are + * under direct I/O, the reference of the head page is 512 and a bit more.) + * This means that when we try to migrate hugepage whose subpages are + * doing direct I/O, some references remain after try_to_unmap() and + * hugepage migration fails without data corruption. + * + * There is also no race when direct I/O is issued on the page under migration, + * because then pte is replaced with migration swap entry and direct I/O code + * will wait in the page fault for migration to complete. + */ +static int unmap_and_move_huge_page(new_page_t get_new_page, + unsigned long private, struct page *hpage, + int force, int offlining) +{ + int rc = 0; + int *result = NULL; + struct page *new_hpage = get_new_page(hpage, private, &result); + int rcu_locked = 0; + struct anon_vma *anon_vma = NULL; + + if (!new_hpage) + return -ENOMEM; + + rc = -EAGAIN; + + if (!trylock_page(hpage)) { + if (!force) + goto out; + lock_page(hpage); + } + + if (PageAnon(hpage)) { + rcu_read_lock(); + rcu_locked = 1; + + if (page_mapped(hpage)) { + anon_vma = page_anon_vma(hpage); + atomic_inc(&anon_vma->external_refcount); + } + } + + try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + + if (!page_mapped(hpage)) + rc = move_to_new_page(new_hpage, hpage, 1); + + if (rc) + remove_migration_ptes(hpage, hpage); + + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, + &anon_vma->lock)) { + int empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + if (empty) + anon_vma_free(anon_vma); + } + + if (rcu_locked) + rcu_read_unlock(); +out: + unlock_page(hpage); + + if (rc != -EAGAIN) { + list_del(&hpage->lru); + put_page(hpage); + } + + put_page(new_hpage); + + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(new_hpage); + } + return rc; +} + +/* * migrate_pages * * The function takes one list of pages to migrate and a function @@ -788,6 +940,52 @@ out: return nr_failed + retry; } +int migrate_huge_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private, int offlining) +{ + int retry = 1; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int rc; + + for (pass = 0; pass < 10 && retry; pass++) { + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + cond_resched(); + + rc = unmap_and_move_huge_page(get_new_page, + private, page, pass > 2, offlining); + + switch(rc) { + case -ENOMEM: + goto out; + case -EAGAIN: + retry++; + break; + case 0: + break; + default: + /* Permanent failure */ + nr_failed++; + break; + } + } + } + rc = 0; +out: + + list_for_each_entry_safe(page, page2, from, lru) + put_page(page); + + if (rc) + return rc; + + return nr_failed + retry; +} + #ifdef CONFIG_NUMA /* * Move a list of individual pages @@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page, } /** - * __page_set_anon_rmap - setup new anonymous rmap - * @page: the page to add the mapping to - * @vma: the vm area in which the mapping is added - * @address: the user virtual address mapped + * __page_set_anon_rmap - set up new anonymous rmap + * @page: Page to add to rmap + * @vma: VM area to add page to. + * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct page *page, @@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page, BUG_ON(!anon_vma); + if (PageAnon(page)) + return; + /* * If the page isn't exclusively mapped into this vma, * we must use the _oldest_ possible anon_vma for the * page mapping! */ - if (!exclusive) { - if (PageAnon(page)) - return; + if (!exclusive) anon_vma = anon_vma->root; - } else { - /* - * In this case, swapped-out-but-not-discarded swap-cache - * is remapped. So, no need to update page->mapping here. - * We convice anon_vma poitned by page->mapping is not obsolete - * because vma->anon_vma is necessary to be a family of it. - */ - if (PageAnon(page)) - return; - } anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; |