diff options
Diffstat (limited to 'mm/rmap.c')
-rw-r--r-- | mm/rmap.c | 195 |
1 files changed, 151 insertions, 44 deletions
@@ -24,8 +24,8 @@ * inode->i_alloc_sem (vmtruncate_range) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_lock - * anon_vma->lock + * mapping->i_mmap_mutex + * anon_vma->mutex * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) @@ -40,7 +40,7 @@ * * (code doesn't rely on that order so it could be switched around) * ->tasklist_lock - * anon_vma->lock (memory_failure, collect_procs_anon) + * anon_vma->mutex (memory_failure, collect_procs_anon) * pte map lock */ @@ -86,6 +86,29 @@ static inline struct anon_vma *anon_vma_alloc(void) static inline void anon_vma_free(struct anon_vma *anon_vma) { VM_BUG_ON(atomic_read(&anon_vma->refcount)); + + /* + * Synchronize against page_lock_anon_vma() such that + * we can safely hold the lock without the anon_vma getting + * freed. + * + * Relies on the full mb implied by the atomic_dec_and_test() from + * put_anon_vma() against the acquire barrier implied by + * mutex_trylock() from page_lock_anon_vma(). This orders: + * + * page_lock_anon_vma() VS put_anon_vma() + * mutex_trylock() atomic_dec_and_test() + * LOCK MB + * atomic_read() mutex_is_locked() + * + * LOCK should suffice since the actual taking of the lock must + * happen _before_ what follows. + */ + if (mutex_is_locked(&anon_vma->root->mutex)) { + anon_vma_lock(anon_vma); + anon_vma_unlock(anon_vma); + } + kmem_cache_free(anon_vma_cachep, anon_vma); } @@ -307,7 +330,7 @@ static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; - spin_lock_init(&anon_vma->lock); + mutex_init(&anon_vma->mutex); atomic_set(&anon_vma->refcount, 0); INIT_LIST_HEAD(&anon_vma->head); } @@ -320,12 +343,31 @@ void __init anon_vma_init(void) } /* - * Getting a lock on a stable anon_vma from a page off the LRU is - * tricky: page_lock_anon_vma rely on RCU to guard against the races. + * Getting a lock on a stable anon_vma from a page off the LRU is tricky! + * + * Since there is no serialization what so ever against page_remove_rmap() + * the best this function can do is return a locked anon_vma that might + * have been relevant to this page. + * + * The page might have been remapped to a different anon_vma or the anon_vma + * returned may already be freed (and even reused). + * + * In case it was remapped to a different anon_vma, the new anon_vma will be a + * child of the old anon_vma, and the anon_vma lifetime rules will therefore + * ensure that any anon_vma obtained from the page will still be valid for as + * long as we observe page_mapped() [ hence all those page_mapped() tests ]. + * + * All users of this function must be very careful when walking the anon_vma + * chain and verify that the page in question is indeed mapped in it + * [ something equivalent to page_mapped_in_vma() ]. + * + * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() + * that the anon_vma pointer from page->mapping is valid if there is a + * mapcount, we can dereference the anon_vma after observing those. */ -struct anon_vma *__page_lock_anon_vma(struct page *page) +struct anon_vma *page_get_anon_vma(struct page *page) { - struct anon_vma *anon_vma, *root_anon_vma; + struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); @@ -336,32 +378,100 @@ struct anon_vma *__page_lock_anon_vma(struct page *page) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - root_anon_vma = ACCESS_ONCE(anon_vma->root); - spin_lock(&root_anon_vma->lock); + if (!atomic_inc_not_zero(&anon_vma->refcount)) { + anon_vma = NULL; + goto out; + } /* * If this page is still mapped, then its anon_vma cannot have been - * freed. But if it has been unmapped, we have no security against - * the anon_vma structure being freed and reused (for another anon_vma: - * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot - * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting - * anon_vma->root before page_unlock_anon_vma() is called to unlock. + * freed. But if it has been unmapped, we have no security against the + * anon_vma structure being freed and reused (for another anon_vma: + * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() + * above cannot corrupt). */ - if (page_mapped(page)) - return anon_vma; + if (!page_mapped(page)) { + put_anon_vma(anon_vma); + anon_vma = NULL; + } +out: + rcu_read_unlock(); + + return anon_vma; +} + +/* + * Similar to page_get_anon_vma() except it locks the anon_vma. + * + * Its a little more complex as it tries to keep the fast path to a single + * atomic op -- the trylock. If we fail the trylock, we fall back to getting a + * reference like with page_get_anon_vma() and then block on the mutex. + */ +struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma = NULL; + struct anon_vma *root_anon_vma; + unsigned long anon_mapping; + + rcu_read_lock(); + anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); + if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + goto out; + if (!page_mapped(page)) + goto out; + + anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + root_anon_vma = ACCESS_ONCE(anon_vma->root); + if (mutex_trylock(&root_anon_vma->mutex)) { + /* + * If the page is still mapped, then this anon_vma is still + * its anon_vma, and holding the mutex ensures that it will + * not go away, see anon_vma_free(). + */ + if (!page_mapped(page)) { + mutex_unlock(&root_anon_vma->mutex); + anon_vma = NULL; + } + goto out; + } + + /* trylock failed, we got to sleep */ + if (!atomic_inc_not_zero(&anon_vma->refcount)) { + anon_vma = NULL; + goto out; + } + + if (!page_mapped(page)) { + put_anon_vma(anon_vma); + anon_vma = NULL; + goto out; + } + + /* we pinned the anon_vma, its safe to sleep */ + rcu_read_unlock(); + anon_vma_lock(anon_vma); + + if (atomic_dec_and_test(&anon_vma->refcount)) { + /* + * Oops, we held the last refcount, release the lock + * and bail -- can't simply use put_anon_vma() because + * we'll deadlock on the anon_vma_lock() recursion. + */ + anon_vma_unlock(anon_vma); + __put_anon_vma(anon_vma); + anon_vma = NULL; + } + + return anon_vma; - spin_unlock(&root_anon_vma->lock); out: rcu_read_unlock(); - return NULL; + return anon_vma; } void page_unlock_anon_vma(struct anon_vma *anon_vma) - __releases(&anon_vma->root->lock) - __releases(RCU) { anon_vma_unlock(anon_vma); - rcu_read_unlock(); } /* @@ -646,14 +756,14 @@ static int page_referenced_file(struct page *page, * The page lock not only makes sure that page->mapping cannot * suddenly be NULLified by truncation, it makes sure that the * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_lock. + * so we can safely take mapping->i_mmap_mutex. */ BUG_ON(!PageLocked(page)); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); /* - * i_mmap_lock does not stabilize mapcount at all, but mapcount + * i_mmap_mutex does not stabilize mapcount at all, but mapcount * is more likely to be accurate if we note it after spinning. */ mapcount = page_mapcount(page); @@ -675,7 +785,7 @@ static int page_referenced_file(struct page *page, break; } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return referenced; } @@ -719,7 +829,7 @@ int page_referenced(struct page *page, unlock_page(page); } out: - if (page_test_and_clear_young(page)) + if (page_test_and_clear_young(page_to_pfn(page))) referenced++; return referenced; @@ -762,7 +872,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) BUG_ON(PageAnon(page)); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); @@ -771,7 +881,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) ret += page_mkclean_one(page, vma, address); } } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -785,10 +895,8 @@ int page_mkclean(struct page *page) struct address_space *mapping = page_mapping(page); if (mapping) { ret = page_mkclean_file(mapping, page); - if (page_test_dirty(page)) { - page_clear_dirty(page, 1); + if (page_test_and_clear_dirty(page_to_pfn(page), 1)) ret = 1; - } } } @@ -914,7 +1022,7 @@ void do_page_add_anon_rmap(struct page *page, return; VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + /* address might be in next vma when migration races vma_adjust */ if (first) __page_set_anon_rmap(page, vma, address, exclusive); else @@ -981,10 +1089,9 @@ void page_remove_rmap(struct page *page) * not if it's in swapcache - there might be another pte slot * containing the swap entry, but page not yet written to swap. */ - if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { - page_clear_dirty(page, 1); + if ((!PageAnon(page) || PageSwapCache(page)) && + page_test_and_clear_dirty(page_to_pfn(page), 1)) set_page_dirty(page); - } /* * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED * and not charged by memcg for now. @@ -1122,7 +1229,7 @@ out_mlock: /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->lock or mapping->i_mmap_lock. + * we now hold anon_vma->mutex or mapping->i_mmap_mutex. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. @@ -1348,7 +1455,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_size = 0; unsigned int mapcount; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1394,7 +1501,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mapcount = page_mapcount(page); if (!mapcount) goto out; - cond_resched_lock(&mapping->i_mmap_lock); + cond_resched(); max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; if (max_nl_cursor == 0) @@ -1416,7 +1523,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) } vma->vm_private_data = (void *) max_nl_cursor; } - cond_resched_lock(&mapping->i_mmap_lock); + cond_resched(); max_nl_cursor += CLUSTER_SIZE; } while (max_nl_cursor <= max_nl_size); @@ -1428,7 +1535,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) vma->vm_private_data = NULL; out: - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -1547,7 +1654,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, if (!mapping) return ret; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1561,7 +1668,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, * never contain migration ptes. Decide what to do about this * limitation to linear when we need rmap_walk() on nonlinear. */ - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -1610,7 +1717,7 @@ void hugepage_add_anon_rmap(struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(!anon_vma); - BUG_ON(address < vma->vm_start || address >= vma->vm_end); + /* address might be in next vma when migration races vma_adjust */ first = atomic_inc_and_test(&page->_mapcount); if (first) __hugepage_set_anon_rmap(page, vma, address, 0); |