diff options
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu.c | 1021 |
1 files changed, 538 insertions, 483 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff606f5..c512f09 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -41,6 +41,7 @@ #include <asm/cmpxchg.h> #include <asm/io.h> #include <asm/vmx.h> +#include <asm/kvm_page_track.h> /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -259,7 +260,7 @@ static unsigned get_mmio_spte_access(u64 spte) } static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - pfn_t pfn, unsigned access) + kvm_pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { mark_mmio_spte(vcpu, sptep, gfn, access); @@ -311,11 +312,6 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_rmap_spte(u64 pte) -{ - return is_shadow_present_pte(pte); -} - static int is_last_spte(u64 pte, int level) { if (level == PT_PAGE_TABLE_LEVEL) @@ -325,7 +321,7 @@ static int is_last_spte(u64 pte, int level) return 0; } -static pfn_t spte_to_pfn(u64 pte) +static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; } @@ -540,7 +536,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) u64 old_spte = *sptep; bool ret = false; - WARN_ON(!is_rmap_spte(new_spte)); + WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); @@ -587,7 +583,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) */ static int mmu_spte_clear_track_bits(u64 *sptep) { - pfn_t pfn; + kvm_pfn_t pfn; u64 old_spte = *sptep; if (!spte_has_volatile_bits(old_spte)) @@ -595,7 +591,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) else old_spte = __update_clear_spte_slow(sptep, 0ull); - if (!is_rmap_spte(old_spte)) + if (!is_shadow_present_pte(old_spte)) return 0; pfn = spte_to_pfn(old_spte); @@ -781,57 +777,85 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } +static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, + gfn_t gfn, int count) +{ + struct kvm_lpage_info *linfo; + int i; + + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { + linfo = lpage_info_slot(gfn, slot, i); + linfo->disallow_lpage += count; + WARN_ON(linfo->disallow_lpage < 0); + } +} + +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +{ + update_gfn_disallow_lpage_count(slot, gfn, 1); +} + +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +{ + update_gfn_disallow_lpage_count(slot, gfn, -1); +} + static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; gfn_t gfn; - int i; + kvm->arch.indirect_shadow_pages++; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count += 1; - } - kvm->arch.indirect_shadow_pages++; + + /* the non-leaf shadow pages are keeping readonly. */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return kvm_slot_page_track_add_page(kvm, slot, gfn, + KVM_PAGE_TRACK_WRITE); + + kvm_mmu_gfn_disallow_lpage(slot, gfn); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; gfn_t gfn; - int i; + kvm->arch.indirect_shadow_pages--; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count -= 1; - WARN_ON(linfo->write_count < 0); - } - kvm->arch.indirect_shadow_pages--; + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return kvm_slot_page_track_remove_page(kvm, slot, gfn, + KVM_PAGE_TRACK_WRITE); + + kvm_mmu_gfn_allow_lpage(slot, gfn); } -static int has_wrprotected_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - int level) +static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); - return linfo->write_count; + return !!linfo->disallow_lpage; } - return 1; + return true; +} + +static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, + int level) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); } static int host_mapping_level(struct kvm *kvm, gfn_t gfn) @@ -851,6 +875,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } +static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, + bool no_dirty_log) +{ + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return false; + if (no_dirty_log && slot->dirty_bitmap) + return false; + + return true; +} + static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) @@ -858,21 +893,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) + if (!memslot_valid_for_gpte(slot, no_dirty_log)) slot = NULL; return slot; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); -} - -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, + bool *force_pt_level) { int host_level, level, max_level; + struct kvm_memory_slot *slot; + + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -882,43 +921,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu, large_gfn, level)) + if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot)) break; return level - 1; } /* - * Pte mapping structures: + * About rmap_head encoding: * - * If pte_list bit zero is zero, then pte_list point to the spte. - * - * If pte_list bit zero is one, (then pte_list & ~1) points to a struct + * If the bit zero of rmap_head->val is clear, then it points to the only spte + * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct * pte_list_desc containing more mappings. - * - * Returns the number of pte entries before the spte was added or zero if - * the spte was not added. - * + */ + +/* + * Returns the number of pointers in the rmap chain, not counting the new one. */ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, - unsigned long *pte_list) + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int i, count = 0; - if (!*pte_list) { + if (!rmap_head->val) { rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); - *pte_list = (unsigned long)spte; - } else if (!(*pte_list & 1)) { + rmap_head->val = (unsigned long)spte; + } else if (!(rmap_head->val & 1)) { rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); desc = mmu_alloc_pte_list_desc(vcpu); - desc->sptes[0] = (u64 *)*pte_list; + desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; - *pte_list = (unsigned long)desc | 1; + rmap_head->val = (unsigned long)desc | 1; ++count; } else { rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); - desc = (struct pte_list_desc *)(*pte_list & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { desc = desc->more; count += PTE_LIST_EXT; @@ -935,8 +973,9 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, } static void -pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, - int i, struct pte_list_desc *prev_desc) +pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, + struct pte_list_desc *desc, int i, + struct pte_list_desc *prev_desc) { int j; @@ -947,43 +986,43 @@ pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, if (j != 0) return; if (!prev_desc && !desc->more) - *pte_list = (unsigned long)desc->sptes[0]; + rmap_head->val = (unsigned long)desc->sptes[0]; else if (prev_desc) prev_desc->more = desc->more; else - *pte_list = (unsigned long)desc->more | 1; + rmap_head->val = (unsigned long)desc->more | 1; mmu_free_pte_list_desc(desc); } -static void pte_list_remove(u64 *spte, unsigned long *pte_list) +static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; struct pte_list_desc *prev_desc; int i; - if (!*pte_list) { + if (!rmap_head->val) { printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte); BUG(); - } else if (!(*pte_list & 1)) { + } else if (!(rmap_head->val & 1)) { rmap_printk("pte_list_remove: %p 1->0\n", spte); - if ((u64 *)*pte_list != spte) { + if ((u64 *)rmap_head->val != spte) { printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte); BUG(); } - *pte_list = 0; + rmap_head->val = 0; } else { rmap_printk("pte_list_remove: %p many->many\n", spte); - desc = (struct pte_list_desc *)(*pte_list & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) + for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(pte_list, - desc, i, - prev_desc); + pte_list_desc_remove_entry(rmap_head, + desc, i, prev_desc); return; } + } prev_desc = desc; desc = desc->more; } @@ -992,28 +1031,8 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list) } } -typedef void (*pte_list_walk_fn) (u64 *spte); -static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) -{ - struct pte_list_desc *desc; - int i; - - if (!*pte_list) - return; - - if (!(*pte_list & 1)) - return fn((u64 *)*pte_list); - - desc = (struct pte_list_desc *)(*pte_list & ~1ul); - while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) - fn(desc->sptes[i]); - desc = desc->more; - } -} - -static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, - struct kvm_memory_slot *slot) +static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { unsigned long idx; @@ -1021,10 +1040,8 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; } -/* - * Take gfn and return the reverse mapping to it. - */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp) +static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, + struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; @@ -1045,24 +1062,24 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { struct kvm_mmu_page *sp; - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); - return pte_list_add(vcpu, spte, rmapp); + rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + return pte_list_add(vcpu, spte, rmap_head); } static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_mmu_page *sp; gfn_t gfn; - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; sp = page_header(__pa(spte)); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, sp); - pte_list_remove(spte, rmapp); + rmap_head = gfn_to_rmap(kvm, gfn, sp); + pte_list_remove(spte, rmap_head); } /* @@ -1082,19 +1099,26 @@ struct rmap_iterator { * * Returns sptep if found, NULL otherwise. */ -static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) +static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, + struct rmap_iterator *iter) { - if (!rmap) + u64 *sptep; + + if (!rmap_head->val) return NULL; - if (!(rmap & 1)) { + if (!(rmap_head->val & 1)) { iter->desc = NULL; - return (u64 *)rmap; + sptep = (u64 *)rmap_head->val; + goto out; } - iter->desc = (struct pte_list_desc *)(rmap & ~1ul); + iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); iter->pos = 0; - return iter->desc->sptes[iter->pos]; + sptep = iter->desc->sptes[iter->pos]; +out: + BUG_ON(!is_shadow_present_pte(*sptep)); + return sptep; } /* @@ -1104,14 +1128,14 @@ static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) */ static u64 *rmap_get_next(struct rmap_iterator *iter) { + u64 *sptep; + if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { - u64 *sptep; - ++iter->pos; sptep = iter->desc->sptes[iter->pos]; if (sptep) - return sptep; + goto out; } iter->desc = iter->desc->more; @@ -1119,17 +1143,20 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ - return iter->desc->sptes[iter->pos]; + sptep = iter->desc->sptes[iter->pos]; + goto out; } } return NULL; +out: + BUG_ON(!is_shadow_present_pte(*sptep)); + return sptep; } -#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \ - for (_spte_ = rmap_get_first(*_rmap_, _iter_); \ - _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \ - _spte_ = rmap_get_next(_iter_)) +#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ + for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ + _spte_; _spte_ = rmap_get_next(_iter_)) static void drop_spte(struct kvm *kvm, u64 *sptep) { @@ -1187,14 +1214,15 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) return mmu_spte_update(sptep, spte); } -static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, +static bool __rmap_write_protect(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, bool pt_protect) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_write_protect(kvm, sptep, pt_protect); return flush; @@ -1211,13 +1239,13 @@ static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_clear_dirty(kvm, sptep); return flush; @@ -1234,13 +1262,13 @@ static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) +static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_set_dirty(kvm, sptep); return flush; @@ -1260,12 +1288,12 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; while (mask) { - rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); - __rmap_write_protect(kvm, rmapp, false); + rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ mask &= mask - 1; @@ -1285,12 +1313,12 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; while (mask) { - rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); - __rmap_clear_dirty(kvm, rmapp); + rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_clear_dirty(kvm, rmap_head); /* clear the first set bit */ mask &= mask - 1; @@ -1319,31 +1347,36 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn) { - struct kvm_memory_slot *slot; - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true); + rmap_head = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(kvm, rmap_head, true); } return write_protected; } -static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp) +static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); +} + +static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - while ((sptep = rmap_get_first(*rmapp, &iter))) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + while ((sptep = rmap_get_first(rmap_head, &iter))) { rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); drop_spte(kvm, sptep); @@ -1353,14 +1386,14 @@ static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp) return flush; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { - return kvm_zap_rmapp(kvm, rmapp); + return kvm_zap_rmapp(kvm, rmap_head); } -static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { @@ -1369,13 +1402,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, int need_flush = 0; u64 new_spte; pte_t *ptep = (pte_t *)data; - pfn_t new_pfn; + kvm_pfn_t new_pfn; WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); restart: - for_each_rmap_spte(rmapp, &iter, sptep) { + for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); @@ -1413,11 +1446,11 @@ struct slot_rmap_walk_iterator { /* output fields. */ gfn_t gfn; - unsigned long *rmap; + struct kvm_rmap_head *rmap; int level; /* private field. */ - unsigned long *end_rmap; + struct kvm_rmap_head *end_rmap; }; static void @@ -1476,7 +1509,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, unsigned long end, unsigned long data, int (*handler)(struct kvm *kvm, - unsigned long *rmapp, + struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, @@ -1520,7 +1553,8 @@ static int kvm_handle_hva_range(struct kvm *kvm, static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, + int (*handler)(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data)) @@ -1543,7 +1577,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); } -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { @@ -1553,18 +1587,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, BUG_ON(!shadow_accessed_mask); - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (*sptep & shadow_accessed_mask) { young = 1; clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } + } trace_kvm_age_page(gfn, level, slot, young); return young; } -static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { @@ -1580,11 +1615,12 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (*sptep & shadow_accessed_mask) { young = 1; break; } + } out: return young; } @@ -1593,14 +1629,14 @@ out: static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; sp = page_header(__pa(spte)); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); + rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); - kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); + kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); } @@ -1700,8 +1736,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte, int direct) +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct) { struct kvm_mmu_page *sp; @@ -1717,8 +1752,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, * this feature. See the comments in kvm_zap_obsolete_pages(). */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - sp->parent_ptes = 0; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; } @@ -1726,7 +1759,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { - pte_list_walk(&sp->parent_ptes, mark_unsync); + u64 *sptep; + struct rmap_iterator iter; + + for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { + mark_unsync(sptep); + } } static void mark_unsync(u64 *spte) @@ -1746,7 +1784,7 @@ static void mark_unsync(u64 *spte) static int nonpaging_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - return 1; + return 0; } static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) @@ -1786,6 +1824,13 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, return (pvec->nr == KVM_PAGE_ARRAY_NR); } +static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) +{ + --sp->unsync_children; + WARN_ON((int)sp->unsync_children < 0); + __clear_bit(idx, sp->unsync_child_bitmap); +} + static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { @@ -1795,8 +1840,10 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_page *child; u64 ent = sp->spt[i]; - if (!is_shadow_present_pte(ent) || is_large_pte(ent)) - goto clear_child_bitmap; + if (!is_shadow_present_pte(ent) || is_large_pte(ent)) { + clear_unsync_child_bit(sp, i); + continue; + } child = page_header(ent & PT64_BASE_ADDR_MASK); @@ -1805,38 +1852,34 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, return -ENOSPC; ret = __mmu_unsync_walk(child, pvec); - if (!ret) - goto clear_child_bitmap; - else if (ret > 0) + if (!ret) { + clear_unsync_child_bit(sp, i); + continue; + } else if (ret > 0) { nr_unsync_leaf += ret; - else + } else return ret; } else if (child->unsync) { nr_unsync_leaf++; if (mmu_pages_add(pvec, child, i)) return -ENOSPC; } else - goto clear_child_bitmap; - - continue; - -clear_child_bitmap: - __clear_bit(i, sp->unsync_child_bitmap); - sp->unsync_children--; - WARN_ON((int)sp->unsync_children < 0); + clear_unsync_child_bit(sp, i); } - return nr_unsync_leaf; } +#define INVALID_INDEX (-1) + static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { + pvec->nr = 0; if (!sp->unsync_children) return 0; - mmu_pages_add(pvec, sp, 0); + mmu_pages_add(pvec, sp, INVALID_INDEX); return __mmu_unsync_walk(sp, pvec); } @@ -1873,37 +1916,35 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, if ((_sp)->role.direct || (_sp)->role.invalid) {} else /* @sp->gfn should be write-protected at the call site */ -static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list, bool clear_unsync) +static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return 1; + return false; } - if (clear_unsync) - kvm_unlink_unsync_page(vcpu->kvm, sp); - - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return 1; + return false; } - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - return 0; + return true; } -static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) +static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, + struct list_head *invalid_list, + bool remote_flush, bool local_flush) { - LIST_HEAD(invalid_list); - int ret; - - ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); - if (ret) - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + if (!list_empty(invalid_list)) { + kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list); + return; + } - return ret; + if (remote_flush) + kvm_flush_remote_tlbs(vcpu->kvm); + else if (local_flush) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } #ifdef CONFIG_KVM_MMU_AUDIT @@ -1913,46 +1954,38 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - return __kvm_sync_page(vcpu, sp, invalid_list, true); + kvm_unlink_unsync_page(vcpu->kvm, sp); + return __kvm_sync_page(vcpu, sp, invalid_list); } /* @gfn should be write-protected at the call site */ -static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, + struct list_head *invalid_list) { struct kvm_mmu_page *s; - LIST_HEAD(invalid_list); - bool flush = false; + bool ret = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - kvm_unlink_unsync_page(vcpu->kvm, s); - if ((s->role.cr4_pae != !!is_pae(vcpu)) || - (vcpu->arch.mmu.sync_page(vcpu, s))) { - kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); - continue; - } - flush = true; + ret |= kvm_sync_page(vcpu, s, invalid_list); } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return ret; } struct mmu_page_path { - struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; - unsigned int idx[PT64_ROOT_LEVEL-1]; + struct kvm_mmu_page *parent[PT64_ROOT_LEVEL]; + unsigned int idx[PT64_ROOT_LEVEL]; }; #define for_each_sp(pvec, sp, parents, i) \ - for (i = mmu_pages_next(&pvec, &parents, -1), \ - sp = pvec.page[i].sp; \ + for (i = mmu_pages_first(&pvec, &parents); \ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ i = mmu_pages_next(&pvec, &parents, i)) @@ -1964,19 +1997,43 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec, for (n = i+1; n < pvec->nr; n++) { struct kvm_mmu_page *sp = pvec->page[n].sp; + unsigned idx = pvec->page[n].idx; + int level = sp->role.level; - if (sp->role.level == PT_PAGE_TABLE_LEVEL) { - parents->idx[0] = pvec->page[n].idx; - return n; - } + parents->idx[level-1] = idx; + if (level == PT_PAGE_TABLE_LEVEL) + break; - parents->parent[sp->role.level-2] = sp; - parents->idx[sp->role.level-1] = pvec->page[n].idx; + parents->parent[level-2] = sp; } return n; } +static int mmu_pages_first(struct kvm_mmu_pages *pvec, + struct mmu_page_path *parents) +{ + struct kvm_mmu_page *sp; + int level; + + if (pvec->nr == 0) + return 0; + + WARN_ON(pvec->page[0].idx != INVALID_INDEX); + + sp = pvec->page[0].sp; + level = sp->role.level; + WARN_ON(level == PT_PAGE_TABLE_LEVEL); + + parents->parent[level-2] = sp; + + /* Also set up a sentinel. Further entries in pvec are all + * children of sp, so this element is never overwritten. + */ + parents->parent[level-1] = NULL; + return mmu_pages_next(pvec, parents, 0); +} + static void mmu_pages_clear_parents(struct mmu_page_path *parents) { struct kvm_mmu_page *sp; @@ -1984,24 +2041,14 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) do { unsigned int idx = parents->idx[level]; - sp = parents->parent[level]; if (!sp) return; - --sp->unsync_children; - WARN_ON((int)sp->unsync_children < 0); - __clear_bit(idx, sp->unsync_child_bitmap); + WARN_ON(idx == INVALID_INDEX); + clear_unsync_child_bit(sp, idx); level++; - } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); -} - -static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, - struct mmu_page_path *parents, - struct kvm_mmu_pages *pvec) -{ - parents->parent[parent->role.level-1] = NULL; - pvec->nr = 0; + } while (!sp->unsync_children); } static void mmu_sync_children(struct kvm_vcpu *vcpu, @@ -2012,38 +2059,36 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, struct mmu_page_path parents; struct kvm_mmu_pages pages; LIST_HEAD(invalid_list); + bool flush = false; - kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { bool protected = false; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu, sp->gfn); - if (protected) + if (protected) { kvm_flush_remote_tlbs(vcpu->kvm); + flush = false; + } for_each_sp(pages, sp, parents, i) { - kvm_sync_page(vcpu, sp, &invalid_list); + flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - cond_resched_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_pages_init(parent, &parents, &pages); + if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + cond_resched_lock(&vcpu->kvm->mmu_lock); + flush = false; + } } -} - -static void init_shadow_page_table(struct kvm_mmu_page *sp) -{ - int i; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - sp->spt[i] = 0ull; + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); } static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) { - sp->write_flooding_count = 0; + atomic_set(&sp->write_flooding_count, 0); } static void clear_sp_write_flooding_count(u64 *spte) @@ -2063,13 +2108,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gva_t gaddr, unsigned level, int direct, - unsigned access, - u64 *parent_pte) + unsigned access) { union kvm_mmu_page_role role; unsigned quadrant; struct kvm_mmu_page *sp; bool need_sync = false; + bool flush = false; + LIST_HEAD(invalid_list); role = vcpu->arch.mmu.base_role; role.level = level; @@ -2093,39 +2139,52 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; - if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) - break; + if (sp->unsync) { + /* The page is good, but __kvm_sync_page might still end + * up zapping it. If so, break in order to rebuild it. + */ + if (!__kvm_sync_page(vcpu, sp, &invalid_list)) + break; + + WARN_ON(!list_empty(&invalid_list)); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - if (sp->unsync_children) { + if (sp->unsync_children) kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_mmu_mark_parents_unsync(sp); - } else if (sp->unsync) - kvm_mmu_mark_parents_unsync(sp); __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); return sp; } + ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); - if (!sp) - return sp; + + sp = kvm_mmu_alloc_page(vcpu, direct); + sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { - if (rmap_write_protect(vcpu, gfn)) + /* + * we should do write protection before syncing pages + * otherwise the content of the synced shadow page may + * be inconsistent with guest page table. + */ + account_shadowed(vcpu->kvm, sp); + if (level == PT_PAGE_TABLE_LEVEL && + rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); - if (level > PT_PAGE_TABLE_LEVEL && need_sync) - kvm_sync_pages(vcpu, gfn); - account_shadowed(vcpu->kvm, sp); + if (level > PT_PAGE_TABLE_LEVEL && need_sync) + flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; - init_shadow_page_table(sp); + clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); + + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); return sp; } @@ -2178,7 +2237,8 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) return __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) +static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, + struct kvm_mmu_page *sp) { u64 spte; @@ -2186,12 +2246,14 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask; - - if (accessed) - spte |= shadow_accessed_mask; + shadow_user_mask | shadow_x_mask | shadow_accessed_mask; mmu_spte_set(sptep, spte); + + mmu_page_add_parent_pte(vcpu, sp, sptep); + + if (sp->unsync_children || sp->unsync) + mark_unsync(sptep); } static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, @@ -2250,17 +2312,12 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, mmu_page_zap_pte(kvm, sp, sp->spt + i); } -static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) -{ - mmu_page_remove_parent_pte(sp, parent_pte); -} - static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; - while ((sptep = rmap_get_first(sp->parent_ptes, &iter))) + while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) drop_parent_pte(sp, sptep); } @@ -2275,7 +2332,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm, if (parent->role.level == PT_PAGE_TABLE_LEVEL) return 0; - kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { struct kvm_mmu_page *sp; @@ -2284,7 +2340,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm, mmu_pages_clear_parents(&parents); zapped++; } - kvm_mmu_pages_init(parent, &parents, &pages); } return zapped; @@ -2360,8 +2415,8 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, if (list_empty(&kvm->arch.active_mmu_pages)) return false; - sp = list_entry(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); + sp = list_last_entry(&kvm->arch.active_mmu_pages, + struct kvm_mmu_page, link); kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); return true; @@ -2414,7 +2469,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); ++vcpu->kvm->stat.mmu_unsync; @@ -2423,40 +2478,29 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) { - struct kvm_mmu_page *s; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { - if (s->unsync) - continue; - WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - __kvm_unsync_page(vcpu, s); - } -} + struct kvm_mmu_page *sp; -static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) -{ - struct kvm_mmu_page *s; - bool need_unsync = false; + if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + return true; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (!can_unsync) - return 1; + return true; - if (s->role.level != PT_PAGE_TABLE_LEVEL) - return 1; + if (sp->unsync) + continue; - if (!s->unsync) - need_unsync = true; + WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); + kvm_unsync_page(vcpu, sp); } - if (need_unsync) - kvm_unsync_pages(vcpu, gfn); - return 0; + + return false; } -static bool kvm_is_mmio_pfn(pfn_t pfn) +static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) { if (pfn_valid(pfn)) return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); @@ -2466,7 +2510,7 @@ static bool kvm_is_mmio_pfn(pfn_t pfn) static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int level, - gfn_t gfn, pfn_t pfn, bool speculative, + gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { u64 spte; @@ -2509,7 +2553,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * be fixed if guest refault. */ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu, gfn, level)) + mmu_gfn_lpage_is_disallowed(vcpu, gfn, level)) goto done; spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; @@ -2544,18 +2588,18 @@ done: return ret; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pte_access, int write_fault, int *emulate, - int level, gfn_t gfn, pfn_t pfn, bool speculative, - bool host_writable) +static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, + bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; + bool emulate = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); - if (is_rmap_spte(*sptep)) { + if (is_shadow_present_pte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. @@ -2580,12 +2624,12 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, true, host_writable)) { if (write_fault) - *emulate = 1; + emulate = true; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } - if (unlikely(is_mmio_spte(*sptep) && emulate)) - *emulate = 1; + if (unlikely(is_mmio_spte(*sptep))) + emulate = true; pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", @@ -2604,9 +2648,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } kvm_release_pfn_clean(pfn); + + return emulate; } -static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, +static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; @@ -2638,9 +2684,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) - mmu_set_spte(vcpu, start, access, 0, NULL, - sp->role.level, gfn, page_to_pfn(pages[i]), - true, true); + mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + page_to_pfn(pages[i]), true, true); return 0; } @@ -2688,9 +2733,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int map_writable, int level, gfn_t gfn, pfn_t pfn, - bool prefault) +static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, + int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -2702,9 +2746,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { - mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, - write, &emulate, level, gfn, pfn, - prefault, map_writable); + emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, + write, level, gfn, pfn, prefault, + map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; @@ -2717,10 +2761,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, base_addr &= PT64_LVL_ADDR_MASK(iterator.level); pseudo_gfn = base_addr >> PAGE_SHIFT; sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, - iterator.level - 1, - 1, ACC_ALL, iterator.sptep); + iterator.level - 1, 1, ACC_ALL); - link_shadow_page(iterator.sptep, sp, true); + link_shadow_page(vcpu, iterator.sptep, sp); } } return emulate; @@ -2739,7 +2782,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * send_sig_info(SIGBUS, &info, tsk); } -static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) +static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) { /* * Do not cache the mmio info caused by writing the readonly gfn @@ -2759,9 +2802,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) } static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, pfn_t *pfnp, int *levelp) + gfn_t *gfnp, kvm_pfn_t *pfnp, + int *levelp) { - pfn_t pfn = *pfnp; + kvm_pfn_t pfn = *pfnp; gfn_t gfn = *gfnp; int level = *levelp; @@ -2774,7 +2818,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && - !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) { + !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; /* * mmu_notifier_retry was successful and we hold the @@ -2800,22 +2844,18 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, } static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, - pfn_t pfn, unsigned access, int *ret_val) + kvm_pfn_t pfn, unsigned access, int *ret_val) { - bool ret = true; - /* The pfn is invalid, report the error! */ if (unlikely(is_error_pfn(pfn))) { *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); - goto exit; + return true; } if (unlikely(is_noslot_pfn(pfn))) vcpu_cache_mmio_info(vcpu, gva, gfn, access); - ret = false; -exit: - return ret; + return false; } static bool page_fault_can_be_fast(u32 error_code) @@ -2899,7 +2939,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * If the mapping has been changed, let the vcpu fault on the * same address again. */ - if (!is_rmap_spte(spte)) { + if (!is_shadow_present_pte(spte)) { ret = true; goto exit; } @@ -2954,7 +2994,7 @@ exit: } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, pfn_t *pfn, bool write, bool *writable); + gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); static void make_mmu_pages_available(struct kvm_vcpu *vcpu); static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, @@ -2962,14 +3002,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - int force_pt_level; - pfn_t pfn; + bool force_pt_level = false; + kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); /* * This path builds a PAE pagetable - so we can map * 2mb pages at maximum. Therefore check if the level @@ -2979,8 +3018,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, v, level, error_code)) return 0; @@ -3000,11 +3038,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, - prefault); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); - return r; out_unlock: @@ -3079,8 +3115,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); - sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, - 1, ACC_ALL, NULL); + sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = __pa(sp->spt); @@ -3092,9 +3127,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), - i << 30, - PT32_ROOT_LEVEL, 1, ACC_ALL, - NULL); + i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); @@ -3131,7 +3164,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, - 0, ACC_ALL, NULL); + 0, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); @@ -3164,9 +3197,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); - sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, 0, - ACC_ALL, NULL); + sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, + 0, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); @@ -3287,7 +3319,7 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); } -static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) +static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) { if (direct) return vcpu_match_mmio_gpa(vcpu, addr); @@ -3341,16 +3373,16 @@ exit: return reserved; } -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) +int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; - if (quickly_check_mmio_pf(vcpu, addr, direct)) + if (mmio_info_in_cache(vcpu, addr, direct)) return RET_MMIO_PF_EMULATE; reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); - if (unlikely(reserved)) + if (WARN_ON(reserved)) return RET_MMIO_PF_BUG; if (is_mmio_spte(spte)) { @@ -3374,32 +3406,55 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) */ return RET_MMIO_PF_RETRY; } -EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); +EXPORT_SYMBOL_GPL(handle_mmio_page_fault); -static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, - u32 error_code, bool direct) +static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, + u32 error_code, gfn_t gfn) { - int ret; + if (unlikely(error_code & PFERR_RSVD_MASK)) + return false; - ret = handle_mmio_page_fault_common(vcpu, addr, direct); - WARN_ON(ret == RET_MMIO_PF_BUG); - return ret; + if (!(error_code & PFERR_PRESENT_MASK) || + !(error_code & PFERR_WRITE_MASK)) + return false; + + /* + * guest is writing the page which is write tracked which can + * not be fixed by page fault handler. + */ + if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + return true; + + return false; +} + +static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) +{ + struct kvm_shadow_walk_iterator iterator; + u64 spte; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + clear_sp_write_flooding_count(iterator.sptep); + if (!is_shadow_present_pte(spte)) + break; + } + walk_shadow_page_lockless_end(vcpu); } static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) { - gfn_t gfn; + gfn_t gfn = gva >> PAGE_SHIFT; int r; pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gva, error_code, true); - - if (likely(r != RET_MMIO_PF_INVALID)) - return r; - } + if (page_fault_handle_page_track(vcpu, error_code, gfn)) + return 1; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3407,7 +3462,6 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, error_code, gfn, prefault); @@ -3427,7 +3481,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) static bool can_do_async_pf(struct kvm_vcpu *vcpu) { - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; @@ -3435,7 +3489,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu) } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, pfn_t *pfn, bool write, bool *writable) + gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) { struct kvm_memory_slot *slot; bool async; @@ -3473,10 +3527,10 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, bool prefault) { - pfn_t pfn; + kvm_pfn_t pfn; int r; int level; - int force_pt_level; + bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -3484,31 +3538,22 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gpa, error_code, true); - - if (likely(r != RET_MMIO_PF_INVALID)) - return r; - } + if (page_fault_handle_page_track(vcpu, error_code, gfn)) + return 1; r = mmu_topup_memory_caches(vcpu); if (r) return r; - if (mapping_level_dirty_bitmap(vcpu, gfn) || - !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) - force_pt_level = 1; - else - force_pt_level = 0; - + force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, + PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); if (level > PT_DIRECTORY_LEVEL && !check_hugepage_cache_consistency(vcpu, gfn, level)) level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, gpa, level, error_code)) return 0; @@ -3528,8 +3573,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, - level, gfn, pfn, prefault); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -3588,13 +3632,24 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return false; } -static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) +static inline bool is_last_gpte(struct kvm_mmu *mmu, + unsigned level, unsigned gpte) { - unsigned index; + /* + * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set + * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means + * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + + /* + * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. + * If it is clear, there are no large pages at this level, so clear + * PT_PAGE_SIZE_MASK in gpte if that is the case. + */ + gpte &= level - mmu->last_nonleaf_level; - index = level - 1; - index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2); - return mmu->last_pte_bitmap & (1 << index); + return gpte & PT_PAGE_SIZE_MASK; } #define PTTYPE_EPT 18 /* arbitrary */ @@ -3706,7 +3761,7 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, int maxphyaddr, bool execonly) { - int pte; + u64 bad_mt_xwr; rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); @@ -3724,14 +3779,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; - for (pte = 0; pte < 64; pte++) { - int rwx_bits = pte & 7; - int mt = pte >> 3; - if (mt == 0x2 || mt == 0x3 || mt == 0x7 || - rwx_bits == 0x2 || rwx_bits == 0x6 || - (rwx_bits == 0x4 && !execonly)) - rsvd_check->bad_mt_xwr |= (1ull << pte); + bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ + bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ + bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ + if (!execonly) { + /* bits 0..2 must not be 100 unless VMX capabilities allow it */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } + rsvd_check->bad_mt_xwr = bad_mt_xwr; } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, @@ -3749,13 +3806,15 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { + bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + /* * Passing "true" to the last argument is okay; it adds a check * on bit 8 of the SPTEs which KVM doesn't use anyway. */ __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, boot_cpu_data.x86_phys_bits, - context->shadow_root_level, context->nx, + context->shadow_root_level, uses_nx, guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), true); } @@ -3864,22 +3923,13 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, } } -static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { - u8 map; - unsigned level, root_level = mmu->root_level; - const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */ - - if (root_level == PT32E_ROOT_LEVEL) - --root_level; - /* PT_PAGE_TABLE_LEVEL always terminates */ - map = 1 | (1 << ps_set_index); - for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) { - if (level <= PT_PDPE_LEVEL - && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu))) - map |= 1 << (ps_set_index | (level - 1)); - } - mmu->last_pte_bitmap = map; + unsigned root_level = mmu->root_level; + + mmu->last_nonleaf_level = root_level; + if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu)) + mmu->last_nonleaf_level++; } static void paging64_init_context_common(struct kvm_vcpu *vcpu, @@ -3891,7 +3941,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_last_nonleaf_level(vcpu, context); MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; @@ -3918,7 +3968,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_last_nonleaf_level(vcpu, context); context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -3976,7 +4026,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4053,10 +4103,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->inject_page_fault = kvm_inject_page_fault; /* - * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The - * translation of l2_gpa to l1_gpa addresses is done using the - * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa - * functions between mmu and nested_mmu are swapped. + * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using + * L1's nested page tables (e.g. EPT12). The nested translation + * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using + * L2's page tables as the first level of translation and L1's + * nested page tables as the second level of translation. Basically + * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ if (!is_paging(vcpu)) { g_context->nx = false; @@ -4080,7 +4132,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context, false); - update_last_pte_bitmap(vcpu, g_context); + update_last_nonleaf_level(vcpu, g_context); } static void init_kvm_mmu(struct kvm_vcpu *vcpu) @@ -4151,18 +4203,6 @@ static bool need_remote_flush(u64 old, u64 new) return (old & ~new & PT64_PERM_MASK) != 0; } -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, - bool remote_flush, bool local_flush) -{ - if (zap_page) - return; - - if (remote_flush) - kvm_flush_remote_tlbs(vcpu->kvm); - else if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); -} - static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, const u8 *new, int *bytes) { @@ -4212,7 +4252,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) if (sp->role.level == PT_PAGE_TABLE_LEVEL) return false; - return ++sp->write_flooding_count >= 3; + atomic_inc(&sp->write_flooding_count); + return atomic_read(&sp->write_flooding_count) >= 3; } /* @@ -4274,15 +4315,15 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) return spte; } -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) +static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; - bool remote_flush, local_flush, zap_page; + bool remote_flush, local_flush; union kvm_mmu_page_role mask = { }; mask.cr0_wp = 1; @@ -4299,7 +4340,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; - zap_page = remote_flush = local_flush = false; + remote_flush = local_flush = false; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -4319,8 +4360,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { - zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } @@ -4342,8 +4382,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++spte; } } - mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); } @@ -4380,32 +4419,34 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } -static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr) -{ - if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu)) - return vcpu_match_mmio_gpa(vcpu, addr); - - return vcpu_match_mmio_gva(vcpu, addr); -} - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_RETRY; enum emulation_result er; + bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu); + + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, cr2, direct); + if (r == RET_MMIO_PF_EMULATE) { + emulation_type = 0; + goto emulate; + } + if (r == RET_MMIO_PF_RETRY) + return 1; + if (r < 0) + return r; + } r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); if (r < 0) - goto out; - - if (!r) { - r = 1; - goto out; - } + return r; + if (!r) + return 1; - if (is_mmio_page_fault(vcpu, cr2)) + if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; - +emulate: er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); switch (er) { @@ -4419,8 +4460,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, default: BUG(); } -out: - return r; } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); @@ -4489,8 +4528,23 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } +void kvm_mmu_init_vm(struct kvm *kvm) +{ + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + + node->track_write = kvm_mmu_pte_write; + kvm_page_track_register_notifier(kvm, node); +} + +void kvm_mmu_uninit_vm(struct kvm *kvm) +{ + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + + kvm_page_track_unregister_notifier(kvm, node); +} + /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap); +typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); /* The caller should hold mmu-lock before calling this function. */ static bool @@ -4584,9 +4638,10 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) spin_unlock(&kvm->mmu_lock); } -static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp) +static bool slot_rmap_write_protect(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) { - return __rmap_write_protect(kvm, rmapp, false); + return __rmap_write_protect(kvm, rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -4622,16 +4677,16 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, - unsigned long *rmapp) + struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; int need_tlb_flush = 0; - pfn_t pfn; + kvm_pfn_t pfn; struct kvm_mmu_page *sp; restart: - for_each_rmap_spte(rmapp, &iter, sptep) { + for_each_rmap_spte(rmap_head, &iter, sptep) { sp = page_header(__pa(sptep)); pfn = spte_to_pfn(*sptep); |