summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--arch/x86/kvm/mmu.c1021
1 files changed, 538 insertions, 483 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff606f5..c512f09 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -41,6 +41,7 @@
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/vmx.h>
+#include <asm/kvm_page_track.h>
/*
* When setting this variable to true it enables Two-Dimensional-Paging
@@ -259,7 +260,7 @@ static unsigned get_mmio_spte_access(u64 spte)
}
static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- pfn_t pfn, unsigned access)
+ kvm_pfn_t pfn, unsigned access)
{
if (unlikely(is_noslot_pfn(pfn))) {
mark_mmio_spte(vcpu, sptep, gfn, access);
@@ -311,11 +312,6 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}
-static int is_rmap_spte(u64 pte)
-{
- return is_shadow_present_pte(pte);
-}
-
static int is_last_spte(u64 pte, int level)
{
if (level == PT_PAGE_TABLE_LEVEL)
@@ -325,7 +321,7 @@ static int is_last_spte(u64 pte, int level)
return 0;
}
-static pfn_t spte_to_pfn(u64 pte)
+static kvm_pfn_t spte_to_pfn(u64 pte)
{
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}
@@ -540,7 +536,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
u64 old_spte = *sptep;
bool ret = false;
- WARN_ON(!is_rmap_spte(new_spte));
+ WARN_ON(!is_shadow_present_pte(new_spte));
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
@@ -587,7 +583,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
*/
static int mmu_spte_clear_track_bits(u64 *sptep)
{
- pfn_t pfn;
+ kvm_pfn_t pfn;
u64 old_spte = *sptep;
if (!spte_has_volatile_bits(old_spte))
@@ -595,7 +591,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
else
old_spte = __update_clear_spte_slow(sptep, 0ull);
- if (!is_rmap_spte(old_spte))
+ if (!is_shadow_present_pte(old_spte))
return 0;
pfn = spte_to_pfn(old_spte);
@@ -781,57 +777,85 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
return &slot->arch.lpage_info[level - 2][idx];
}
+static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+ gfn_t gfn, int count)
+{
+ struct kvm_lpage_info *linfo;
+ int i;
+
+ for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->disallow_lpage += count;
+ WARN_ON(linfo->disallow_lpage < 0);
+ }
+}
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, 1);
+}
+
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, -1);
+}
+
static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
gfn_t gfn;
- int i;
+ kvm->arch.indirect_shadow_pages++;
gfn = sp->gfn;
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, gfn);
- for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- linfo = lpage_info_slot(gfn, slot, i);
- linfo->write_count += 1;
- }
- kvm->arch.indirect_shadow_pages++;
+
+ /* the non-leaf shadow pages are keeping readonly. */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return kvm_slot_page_track_add_page(kvm, slot, gfn,
+ KVM_PAGE_TRACK_WRITE);
+
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
}
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
gfn_t gfn;
- int i;
+ kvm->arch.indirect_shadow_pages--;
gfn = sp->gfn;
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, gfn);
- for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- linfo = lpage_info_slot(gfn, slot, i);
- linfo->write_count -= 1;
- WARN_ON(linfo->write_count < 0);
- }
- kvm->arch.indirect_shadow_pages--;
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+ KVM_PAGE_TRACK_WRITE);
+
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-static int has_wrprotected_page(struct kvm_vcpu *vcpu,
- gfn_t gfn,
- int level)
+static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
{
- struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (slot) {
linfo = lpage_info_slot(gfn, slot, level);
- return linfo->write_count;
+ return !!linfo->disallow_lpage;
}
- return 1;
+ return true;
+}
+
+static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int level)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
}
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
@@ -851,6 +875,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
return ret;
}
+static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
+ bool no_dirty_log)
+{
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return false;
+ if (no_dirty_log && slot->dirty_bitmap)
+ return false;
+
+ return true;
+}
+
static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
@@ -858,21 +893,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
- (no_dirty_log && slot->dirty_bitmap))
+ if (!memslot_valid_for_gpte(slot, no_dirty_log))
slot = NULL;
return slot;
}
-static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
-{
- return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
-}
-
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
+ bool *force_pt_level)
{
int host_level, level, max_level;
+ struct kvm_memory_slot *slot;
+
+ if (unlikely(*force_pt_level))
+ return PT_PAGE_TABLE_LEVEL;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
+ *force_pt_level = !memslot_valid_for_gpte(slot, true);
+ if (unlikely(*force_pt_level))
+ return PT_PAGE_TABLE_LEVEL;
host_level = host_mapping_level(vcpu->kvm, large_gfn);
@@ -882,43 +921,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- if (has_wrprotected_page(vcpu, large_gfn, level))
+ if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
break;
return level - 1;
}
/*
- * Pte mapping structures:
+ * About rmap_head encoding:
*
- * If pte_list bit zero is zero, then pte_list point to the spte.
- *
- * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
+ * If the bit zero of rmap_head->val is clear, then it points to the only spte
+ * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
* pte_list_desc containing more mappings.
- *
- * Returns the number of pte entries before the spte was added or zero if
- * the spte was not added.
- *
+ */
+
+/*
+ * Returns the number of pointers in the rmap chain, not counting the new one.
*/
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
- unsigned long *pte_list)
+ struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
int i, count = 0;
- if (!*pte_list) {
+ if (!rmap_head->val) {
rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
- *pte_list = (unsigned long)spte;
- } else if (!(*pte_list & 1)) {
+ rmap_head->val = (unsigned long)spte;
+ } else if (!(rmap_head->val & 1)) {
rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_pte_list_desc(vcpu);
- desc->sptes[0] = (u64 *)*pte_list;
+ desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
- *pte_list = (unsigned long)desc | 1;
+ rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
desc = desc->more;
count += PTE_LIST_EXT;
@@ -935,8 +973,9 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
}
static void
-pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
- int i, struct pte_list_desc *prev_desc)
+pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+ struct pte_list_desc *desc, int i,
+ struct pte_list_desc *prev_desc)
{
int j;
@@ -947,43 +986,43 @@ pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
if (j != 0)
return;
if (!prev_desc && !desc->more)
- *pte_list = (unsigned long)desc->sptes[0];
+ rmap_head->val = (unsigned long)desc->sptes[0];
else
if (prev_desc)
prev_desc->more = desc->more;
else
- *pte_list = (unsigned long)desc->more | 1;
+ rmap_head->val = (unsigned long)desc->more | 1;
mmu_free_pte_list_desc(desc);
}
-static void pte_list_remove(u64 *spte, unsigned long *pte_list)
+static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
struct pte_list_desc *prev_desc;
int i;
- if (!*pte_list) {
+ if (!rmap_head->val) {
printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
BUG();
- } else if (!(*pte_list & 1)) {
+ } else if (!(rmap_head->val & 1)) {
rmap_printk("pte_list_remove: %p 1->0\n", spte);
- if ((u64 *)*pte_list != spte) {
+ if ((u64 *)rmap_head->val != spte) {
printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
BUG();
}
- *pte_list = 0;
+ rmap_head->val = 0;
} else {
rmap_printk("pte_list_remove: %p many->many\n", spte);
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
prev_desc = NULL;
while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+ for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(pte_list,
- desc, i,
- prev_desc);
+ pte_list_desc_remove_entry(rmap_head,
+ desc, i, prev_desc);
return;
}
+ }
prev_desc = desc;
desc = desc->more;
}
@@ -992,28 +1031,8 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list)
}
}
-typedef void (*pte_list_walk_fn) (u64 *spte);
-static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
-{
- struct pte_list_desc *desc;
- int i;
-
- if (!*pte_list)
- return;
-
- if (!(*pte_list & 1))
- return fn((u64 *)*pte_list);
-
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
- while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
- fn(desc->sptes[i]);
- desc = desc->more;
- }
-}
-
-static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
- struct kvm_memory_slot *slot)
+static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
{
unsigned long idx;
@@ -1021,10 +1040,8 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
}
-/*
- * Take gfn and return the reverse mapping to it.
- */
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp)
+static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
+ struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
@@ -1045,24 +1062,24 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu)
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
struct kvm_mmu_page *sp;
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
- return pte_list_add(vcpu, spte, rmapp);
+ rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ return pte_list_add(vcpu, spte, rmap_head);
}
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct kvm_mmu_page *sp;
gfn_t gfn;
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
sp = page_header(__pa(spte));
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
- rmapp = gfn_to_rmap(kvm, gfn, sp);
- pte_list_remove(spte, rmapp);
+ rmap_head = gfn_to_rmap(kvm, gfn, sp);
+ pte_list_remove(spte, rmap_head);
}
/*
@@ -1082,19 +1099,26 @@ struct rmap_iterator {
*
* Returns sptep if found, NULL otherwise.
*/
-static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
+static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
+ struct rmap_iterator *iter)
{
- if (!rmap)
+ u64 *sptep;
+
+ if (!rmap_head->val)
return NULL;
- if (!(rmap & 1)) {
+ if (!(rmap_head->val & 1)) {
iter->desc = NULL;
- return (u64 *)rmap;
+ sptep = (u64 *)rmap_head->val;
+ goto out;
}
- iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
+ iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
iter->pos = 0;
- return iter->desc->sptes[iter->pos];
+ sptep = iter->desc->sptes[iter->pos];
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
}
/*
@@ -1104,14 +1128,14 @@ static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
*/
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
+ u64 *sptep;
+
if (iter->desc) {
if (iter->pos < PTE_LIST_EXT - 1) {
- u64 *sptep;
-
++iter->pos;
sptep = iter->desc->sptes[iter->pos];
if (sptep)
- return sptep;
+ goto out;
}
iter->desc = iter->desc->more;
@@ -1119,17 +1143,20 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
if (iter->desc) {
iter->pos = 0;
/* desc->sptes[0] cannot be NULL */
- return iter->desc->sptes[iter->pos];
+ sptep = iter->desc->sptes[iter->pos];
+ goto out;
}
}
return NULL;
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
}
-#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \
- for (_spte_ = rmap_get_first(*_rmap_, _iter_); \
- _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \
- _spte_ = rmap_get_next(_iter_))
+#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
+ for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
+ _spte_; _spte_ = rmap_get_next(_iter_))
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
@@ -1187,14 +1214,15 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+static bool __rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep)
flush |= spte_write_protect(kvm, sptep, pt_protect);
return flush;
@@ -1211,13 +1239,13 @@ static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep)
flush |= spte_clear_dirty(kvm, sptep);
return flush;
@@ -1234,13 +1262,13 @@ static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
+static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep)
flush |= spte_set_dirty(kvm, sptep);
return flush;
@@ -1260,12 +1288,12 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
while (mask) {
- rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PT_PAGE_TABLE_LEVEL, slot);
- __rmap_write_protect(kvm, rmapp, false);
+ rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_write_protect(kvm, rmap_head, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1285,12 +1313,12 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
while (mask) {
- rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PT_PAGE_TABLE_LEVEL, slot);
- __rmap_clear_dirty(kvm, rmapp);
+ rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_clear_dirty(kvm, rmap_head);
/* clear the first set bit */
mask &= mask - 1;
@@ -1319,31 +1347,36 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
-static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn)
{
- struct kvm_memory_slot *slot;
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
int i;
bool write_protected = false;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-
for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- rmapp = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true);
+ rmap_head = __gfn_to_rmap(gfn, i, slot);
+ write_protected |= __rmap_write_protect(kvm, rmap_head, true);
}
return write_protected;
}
-static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+}
+
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- while ((sptep = rmap_get_first(*rmapp, &iter))) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
+ while ((sptep = rmap_get_first(rmap_head, &iter))) {
rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
drop_spte(kvm, sptep);
@@ -1353,14 +1386,14 @@ static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
return flush;
}
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
- return kvm_zap_rmapp(kvm, rmapp);
+ return kvm_zap_rmapp(kvm, rmap_head);
}
-static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
@@ -1369,13 +1402,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
int need_flush = 0;
u64 new_spte;
pte_t *ptep = (pte_t *)data;
- pfn_t new_pfn;
+ kvm_pfn_t new_pfn;
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
restart:
- for_each_rmap_spte(rmapp, &iter, sptep) {
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
@@ -1413,11 +1446,11 @@ struct slot_rmap_walk_iterator {
/* output fields. */
gfn_t gfn;
- unsigned long *rmap;
+ struct kvm_rmap_head *rmap;
int level;
/* private field. */
- unsigned long *end_rmap;
+ struct kvm_rmap_head *end_rmap;
};
static void
@@ -1476,7 +1509,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long end,
unsigned long data,
int (*handler)(struct kvm *kvm,
- unsigned long *rmapp,
+ struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot,
gfn_t gfn,
int level,
@@ -1520,7 +1553,8 @@ static int kvm_handle_hva_range(struct kvm *kvm,
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
unsigned long data,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot,
gfn_t gfn, int level,
unsigned long data))
@@ -1543,7 +1577,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
@@ -1553,18 +1587,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
BUG_ON(!shadow_accessed_mask);
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
if (*sptep & shadow_accessed_mask) {
young = 1;
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
}
+ }
trace_kvm_age_page(gfn, level, slot, young);
return young;
}
-static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn,
int level, unsigned long data)
{
@@ -1580,11 +1615,12 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (!shadow_accessed_mask)
goto out;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
if (*sptep & shadow_accessed_mask) {
young = 1;
break;
}
+ }
out:
return young;
}
@@ -1593,14 +1629,14 @@ out:
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *sp;
sp = page_header(__pa(spte));
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
- kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
@@ -1700,8 +1736,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
mmu_spte_clear_no_track(parent_pte);
}
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte, int direct)
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
{
struct kvm_mmu_page *sp;
@@ -1717,8 +1752,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
* this feature. See the comments in kvm_zap_obsolete_pages().
*/
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- sp->parent_ptes = 0;
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
}
@@ -1726,7 +1759,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- pte_list_walk(&sp->parent_ptes, mark_unsync);
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
+ mark_unsync(sptep);
+ }
}
static void mark_unsync(u64 *spte)
@@ -1746,7 +1784,7 @@ static void mark_unsync(u64 *spte)
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
- return 1;
+ return 0;
}
static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
@@ -1786,6 +1824,13 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
return (pvec->nr == KVM_PAGE_ARRAY_NR);
}
+static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
+{
+ --sp->unsync_children;
+ WARN_ON((int)sp->unsync_children < 0);
+ __clear_bit(idx, sp->unsync_child_bitmap);
+}
+
static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_pages *pvec)
{
@@ -1795,8 +1840,10 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_page *child;
u64 ent = sp->spt[i];
- if (!is_shadow_present_pte(ent) || is_large_pte(ent))
- goto clear_child_bitmap;
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ }
child = page_header(ent & PT64_BASE_ADDR_MASK);
@@ -1805,38 +1852,34 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
return -ENOSPC;
ret = __mmu_unsync_walk(child, pvec);
- if (!ret)
- goto clear_child_bitmap;
- else if (ret > 0)
+ if (!ret) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ } else if (ret > 0) {
nr_unsync_leaf += ret;
- else
+ } else
return ret;
} else if (child->unsync) {
nr_unsync_leaf++;
if (mmu_pages_add(pvec, child, i))
return -ENOSPC;
} else
- goto clear_child_bitmap;
-
- continue;
-
-clear_child_bitmap:
- __clear_bit(i, sp->unsync_child_bitmap);
- sp->unsync_children--;
- WARN_ON((int)sp->unsync_children < 0);
+ clear_unsync_child_bit(sp, i);
}
-
return nr_unsync_leaf;
}
+#define INVALID_INDEX (-1)
+
static int mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_pages *pvec)
{
+ pvec->nr = 0;
if (!sp->unsync_children)
return 0;
- mmu_pages_add(pvec, sp, 0);
+ mmu_pages_add(pvec, sp, INVALID_INDEX);
return __mmu_unsync_walk(sp, pvec);
}
@@ -1873,37 +1916,35 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
if ((_sp)->role.direct || (_sp)->role.invalid) {} else
/* @sp->gfn should be write-protected at the call site */
-static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list, bool clear_unsync)
+static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
if (sp->role.cr4_pae != !!is_pae(vcpu)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return 1;
+ return false;
}
- if (clear_unsync)
- kvm_unlink_unsync_page(vcpu->kvm, sp);
-
- if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+ if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return 1;
+ return false;
}
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- return 0;
+ return true;
}
-static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
+static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
+ struct list_head *invalid_list,
+ bool remote_flush, bool local_flush)
{
- LIST_HEAD(invalid_list);
- int ret;
-
- ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
- if (ret)
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ if (!list_empty(invalid_list)) {
+ kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+ return;
+ }
- return ret;
+ if (remote_flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ else if (local_flush)
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
#ifdef CONFIG_KVM_MMU_AUDIT
@@ -1913,46 +1954,38 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- return __kvm_sync_page(vcpu, sp, invalid_list, true);
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ return __kvm_sync_page(vcpu, sp, invalid_list);
}
/* @gfn should be write-protected at the call site */
-static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct list_head *invalid_list)
{
struct kvm_mmu_page *s;
- LIST_HEAD(invalid_list);
- bool flush = false;
+ bool ret = false;
for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
if (!s->unsync)
continue;
WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- kvm_unlink_unsync_page(vcpu->kvm, s);
- if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
- (vcpu->arch.mmu.sync_page(vcpu, s))) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
- continue;
- }
- flush = true;
+ ret |= kvm_sync_page(vcpu, s, invalid_list);
}
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- if (flush)
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ return ret;
}
struct mmu_page_path {
- struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
- unsigned int idx[PT64_ROOT_LEVEL-1];
+ struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
+ unsigned int idx[PT64_ROOT_LEVEL];
};
#define for_each_sp(pvec, sp, parents, i) \
- for (i = mmu_pages_next(&pvec, &parents, -1), \
- sp = pvec.page[i].sp; \
+ for (i = mmu_pages_first(&pvec, &parents); \
i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
i = mmu_pages_next(&pvec, &parents, i))
@@ -1964,19 +1997,43 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
for (n = i+1; n < pvec->nr; n++) {
struct kvm_mmu_page *sp = pvec->page[n].sp;
+ unsigned idx = pvec->page[n].idx;
+ int level = sp->role.level;
- if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
- parents->idx[0] = pvec->page[n].idx;
- return n;
- }
+ parents->idx[level-1] = idx;
+ if (level == PT_PAGE_TABLE_LEVEL)
+ break;
- parents->parent[sp->role.level-2] = sp;
- parents->idx[sp->role.level-1] = pvec->page[n].idx;
+ parents->parent[level-2] = sp;
}
return n;
}
+static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ int level;
+
+ if (pvec->nr == 0)
+ return 0;
+
+ WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+
+ sp = pvec->page[0].sp;
+ level = sp->role.level;
+ WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+
+ parents->parent[level-2] = sp;
+
+ /* Also set up a sentinel. Further entries in pvec are all
+ * children of sp, so this element is never overwritten.
+ */
+ parents->parent[level-1] = NULL;
+ return mmu_pages_next(pvec, parents, 0);
+}
+
static void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
struct kvm_mmu_page *sp;
@@ -1984,24 +2041,14 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
do {
unsigned int idx = parents->idx[level];
-
sp = parents->parent[level];
if (!sp)
return;
- --sp->unsync_children;
- WARN_ON((int)sp->unsync_children < 0);
- __clear_bit(idx, sp->unsync_child_bitmap);
+ WARN_ON(idx == INVALID_INDEX);
+ clear_unsync_child_bit(sp, idx);
level++;
- } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
-}
-
-static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
- struct mmu_page_path *parents,
- struct kvm_mmu_pages *pvec)
-{
- parents->parent[parent->role.level-1] = NULL;
- pvec->nr = 0;
+ } while (!sp->unsync_children);
}
static void mmu_sync_children(struct kvm_vcpu *vcpu,
@@ -2012,38 +2059,36 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
struct mmu_page_path parents;
struct kvm_mmu_pages pages;
LIST_HEAD(invalid_list);
+ bool flush = false;
- kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
bool protected = false;
for_each_sp(pages, sp, parents, i)
protected |= rmap_write_protect(vcpu, sp->gfn);
- if (protected)
+ if (protected) {
kvm_flush_remote_tlbs(vcpu->kvm);
+ flush = false;
+ }
for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp, &invalid_list);
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- cond_resched_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_pages_init(parent, &parents, &pages);
+ if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ cond_resched_lock(&vcpu->kvm->mmu_lock);
+ flush = false;
+ }
}
-}
-
-static void init_shadow_page_table(struct kvm_mmu_page *sp)
-{
- int i;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- sp->spt[i] = 0ull;
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
}
static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
{
- sp->write_flooding_count = 0;
+ atomic_set(&sp->write_flooding_count, 0);
}
static void clear_sp_write_flooding_count(u64 *spte)
@@ -2063,13 +2108,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gva_t gaddr,
unsigned level,
int direct,
- unsigned access,
- u64 *parent_pte)
+ unsigned access)
{
union kvm_mmu_page_role role;
unsigned quadrant;
struct kvm_mmu_page *sp;
bool need_sync = false;
+ bool flush = false;
+ LIST_HEAD(invalid_list);
role = vcpu->arch.mmu.base_role;
role.level = level;
@@ -2093,39 +2139,52 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (sp->role.word != role.word)
continue;
- if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
- break;
+ if (sp->unsync) {
+ /* The page is good, but __kvm_sync_page might still end
+ * up zapping it. If so, break in order to rebuild it.
+ */
+ if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+ break;
+
+ WARN_ON(!list_empty(&invalid_list));
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- if (sp->unsync_children) {
+ if (sp->unsync_children)
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
- kvm_mmu_mark_parents_unsync(sp);
- } else if (sp->unsync)
- kvm_mmu_mark_parents_unsync(sp);
__clear_sp_write_flooding_count(sp);
trace_kvm_mmu_get_page(sp, false);
return sp;
}
+
++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
- if (!sp)
- return sp;
+
+ sp = kvm_mmu_alloc_page(vcpu, direct);
+
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link,
&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
if (!direct) {
- if (rmap_write_protect(vcpu, gfn))
+ /*
+ * we should do write protection before syncing pages
+ * otherwise the content of the synced shadow page may
+ * be inconsistent with guest page table.
+ */
+ account_shadowed(vcpu->kvm, sp);
+ if (level == PT_PAGE_TABLE_LEVEL &&
+ rmap_write_protect(vcpu, gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
- if (level > PT_PAGE_TABLE_LEVEL && need_sync)
- kvm_sync_pages(vcpu, gfn);
- account_shadowed(vcpu->kvm, sp);
+ if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+ flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
- init_shadow_page_table(sp);
+ clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
+
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
return sp;
}
@@ -2178,7 +2237,8 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
return __shadow_walk_next(iterator, *iterator->sptep);
}
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
+static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
+ struct kvm_mmu_page *sp)
{
u64 spte;
@@ -2186,12 +2246,14 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask;
-
- if (accessed)
- spte |= shadow_accessed_mask;
+ shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
mmu_spte_set(sptep, spte);
+
+ mmu_page_add_parent_pte(vcpu, sp, sptep);
+
+ if (sp->unsync_children || sp->unsync)
+ mark_unsync(sptep);
}
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
@@ -2250,17 +2312,12 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
mmu_page_zap_pte(kvm, sp, sp->spt + i);
}
-static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
-{
- mmu_page_remove_parent_pte(sp, parent_pte);
-}
-
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *sptep;
struct rmap_iterator iter;
- while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
+ while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
drop_parent_pte(sp, sptep);
}
@@ -2275,7 +2332,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
if (parent->role.level == PT_PAGE_TABLE_LEVEL)
return 0;
- kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
struct kvm_mmu_page *sp;
@@ -2284,7 +2340,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
mmu_pages_clear_parents(&parents);
zapped++;
}
- kvm_mmu_pages_init(parent, &parents, &pages);
}
return zapped;
@@ -2360,8 +2415,8 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
if (list_empty(&kvm->arch.active_mmu_pages))
return false;
- sp = list_entry(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
+ sp = list_last_entry(&kvm->arch.active_mmu_pages,
+ struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
return true;
@@ -2414,7 +2469,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
-static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
trace_kvm_mmu_unsync_page(sp);
++vcpu->kvm->stat.mmu_unsync;
@@ -2423,40 +2478,29 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
kvm_mmu_mark_parents_unsync(sp);
}
-static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool can_unsync)
{
- struct kvm_mmu_page *s;
-
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
- if (s->unsync)
- continue;
- WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- __kvm_unsync_page(vcpu, s);
- }
-}
+ struct kvm_mmu_page *sp;
-static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync)
-{
- struct kvm_mmu_page *s;
- bool need_unsync = false;
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (!can_unsync)
- return 1;
+ return true;
- if (s->role.level != PT_PAGE_TABLE_LEVEL)
- return 1;
+ if (sp->unsync)
+ continue;
- if (!s->unsync)
- need_unsync = true;
+ WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+ kvm_unsync_page(vcpu, sp);
}
- if (need_unsync)
- kvm_unsync_pages(vcpu, gfn);
- return 0;
+
+ return false;
}
-static bool kvm_is_mmio_pfn(pfn_t pfn)
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
{
if (pfn_valid(pfn))
return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
@@ -2466,7 +2510,7 @@ static bool kvm_is_mmio_pfn(pfn_t pfn)
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int level,
- gfn_t gfn, pfn_t pfn, bool speculative,
+ gfn_t gfn, kvm_pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
u64 spte;
@@ -2509,7 +2553,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* be fixed if guest refault.
*/
if (level > PT_PAGE_TABLE_LEVEL &&
- has_wrprotected_page(vcpu, gfn, level))
+ mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
goto done;
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@ -2544,18 +2588,18 @@ done:
return ret;
}
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pte_access, int write_fault, int *emulate,
- int level, gfn_t gfn, pfn_t pfn, bool speculative,
- bool host_writable)
+static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
+ bool emulate = false;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
- if (is_rmap_spte(*sptep)) {
+ if (is_shadow_present_pte(*sptep)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
* the parent of the now unreachable PTE.
@@ -2580,12 +2624,12 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
true, host_writable)) {
if (write_fault)
- *emulate = 1;
+ emulate = true;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
- if (unlikely(is_mmio_spte(*sptep) && emulate))
- *emulate = 1;
+ if (unlikely(is_mmio_spte(*sptep)))
+ emulate = true;
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
@@ -2604,9 +2648,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
kvm_release_pfn_clean(pfn);
+
+ return emulate;
}
-static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
{
struct kvm_memory_slot *slot;
@@ -2638,9 +2684,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;
for (i = 0; i < ret; i++, gfn++, start++)
- mmu_set_spte(vcpu, start, access, 0, NULL,
- sp->role.level, gfn, page_to_pfn(pages[i]),
- true, true);
+ mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
+ page_to_pfn(pages[i]), true, true);
return 0;
}
@@ -2688,9 +2733,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int map_writable, int level, gfn_t gfn, pfn_t pfn,
- bool prefault)
+static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
+ int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -2702,9 +2746,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
- write, &emulate, level, gfn, pfn,
- prefault, map_writable);
+ emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
+ write, level, gfn, pfn, prefault,
+ map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
@@ -2717,10 +2761,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
pseudo_gfn = base_addr >> PAGE_SHIFT;
sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
- iterator.level - 1,
- 1, ACC_ALL, iterator.sptep);
+ iterator.level - 1, 1, ACC_ALL);
- link_shadow_page(iterator.sptep, sp, true);
+ link_shadow_page(vcpu, iterator.sptep, sp);
}
}
return emulate;
@@ -2739,7 +2782,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_info(SIGBUS, &info, tsk);
}
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
{
/*
* Do not cache the mmio info caused by writing the readonly gfn
@@ -2759,9 +2802,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
- gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+ gfn_t *gfnp, kvm_pfn_t *pfnp,
+ int *levelp)
{
- pfn_t pfn = *pfnp;
+ kvm_pfn_t pfn = *pfnp;
gfn_t gfn = *gfnp;
int level = *levelp;
@@ -2774,7 +2818,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
PageTransCompound(pfn_to_page(pfn)) &&
- !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+ !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
unsigned long mask;
/*
* mmu_notifier_retry was successful and we hold the
@@ -2800,22 +2844,18 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
}
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
- pfn_t pfn, unsigned access, int *ret_val)
+ kvm_pfn_t pfn, unsigned access, int *ret_val)
{
- bool ret = true;
-
/* The pfn is invalid, report the error! */
if (unlikely(is_error_pfn(pfn))) {
*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
- goto exit;
+ return true;
}
if (unlikely(is_noslot_pfn(pfn)))
vcpu_cache_mmio_info(vcpu, gva, gfn, access);
- ret = false;
-exit:
- return ret;
+ return false;
}
static bool page_fault_can_be_fast(u32 error_code)
@@ -2899,7 +2939,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
* If the mapping has been changed, let the vcpu fault on the
* same address again.
*/
- if (!is_rmap_spte(spte)) {
+ if (!is_shadow_present_pte(spte)) {
ret = true;
goto exit;
}
@@ -2954,7 +2994,7 @@ exit:
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable);
+ gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
@@ -2962,14 +3002,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
{
int r;
int level;
- int force_pt_level;
- pfn_t pfn;
+ bool force_pt_level = false;
+ kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
- force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
- level = mapping_level(vcpu, gfn);
/*
* This path builds a PAE pagetable - so we can map
* 2mb pages at maximum. Therefore check if the level
@@ -2979,8 +3018,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- } else
- level = PT_PAGE_TABLE_LEVEL;
+ }
if (fast_page_fault(vcpu, v, level, error_code))
return 0;
@@ -3000,11 +3038,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
- prefault);
+ r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
-
return r;
out_unlock:
@@ -3079,8 +3115,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
- 1, ACC_ALL, NULL);
+ sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3092,9 +3127,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
- i << 30,
- PT32_ROOT_LEVEL, 1, ACC_ALL,
- NULL);
+ i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3131,7 +3164,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
- 0, ACC_ALL, NULL);
+ 0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3164,9 +3197,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, 0,
- ACC_ALL, NULL);
+ sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
+ 0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3287,7 +3319,7 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
}
-static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
if (direct)
return vcpu_match_mmio_gpa(vcpu, addr);
@@ -3341,16 +3373,16 @@ exit:
return reserved;
}
-int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
- if (quickly_check_mmio_pf(vcpu, addr, direct))
+ if (mmio_info_in_cache(vcpu, addr, direct))
return RET_MMIO_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
- if (unlikely(reserved))
+ if (WARN_ON(reserved))
return RET_MMIO_PF_BUG;
if (is_mmio_spte(spte)) {
@@ -3374,32 +3406,55 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
*/
return RET_MMIO_PF_RETRY;
}
-EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
-static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
- u32 error_code, bool direct)
+static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+ u32 error_code, gfn_t gfn)
{
- int ret;
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return false;
- ret = handle_mmio_page_fault_common(vcpu, addr, direct);
- WARN_ON(ret == RET_MMIO_PF_BUG);
- return ret;
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ !(error_code & PFERR_WRITE_MASK))
+ return false;
+
+ /*
+ * guest is writing the page which is write tracked which can
+ * not be fixed by page fault handler.
+ */
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
+
+ return false;
+}
+
+static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ clear_sp_write_flooding_count(iterator.sptep);
+ if (!is_shadow_present_pte(spte))
+ break;
+ }
+ walk_shadow_page_lockless_end(vcpu);
}
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
{
- gfn_t gfn;
+ gfn_t gfn = gva >> PAGE_SHIFT;
int r;
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
- if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gva, error_code, true);
-
- if (likely(r != RET_MMIO_PF_INVALID))
- return r;
- }
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return 1;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3407,7 +3462,6 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
- gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
error_code, gfn, prefault);
@@ -3427,7 +3481,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
{
- if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+ if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu)))
return false;
@@ -3435,7 +3489,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable)
+ gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
{
struct kvm_memory_slot *slot;
bool async;
@@ -3473,10 +3527,10 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
bool prefault)
{
- pfn_t pfn;
+ kvm_pfn_t pfn;
int r;
int level;
- int force_pt_level;
+ bool force_pt_level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
@@ -3484,31 +3538,22 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
- if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
-
- if (likely(r != RET_MMIO_PF_INVALID))
- return r;
- }
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return 1;
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
- if (mapping_level_dirty_bitmap(vcpu, gfn) ||
- !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL))
- force_pt_level = 1;
- else
- force_pt_level = 0;
-
+ force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
+ PT_DIRECTORY_LEVEL);
+ level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
- level = mapping_level(vcpu, gfn);
if (level > PT_DIRECTORY_LEVEL &&
!check_hugepage_cache_consistency(vcpu, gfn, level))
level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- } else
- level = PT_PAGE_TABLE_LEVEL;
+ }
if (fast_page_fault(vcpu, gpa, level, error_code))
return 0;
@@ -3528,8 +3573,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable,
- level, gfn, pfn, prefault);
+ r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -3588,13 +3632,24 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
return false;
}
-static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
+static inline bool is_last_gpte(struct kvm_mmu *mmu,
+ unsigned level, unsigned gpte)
{
- unsigned index;
+ /*
+ * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
+ * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+ * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
+ /*
+ * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
+ * If it is clear, there are no large pages at this level, so clear
+ * PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
+ gpte &= level - mmu->last_nonleaf_level;
- index = level - 1;
- index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
- return mmu->last_pte_bitmap & (1 << index);
+ return gpte & PT_PAGE_SIZE_MASK;
}
#define PTTYPE_EPT 18 /* arbitrary */
@@ -3706,7 +3761,7 @@ static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
int maxphyaddr, bool execonly)
{
- int pte;
+ u64 bad_mt_xwr;
rsvd_check->rsvd_bits_mask[0][3] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
@@ -3724,14 +3779,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
- for (pte = 0; pte < 64; pte++) {
- int rwx_bits = pte & 7;
- int mt = pte >> 3;
- if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
- rwx_bits == 0x2 || rwx_bits == 0x6 ||
- (rwx_bits == 0x4 && !execonly))
- rsvd_check->bad_mt_xwr |= (1ull << pte);
+ bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
+ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
+ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
+ if (!execonly) {
+ /* bits 0..2 must not be 100 unless VMX capabilities allow it */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
}
+ rsvd_check->bad_mt_xwr = bad_mt_xwr;
}
static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
@@ -3749,13 +3806,15 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
+ bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+
/*
* Passing "true" to the last argument is okay; it adds a check
* on bit 8 of the SPTEs which KVM doesn't use anyway.
*/
__reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
boot_cpu_data.x86_phys_bits,
- context->shadow_root_level, context->nx,
+ context->shadow_root_level, uses_nx,
guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
true);
}
@@ -3864,22 +3923,13 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
}
}
-static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
- u8 map;
- unsigned level, root_level = mmu->root_level;
- const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */
-
- if (root_level == PT32E_ROOT_LEVEL)
- --root_level;
- /* PT_PAGE_TABLE_LEVEL always terminates */
- map = 1 | (1 << ps_set_index);
- for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
- if (level <= PT_PDPE_LEVEL
- && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
- map |= 1 << (ps_set_index | (level - 1));
- }
- mmu->last_pte_bitmap = map;
+ unsigned root_level = mmu->root_level;
+
+ mmu->last_nonleaf_level = root_level;
+ if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
+ mmu->last_nonleaf_level++;
}
static void paging64_init_context_common(struct kvm_vcpu *vcpu,
@@ -3891,7 +3941,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context, false);
- update_last_pte_bitmap(vcpu, context);
+ update_last_nonleaf_level(vcpu, context);
MMU_WARN_ON(!is_pae(vcpu));
context->page_fault = paging64_page_fault;
@@ -3918,7 +3968,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context, false);
- update_last_pte_bitmap(vcpu, context);
+ update_last_nonleaf_level(vcpu, context);
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
@@ -3976,7 +4026,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
}
update_permission_bitmask(vcpu, context, false);
- update_last_pte_bitmap(vcpu, context);
+ update_last_nonleaf_level(vcpu, context);
reset_tdp_shadow_zero_bits_mask(vcpu, context);
}
@@ -4053,10 +4103,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->inject_page_fault = kvm_inject_page_fault;
/*
- * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
- * translation of l2_gpa to l1_gpa addresses is done using the
- * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
- * functions between mmu and nested_mmu are swapped.
+ * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
+ * L1's nested page tables (e.g. EPT12). The nested translation
+ * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
+ * L2's page tables as the first level of translation and L1's
+ * nested page tables as the second level of translation. Basically
+ * the gva_to_gpa functions between mmu and nested_mmu are swapped.
*/
if (!is_paging(vcpu)) {
g_context->nx = false;
@@ -4080,7 +4132,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
}
update_permission_bitmask(vcpu, g_context, false);
- update_last_pte_bitmap(vcpu, g_context);
+ update_last_nonleaf_level(vcpu, g_context);
}
static void init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -4151,18 +4203,6 @@ static bool need_remote_flush(u64 old, u64 new)
return (old & ~new & PT64_PERM_MASK) != 0;
}
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
- bool remote_flush, bool local_flush)
-{
- if (zap_page)
- return;
-
- if (remote_flush)
- kvm_flush_remote_tlbs(vcpu->kvm);
- else if (local_flush)
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-}
-
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
const u8 *new, int *bytes)
{
@@ -4212,7 +4252,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
if (sp->role.level == PT_PAGE_TABLE_LEVEL)
return false;
- return ++sp->write_flooding_count >= 3;
+ atomic_inc(&sp->write_flooding_count);
+ return atomic_read(&sp->write_flooding_count) >= 3;
}
/*
@@ -4274,15 +4315,15 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
return spte;
}
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *sp;
LIST_HEAD(invalid_list);
u64 entry, gentry, *spte;
int npte;
- bool remote_flush, local_flush, zap_page;
+ bool remote_flush, local_flush;
union kvm_mmu_page_role mask = { };
mask.cr0_wp = 1;
@@ -4299,7 +4340,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
return;
- zap_page = remote_flush = local_flush = false;
+ remote_flush = local_flush = false;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
@@ -4319,8 +4360,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (detect_write_misaligned(sp, gpa, bytes) ||
detect_write_flooding(sp)) {
- zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
- &invalid_list);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -4342,8 +4382,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
++spte;
}
}
- mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock);
}
@@ -4380,32 +4419,34 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
-static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
-{
- if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
- return vcpu_match_mmio_gpa(vcpu, addr);
-
- return vcpu_match_mmio_gva(vcpu, addr);
-}
-
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
void *insn, int insn_len)
{
int r, emulation_type = EMULTYPE_RETRY;
enum emulation_result er;
+ bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, cr2, direct);
+ if (r == RET_MMIO_PF_EMULATE) {
+ emulation_type = 0;
+ goto emulate;
+ }
+ if (r == RET_MMIO_PF_RETRY)
+ return 1;
+ if (r < 0)
+ return r;
+ }
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
if (r < 0)
- goto out;
-
- if (!r) {
- r = 1;
- goto out;
- }
+ return r;
+ if (!r)
+ return 1;
- if (is_mmio_page_fault(vcpu, cr2))
+ if (mmio_info_in_cache(vcpu, cr2, direct))
emulation_type = 0;
-
+emulate:
er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
switch (er) {
@@ -4419,8 +4460,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
default:
BUG();
}
-out:
- return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -4489,8 +4528,23 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
init_kvm_mmu(vcpu);
}
+void kvm_mmu_init_vm(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+ node->track_write = kvm_mmu_pte_write;
+ kvm_page_track_register_notifier(kvm, node);
+}
+
+void kvm_mmu_uninit_vm(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+ kvm_page_track_unregister_notifier(kvm, node);
+}
+
/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap);
+typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
/* The caller should hold mmu-lock before calling this function. */
static bool
@@ -4584,9 +4638,10 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
spin_unlock(&kvm->mmu_lock);
}
-static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp)
+static bool slot_rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
{
- return __rmap_write_protect(kvm, rmapp, false);
+ return __rmap_write_protect(kvm, rmap_head, false);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -4622,16 +4677,16 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
}
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
- unsigned long *rmapp)
+ struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
int need_tlb_flush = 0;
- pfn_t pfn;
+ kvm_pfn_t pfn;
struct kvm_mmu_page *sp;
restart:
- for_each_rmap_spte(rmapp, &iter, sptep) {
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
sp = page_header(__pa(sptep));
pfn = spte_to_pfn(*sptep);
OpenPOWER on IntegriCloud