From 68a644d734e61f38b686cb755bd2a3f43d9372f4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:37 +0930 Subject: lguest: check vaddr not pgd for Switcher protection. We currently assume that the Switcher the top pgd; we want to remove this assumption, so check that vaddr is OK, rather then checking pgd index. Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'drivers/lguest/page_tables.c') diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 3b62be16..a2454a2 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -95,13 +95,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); -#ifndef CONFIG_X86_PAE - /* We kill any Guest trying to touch the Switcher addresses. */ - if (index >= SWITCHER_PGD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); - index = 0; - } -#endif /* Return a pointer index'th pgd entry for the i'th page table. */ return &cpu->lg->pgdirs[i].pgdir[index]; } @@ -117,13 +110,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) unsigned int index = pmd_index(vaddr); pmd_t *page; - /* We kill any Guest trying to touch the Switcher addresses. */ - if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && - index >= SWITCHER_PMD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); - index = 0; - } - /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); page = __va(pgd_pfn(spgd) << PAGE_SHIFT); @@ -323,6 +309,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pmd_t gpmd; #endif + /* We never demand page the Switcher, so trying is a mistake. */ + if (vaddr >= switcher_addr) + return false; + /* First step: get the top-level Guest page table entry. */ if (unlikely(cpu->linear_pages)) { /* Faking up a linear mapping. */ @@ -495,10 +485,14 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t *spgd; unsigned long flags; - #ifdef CONFIG_X86_PAE pmd_t *spmd; #endif + + /* You can't put your stack in the Switcher! */ + if (vaddr >= switcher_addr) + return false; + /* Look at the current top level entry: is it present? */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) @@ -897,6 +891,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { + /* We don't let you remap the Switcher; we need it to get back! */ + if (vaddr >= switcher_addr) { + kill_guest(cpu, "attempt to set pte into Switcher pages"); + return; + } + /* * Kernel mappings must be changed on all top levels. Slow, but doesn't * happen often. @@ -995,12 +995,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu) * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ -#ifdef CONFIG_X86_PAE - if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && - pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) -#else - if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) -#endif + if (cpu->lg->kernel_address >= switcher_addr) kill_guest(cpu, "bad kernel address %#lx", cpu->lg->kernel_address); } -- cgit v1.1 From c215a8b9eb17739c01d59faa7db9d1ef162a82a8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:37 +0930 Subject: lguest: remove RESERVE_MEM constant. We can use switcher_addr directly. Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'drivers/lguest/page_tables.c') diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a2454a2..27cbb18 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -63,10 +63,8 @@ */ #ifdef CONFIG_X86_PAE #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) -#define RESERVE_MEM 2U #define CHECK_GPGD_MASK _PAGE_PRESENT #else -#define RESERVE_MEM 4U #define CHECK_GPGD_MASK _PAGE_TABLE #endif @@ -977,15 +975,21 @@ int init_guest_pagetable(struct lguest *lg) /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ void page_table_guest_data_init(struct lg_cpu *cpu) { + /* + * We tell the Guest that it can't use the virtual addresses + * used by the Switcher. This trick is equivalent to 4GB - + * switcher_addr. + */ + u32 top = ~switcher_addr + 1; + /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) + &cpu->lg->lguest_data->kernel_address) /* - * We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. + * We tell the Guest that it can't use the top virtual + * addresses (used by the Switcher). */ - || put_user(RESERVE_MEM * 1024 * 1024, - &cpu->lg->lguest_data->reserve_mem)) { + || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); return; } -- cgit v1.1 From 856c608827928d29f80605e85fc3f8f0ab3af4fb Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:38 +0930 Subject: lguest: rename switcher_page to switcher_pages. There is a single page with the Switcher in it, but it's followed by 2 pages per Host CPU. Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/lguest/page_tables.c') diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 27cbb18..2168558 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -1084,7 +1084,7 @@ static void free_switcher_pte_pages(void) * Currently the Switcher is less than a page long, so "pages" is always 1. */ static __init void populate_switcher_pte_page(unsigned int cpu, - struct page *switcher_page[], + struct page *switcher_pages[], unsigned int pages) { unsigned int i; @@ -1092,7 +1092,7 @@ static __init void populate_switcher_pte_page(unsigned int cpu, /* The first entries are easy: they map the Switcher code. */ for (i = 0; i < pages; i++) { - set_pte(&pte[i], mk_pte(switcher_page[i], + set_pte(&pte[i], mk_pte(switcher_pages[i], __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } @@ -1100,14 +1100,14 @@ static __init void populate_switcher_pte_page(unsigned int cpu, i = pages + cpu*2; /* First page (Guest registers) is writable from the Guest */ - set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), + set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_pages[i]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); /* * The second page contains the "struct lguest_ro_state", and is * read-only. */ - set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), + set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_pages[i+1]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } @@ -1128,7 +1128,7 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * At boot or module load time, init_pagetables() allocates and populates * the Switcher PTE page for each CPU. */ -__init int init_pagetables(struct page **switcher_page, unsigned int pages) +__init int init_pagetables(struct page **switcher_pages, unsigned int pages) { unsigned int i; @@ -1138,7 +1138,7 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages) free_switcher_pte_pages(); return -ENOMEM; } - populate_switcher_pte_page(i, switcher_page, pages); + populate_switcher_pte_page(i, switcher_pages, pages); } return 0; } -- cgit v1.1 From 93a2cdff98243df06bafd3c4f3b31b38f0d0fe3e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:38 +0930 Subject: lguest: assume Switcher text is a single page. ie. SHARED_SWITCHER_PAGES == 1. It is well under a page, and it's a minor simplification: it's nice to have *one* simplification in a patch series! Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'drivers/lguest/page_tables.c') diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 2168558..7584662 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -1079,25 +1079,20 @@ static void free_switcher_pte_pages(void) /*H:520 * Setting up the Switcher PTE page for given CPU is fairly easy, given - * the CPU number and the "struct page"s for the Switcher code itself. - * - * Currently the Switcher is less than a page long, so "pages" is always 1. + * the CPU number and the "struct page"s for the Switcher and per-cpu pages. */ static __init void populate_switcher_pte_page(unsigned int cpu, - struct page *switcher_pages[], - unsigned int pages) + struct page *switcher_pages[]) { - unsigned int i; pte_t *pte = switcher_pte_page(cpu); + int i; - /* The first entries are easy: they map the Switcher code. */ - for (i = 0; i < pages; i++) { - set_pte(&pte[i], mk_pte(switcher_pages[i], + /* The first entries maps the Switcher code. */ + set_pte(&pte[0], mk_pte(switcher_pages[0], __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); - } /* The only other thing we map is this CPU's pair of pages. */ - i = pages + cpu*2; + i = 1 + cpu*2; /* First page (Guest registers) is writable from the Guest */ set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_pages[i]), @@ -1128,7 +1123,7 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * At boot or module load time, init_pagetables() allocates and populates * the Switcher PTE page for each CPU. */ -__init int init_pagetables(struct page **switcher_pages, unsigned int pages) +__init int init_pagetables(struct page **switcher_pages) { unsigned int i; @@ -1138,7 +1133,7 @@ __init int init_pagetables(struct page **switcher_pages, unsigned int pages) free_switcher_pte_pages(); return -ENOMEM; } - populate_switcher_pte_page(i, switcher_pages, pages); + populate_switcher_pte_page(i, switcher_pages); } return 0; } -- cgit v1.1 From e1d12606f756bdb8328a66a2873dca6c46bcb4e5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:39 +0930 Subject: lguest: make check_gpte et. al return bool. This is a bit neater: we can immediately return if a PTE/PGD/PMD entry is invalid (which also kills the guest). It means we don't risk using invalid entries as we reshuffle the code. Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'drivers/lguest/page_tables.c') diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 7584662..f074f34 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -259,26 +259,35 @@ static void release_pte(pte_t pte) } /*:*/ -static void check_gpte(struct lg_cpu *cpu, pte_t gpte) +static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) { if ((pte_flags(gpte) & _PAGE_PSE) || - pte_pfn(gpte) >= cpu->lg->pfn_limit) + pte_pfn(gpte) >= cpu->lg->pfn_limit) { kill_guest(cpu, "bad page table entry"); + return false; + } + return true; } -static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) +static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) { if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || - (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) + (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { kill_guest(cpu, "bad page directory entry"); + return false; + } + return true; } #ifdef CONFIG_X86_PAE -static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) +static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) { if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || - (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) + (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { kill_guest(cpu, "bad page middle directory entry"); + return false; + } + return true; } #endif @@ -336,7 +345,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) return false; } /* We check that the Guest pgd is OK. */ - check_gpgd(cpu, gpgd); + if (!check_gpgd(cpu, gpgd)) + return false; /* * And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. @@ -372,7 +382,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) } /* We check that the Guest pmd is OK. */ - check_gpmd(cpu, gpmd); + if (!check_gpmd(cpu, gpmd)) + return false; /* * And we copy the flags to the shadow PMD entry. The page @@ -421,7 +432,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). */ - check_gpte(cpu, gpte); + if (!check_gpte(cpu, gpte)) + return false; /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ gpte = pte_mkyoung(gpte); @@ -857,7 +869,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * micro-benchmark. */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(cpu, gpte); + if (!check_gpte(cpu, gpte)) + return; set_pte(spte, gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); -- cgit v1.1 From 17427e08faae3e63271a9c2d0edb6a22e5fbb54b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:39 +0930 Subject: lguest: extract shadow PTE walking / allocating. We want a separate find_pte() function so we can call it for populating the switcher PTE entries. We can also use it in page_writable(). Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 170 +++++++++++++++++++++++++------------------ 1 file changed, 101 insertions(+), 69 deletions(-) (limited to 'drivers/lguest/page_tables.c') diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index f074f34..009c717 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -291,6 +291,88 @@ static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) } #endif +/*H:331 + * This is the core routine to walk the shadow page tables and find the page + * table entry for a specific address. + * + * If allocate is set, then we allocate any missing levels, setting the flags + * on the new page directory and mid-level directories using the arguments + * (which are copied from the Guest's page table entries). + */ +static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, + int pgd_flags, int pmd_flags) +{ + pgd_t *spgd; + /* Mid level for PAE. */ +#ifdef CONFIG_X86_PAE + pmd_t *spmd; +#endif + + /* Get top level entry. */ + spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); + if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { + /* No shadow entry: allocate a new shadow PTE page. */ + unsigned long ptepage; + + /* If they didn't want us to allocate anything, stop. */ + if (!allocate) + return NULL; + + ptepage = get_zeroed_page(GFP_KERNEL); + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ + if (!ptepage) { + kill_guest(cpu, "out of memory allocating pte page"); + return NULL; + } + /* + * And we copy the flags to the shadow PGD entry. The page + * number in the shadow PGD is the page we just allocated. + */ + set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); + } + + /* + * Intel's Physical Address Extension actually uses three levels of + * page tables, so we need to look in the mid-level. + */ +#ifdef CONFIG_X86_PAE + /* Now look at the mid-level shadow entry. */ + spmd = spmd_addr(cpu, *spgd, vaddr); + + if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { + /* No shadow entry: allocate a new shadow PTE page. */ + unsigned long ptepage; + + /* If they didn't want us to allocate anything, stop. */ + if (!allocate) + return NULL; + + ptepage = get_zeroed_page(GFP_KERNEL); + + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ + if (!ptepage) { + kill_guest(cpu, "out of memory allocating pmd page"); + return NULL; + } + + /* + * And we copy the flags to the shadow PMD entry. The page + * number in the shadow PMD is the page we just allocated. + */ + set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); + } +#endif + + /* Get the pointer to the shadow PTE entry we're going to set. */ + return spte_addr(cpu, *spgd, vaddr); +} + /*H:330 * (i) Looking up a page table entry when the Guest faults. * @@ -304,17 +386,11 @@ static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) */ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) { - pgd_t gpgd; - pgd_t *spgd; unsigned long gpte_ptr; pte_t gpte; pte_t *spte; - - /* Mid level for PAE. */ -#ifdef CONFIG_X86_PAE - pmd_t *spmd; pmd_t gpmd; -#endif + pgd_t gpgd; /* We never demand page the Switcher, so trying is a mistake. */ if (vaddr >= switcher_addr) @@ -329,67 +405,31 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* Toplevel not present? We can't map it in. */ if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) return false; - } - /* Now look at the matching shadow entry. */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); - if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { - /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* - * This is not really the Guest's fault, but killing it is - * simple for this corner case. + /* + * This kills the Guest if it has weird flags or tries to + * refer to a "physical" address outside the bounds. */ - if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); - return false; - } - /* We check that the Guest pgd is OK. */ if (!check_gpgd(cpu, gpgd)) return false; - /* - * And we copy the flags to the shadow PGD entry. The page - * number in the shadow PGD is the page we just allocated. - */ - set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } + /* This "mid-level" entry is only used for non-linear, PAE mode. */ + gpmd = __pmd(_PAGE_TABLE); + #ifdef CONFIG_X86_PAE - if (unlikely(cpu->linear_pages)) { - /* Faking up a linear mapping. */ - gpmd = __pmd(_PAGE_TABLE); - } else { + if (likely(!cpu->linear_pages)) { gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); /* Middle level not present? We can't map it in. */ if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) return false; - } - - /* Now look at the matching shadow entry. */ - spmd = spmd_addr(cpu, *spgd, vaddr); - - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { - /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* - * This is not really the Guest's fault, but killing it is - * simple for this corner case. + /* + * This kills the Guest if it has weird flags or tries to + * refer to a "physical" address outside the bounds. */ - if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); - return false; - } - - /* We check that the Guest pmd is OK. */ if (!check_gpmd(cpu, gpmd)) return false; - - /* - * And we copy the flags to the shadow PMD entry. The page - * number in the shadow PMD is the page we just allocated. - */ - set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); } /* @@ -441,7 +481,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(cpu, *spgd, vaddr); + spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); + if (!spte) + return false; /* * If there was a valid shadow PTE entry here before, we release it. @@ -493,33 +535,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) */ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { - pgd_t *spgd; + pte_t *spte; unsigned long flags; -#ifdef CONFIG_X86_PAE - pmd_t *spmd; -#endif /* You can't put your stack in the Switcher! */ if (vaddr >= switcher_addr) return false; - /* Look at the current top level entry: is it present? */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); - if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) + /* If there's no shadow PTE, it's not writable. */ + spte = find_spte(cpu, vaddr, false, 0, 0); + if (!spte) return false; -#ifdef CONFIG_X86_PAE - spmd = spmd_addr(cpu, *spgd, vaddr); - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) - return false; -#endif - /* * Check the flags on the pte entry itself: it must be present and * writable. */ - flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); - + flags = pte_flags(*spte); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } -- cgit v1.1 From 3412b6ae2924e068f9932f841bdea0f2d8424502 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:40 +0930 Subject: lguest: don't share Switcher PTE pages between guests. We currently use the whole top PGD entry for the switcher, so we simply share a fixed page of PTEs between all guests (actually, it's one per Host CPU, to ensure isolation between guests). Changes to a scheme where every guest has its own mappings. Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 260 ++++++++++++++++++------------------------- 1 file changed, 106 insertions(+), 154 deletions(-) (limited to 'drivers/lguest/page_tables.c') diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 009c717..1f48f27 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -62,20 +62,11 @@ * will need the last pmd entry of the last pmd page. */ #ifdef CONFIG_X86_PAE -#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) #define CHECK_GPGD_MASK _PAGE_PRESENT #else #define CHECK_GPGD_MASK _PAGE_TABLE #endif -/* - * We actually need a separate PTE page for each CPU. Remember that after the - * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. - */ -static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); -#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) - /*H:320 * The page table code is curly enough to need helper functions to keep it * clear and clean. The kernel itself provides many of them; one advantage @@ -714,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, int *blank_pgdir) { unsigned int next; -#ifdef CONFIG_X86_PAE - pmd_t *pmd_table; -#endif /* * We pick one entry at random to throw out. Choosing the Least @@ -731,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, if (!cpu->lg->pgdirs[next].pgdir) next = cpu->cpu_pgd; else { -#ifdef CONFIG_X86_PAE /* - * In PAE mode, allocate a pmd page and populate the - * last pgd entry. + * This is a blank page, so there are no kernel + * mappings: caller must map the stack! */ - pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); - if (!pmd_table) { - free_page((long)cpu->lg->pgdirs[next].pgdir); - set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); - next = cpu->cpu_pgd; - } else { - set_pgd(cpu->lg->pgdirs[next].pgdir + - SWITCHER_PGD_INDEX, - __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* - * This is a blank page, so there are no kernel - * mappings: caller must map the stack! - */ - *blank_pgdir = 1; - } -#else *blank_pgdir = 1; -#endif } } /* Record which Guest toplevel this shadows. */ @@ -764,6 +734,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, return next; } +/*H:501 + * We do need the Switcher code mapped at all times, so we allocate that + * part of the Guest page table here, and populate it when we're about to run + * the guest. + */ +static bool allocate_switcher_mapping(struct lg_cpu *cpu) +{ + int i; + + for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { + if (!find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, + CHECK_GPGD_MASK, _PAGE_TABLE)) + return false; + } + return true; +} + /*H:470 * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used @@ -774,28 +761,14 @@ static void release_all_pagetables(struct lguest *lg) unsigned int i, j; /* Every shadow pagetable this Guest has */ - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir) { -#ifdef CONFIG_X86_PAE - pgd_t *spgd; - pmd_t *pmdpage; - unsigned int k; + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { + if (!lg->pgdirs[i].pgdir) + continue; - /* Get the last pmd page. */ - spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; - pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - - /* - * And release the pmd entries of that pmd page, - * except for the switcher pmd. - */ - for (k = 0; k < SWITCHER_PMD_INDEX; k++) - release_pmd(&pmdpage[k]); -#endif - /* Every PGD entry except the Switcher at the top */ - for (j = 0; j < SWITCHER_PGD_INDEX; j++) - release_pgd(lg->pgdirs[i].pgdir + j); - } + /* Every PGD entry. */ + for (j = 0; j < PTRS_PER_PGD; j++) + release_pgd(lg->pgdirs[i].pgdir + j); + } } /* @@ -809,6 +782,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) release_all_pagetables(cpu->lg); /* We need the Guest kernel stack mapped again. */ pin_stack_pages(cpu); + /* And we need Switcher allocated. */ + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); } /*H:430 @@ -844,9 +820,15 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one. */ cpu->cpu_pgd = newpgdir; - /* If it was completely blank, we map in the Guest kernel stack */ + /* + * If it was completely blank, we map in the Guest kernel stack and + * the Switcher. + */ if (repin) pin_stack_pages(cpu); + + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); } /*:*/ @@ -976,14 +958,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; - if (idx >= SWITCHER_PGD_INDEX) + if (idx > PTRS_PER_PGD) { + kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", + idx, PTRS_PER_PGD); return; + } /* If they're talking about a page table we have a shadow for... */ pgdir = find_pgdir(lg, gpgdir); - if (pgdir < ARRAY_SIZE(lg->pgdirs)) + if (pgdir < ARRAY_SIZE(lg->pgdirs)) { /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); + /* That might have been the Switcher mapping, remap it. */ + if (!allocate_switcher_mapping(&lg->cpus[0])) { + kill_guest(&lg->cpus[0], + "Cannot populate switcher mapping"); + } + } } #ifdef CONFIG_X86_PAE @@ -1001,6 +992,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) * we will populate on future faults. The Guest doesn't have any actual * pagetables yet, so we set linear_pages to tell demand_page() to fake it * for the moment. + * + * We do need the Switcher to be mapped at all times, so we allocate that + * part of the Guest page table here. */ int init_guest_pagetable(struct lguest *lg) { @@ -1014,6 +1008,13 @@ int init_guest_pagetable(struct lguest *lg) /* We start with a linear mapping until the initialize. */ cpu->linear_pages = true; + + /* Allocate the page tables for the Switcher. */ + if (!allocate_switcher_mapping(cpu)) { + release_all_pagetables(lg); + return -ENOMEM; + } + return 0; } @@ -1065,91 +1066,68 @@ void free_guest_pagetable(struct lguest *lg) * (vi) Mapping the Switcher when the Guest is about to run. * * The Switcher and the two pages for this CPU need to be visible in the - * Guest (and not the pages for other CPUs). We have the appropriate PTE pages - * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. + * Guest (and not the pages for other CPUs). + * + * The pages have all been allocate */ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { - pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); - pte_t regs_pte; + unsigned long base, i; + struct page *percpu_switcher_page, *regs_page; + pte_t *pte; -#ifdef CONFIG_X86_PAE - pmd_t switcher_pmd; - pmd_t *pmd_table; - - switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, - PAGE_KERNEL_EXEC); - - /* Figure out where the pmd page is, by reading the PGD, and converting - * it to a virtual address. */ - pmd_table = __va(pgd_pfn(cpu->lg-> - pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) - << PAGE_SHIFT); - /* Now write it into the shadow page table. */ - set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); -#else - pgd_t switcher_pgd; + /* Code page should always be mapped, and executable. */ + pte = find_spte(cpu, switcher_addr, false, 0, 0); + get_page(lg_switcher_pages[0]); + set_pte(pte, mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); - /* - * Make the last PGD entry for this Guest point to the Switcher's PTE - * page for this CPU (with appropriate flags). - */ - switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); + /* Clear all the Switcher mappings for any other CPUs. */ + /* FIXME: This is dumb: update only when Host CPU changes. */ + for_each_possible_cpu(i) { + /* Get location of lguest_pages (indexed by Host CPU) */ + base = switcher_addr + PAGE_SIZE + + i * sizeof(struct lguest_pages); - cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; + /* Get shadow PTE for first page (where we put guest regs). */ + pte = find_spte(cpu, base, false, 0, 0); + set_pte(pte, __pte(0)); + + /* This is where we put R/O state. */ + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); + set_pte(pte, __pte(0)); + } -#endif /* - * We also change the Switcher PTE page. When we're running the Guest, - * we want the Guest's "regs" page to appear where the first Switcher - * page for this CPU is. This is an optimization: when the Switcher - * saves the Guest registers, it saves them into the first page of this - * CPU's "struct lguest_pages": if we make sure the Guest's register - * page is already mapped there, we don't have to copy them out - * again. + * When we're running the Guest, we want the Guest's "regs" page to + * appear where the first Switcher page for this CPU is. This is an + * optimization: when the Switcher saves the Guest registers, it saves + * them into the first page of this CPU's "struct lguest_pages": if we + * make sure the Guest's register page is already mapped there, we + * don't have to copy them out again. */ - regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL); - set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte); -} -/*:*/ - -static void free_switcher_pte_pages(void) -{ - unsigned int i; - - for_each_possible_cpu(i) - free_page((long)switcher_pte_page(i)); -} - -/*H:520 - * Setting up the Switcher PTE page for given CPU is fairly easy, given - * the CPU number and the "struct page"s for the Switcher and per-cpu pages. - */ -static __init void populate_switcher_pte_page(unsigned int cpu, - struct page *switcher_pages[]) -{ - pte_t *pte = switcher_pte_page(cpu); - int i; - - /* The first entries maps the Switcher code. */ - set_pte(&pte[0], mk_pte(switcher_pages[0], - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); - - /* The only other thing we map is this CPU's pair of pages. */ - i = 1 + cpu*2; - - /* First page (Guest registers) is writable from the Guest */ - set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_pages[i]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); + /* Find the shadow PTE for this regs page. */ + base = switcher_addr + PAGE_SIZE + + raw_smp_processor_id() * sizeof(struct lguest_pages); + pte = find_spte(cpu, base, false, 0, 0); + regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); + get_page(regs_page); + set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL))); /* - * The second page contains the "struct lguest_ro_state", and is - * read-only. + * We map the second page of the struct lguest_pages read-only in + * the Guest: the IDT, GDT and other things it's not supposed to + * change. */ - set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_pages[i+1]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); + base += PAGE_SIZE; + pte = find_spte(cpu, base, false, 0, 0); + + percpu_switcher_page + = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; + get_page(percpu_switcher_page); + set_pte(pte, mk_pte(percpu_switcher_page, + __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); } +/*:*/ /* * We've made it through the page table code. Perhaps our tired brains are @@ -1163,29 +1141,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * * There is just one file remaining in the Host. */ - -/*H:510 - * At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. - */ -__init int init_pagetables(struct page **switcher_pages) -{ - unsigned int i; - - for_each_possible_cpu(i) { - switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); - if (!switcher_pte_page(i)) { - free_switcher_pte_pages(); - return -ENOMEM; - } - populate_switcher_pte_page(i, switcher_pages); - } - return 0; -} -/*:*/ - -/* Cleaning up simply involves freeing the PTE page for each CPU. */ -void free_pagetables(void) -{ - free_switcher_pte_pages(); -} -- cgit v1.1 From 86935fc4ee4d95efe01b6c91cd5143fa4c38c02b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:41 +0930 Subject: lguest: map Switcher text whenever we allocate a new pagetable. It's always to same, so no need to put in the PTE every time we're about to run. Keep a flag to track whether the pagetable has the Switcher entries allocated, and when allocating always initialize the Switcher text PTE. Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) (limited to 'drivers/lguest/page_tables.c') diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 1f48f27..d1a5de4 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -736,18 +736,39 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /*H:501 * We do need the Switcher code mapped at all times, so we allocate that - * part of the Guest page table here, and populate it when we're about to run - * the guest. + * part of the Guest page table here. We map the Switcher code immediately, + * but defer mapping of the guest register page and IDT/LDT etc page until + * just before we run the guest in map_switcher_in_guest(). + * + * We *could* do this setup in map_switcher_in_guest(), but at that point + * we've interrupts disabled, and allocating pages like that is fraught: we + * can't sleep if we need to free up some memory. */ static bool allocate_switcher_mapping(struct lg_cpu *cpu) { int i; for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - if (!find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, - CHECK_GPGD_MASK, _PAGE_TABLE)) + pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, + CHECK_GPGD_MASK, _PAGE_TABLE); + if (!pte) return false; + + /* + * Map the switcher page if not already there. It might + * already be there because we call allocate_switcher_mapping() + * in guest_set_pgd() just in case it did discard our Switcher + * mapping, but it probably didn't. + */ + if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { + /* Get a reference to the Switcher page. */ + get_page(lg_switcher_pages[0]); + /* Create a read-only, exectuable, kernel-style PTE */ + set_pte(pte, + mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); + } } + cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; return true; } @@ -768,6 +789,7 @@ static void release_all_pagetables(struct lguest *lg) /* Every PGD entry. */ for (j = 0; j < PTRS_PER_PGD; j++) release_pgd(lg->pgdirs[i].pgdir + j); + lg->pgdirs[i].switcher_mapped = false; } } @@ -827,8 +849,10 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) if (repin) pin_stack_pages(cpu); - if (!allocate_switcher_mapping(cpu)) - kill_guest(cpu, "Cannot populate switcher mapping"); + if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); + } } /*:*/ @@ -1076,10 +1100,8 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) struct page *percpu_switcher_page, *regs_page; pte_t *pte; - /* Code page should always be mapped, and executable. */ - pte = find_spte(cpu, switcher_addr, false, 0, 0); - get_page(lg_switcher_pages[0]); - set_pte(pte, mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); + /* Switcher page should always be mapped! */ + BUG_ON(!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped); /* Clear all the Switcher mappings for any other CPUs. */ /* FIXME: This is dumb: update only when Host CPU changes. */ -- cgit v1.1 From 6d0cda93c0d3c8bb0a553047c10f114c88c8af89 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:41 +0930 Subject: lguest: cache last cpu we ran on. This optimizes the frobbing of our Switcher map. Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 78 ++++++++++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 25 deletions(-) (limited to 'drivers/lguest/page_tables.c') diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index d1a5de4..19611b0 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -7,7 +7,7 @@ * converted Guest pages when running the Guest. :*/ -/* Copyright (C) Rusty Russell IBM Corporation 2006. +/* Copyright (C) Rusty Russell IBM Corporation 2013. * GPL v2 and any later version */ #include #include @@ -731,6 +731,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /* Release all the non-kernel mappings. */ flush_user_mappings(cpu->lg, next); + /* This hasn't run on any CPU at all. */ + cpu->lg->pgdirs[next].last_host_cpu = -1; + return next; } @@ -790,6 +793,7 @@ static void release_all_pagetables(struct lguest *lg) for (j = 0; j < PTRS_PER_PGD; j++) release_pgd(lg->pgdirs[i].pgdir + j); lg->pgdirs[i].switcher_mapped = false; + lg->pgdirs[i].last_host_cpu = -1; } } @@ -1086,37 +1090,62 @@ void free_guest_pagetable(struct lguest *lg) free_page((long)lg->pgdirs[i].pgdir); } +/*H:481 + * This clears the Switcher mappings for cpu #i. + */ +static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) +{ + unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; + pte_t *pte; + + /* Clear the mappings for both pages. */ + pte = find_spte(cpu, base, false, 0, 0); + release_pte(*pte); + set_pte(pte, __pte(0)); + + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); + release_pte(*pte); + set_pte(pte, __pte(0)); +} + /*H:480 * (vi) Mapping the Switcher when the Guest is about to run. * - * The Switcher and the two pages for this CPU need to be visible in the - * Guest (and not the pages for other CPUs). + * The Switcher and the two pages for this CPU need to be visible in the Guest + * (and not the pages for other CPUs). * - * The pages have all been allocate + * The pages for the pagetables have all been allocated before: we just need + * to make sure the actual PTEs are up-to-date for the CPU we're about to run + * on. */ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { - unsigned long base, i; + unsigned long base; struct page *percpu_switcher_page, *regs_page; pte_t *pte; + struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; - /* Switcher page should always be mapped! */ - BUG_ON(!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped); - - /* Clear all the Switcher mappings for any other CPUs. */ - /* FIXME: This is dumb: update only when Host CPU changes. */ - for_each_possible_cpu(i) { - /* Get location of lguest_pages (indexed by Host CPU) */ - base = switcher_addr + PAGE_SIZE - + i * sizeof(struct lguest_pages); + /* Switcher page should always be mapped by now! */ + BUG_ON(!pgdir->switcher_mapped); - /* Get shadow PTE for first page (where we put guest regs). */ - pte = find_spte(cpu, base, false, 0, 0); - set_pte(pte, __pte(0)); + /* + * Remember that we have two pages for each Host CPU, so we can run a + * Guest on each CPU without them interfering. We need to make sure + * those pages are mapped correctly in the Guest, but since we usually + * run on the same CPU, we cache that, and only update the mappings + * when we move. + */ + if (pgdir->last_host_cpu == raw_smp_processor_id()) + return; - /* This is where we put R/O state. */ - pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); - set_pte(pte, __pte(0)); + /* -1 means unknown so we remove everything. */ + if (pgdir->last_host_cpu == -1) { + unsigned int i; + for_each_possible_cpu(i) + remove_switcher_percpu_map(cpu, i); + } else { + /* We know exactly what CPU mapping to remove. */ + remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); } /* @@ -1140,18 +1169,17 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) * the Guest: the IDT, GDT and other things it's not supposed to * change. */ - base += PAGE_SIZE; - pte = find_spte(cpu, base, false, 0, 0); - + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); percpu_switcher_page = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; get_page(percpu_switcher_page); set_pte(pte, mk_pte(percpu_switcher_page, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); + + pgdir->last_host_cpu = raw_smp_processor_id(); } -/*:*/ -/* +/*H:490 * We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. * -- cgit v1.1