-rw-r--r--  Documentation/x86/x86_64/5level-paging.txt | 9
-rw-r--r--  arch/x86/Kconfig | 13
-rw-r--r--  arch/x86/boot/compressed/kaslr.c | 14
-rw-r--r--  arch/x86/boot/compressed/misc.c | 16
-rw-r--r--  arch/x86/boot/compressed/misc.h | 5
-rw-r--r--  arch/x86/entry/entry_64.S | 5
-rw-r--r--  arch/x86/include/asm/kaslr.h | 4
-rw-r--r--  arch/x86/include/asm/mem_encrypt.h | 1
-rw-r--r--  arch/x86/include/asm/page_64.h | 4
-rw-r--r--  arch/x86/include/asm/page_64_types.h | 20
-rw-r--r--  arch/x86/include/asm/paravirt.h | 21
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable-3level_types.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable.h | 11
-rw-r--r--  arch/x86/include/asm/pgtable_32.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable_32_types.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 23
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 70
-rw-r--r--  arch/x86/include/asm/required-features.h | 8
-rw-r--r--  arch/x86/include/asm/sparsemem.h | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 18
-rw-r--r--  arch/x86/kernel/head64.c | 81
-rw-r--r--  arch/x86/kernel/head_64.S | 22
-rw-r--r--  arch/x86/kernel/setup.c | 5
-rw-r--r--  arch/x86/mm/Makefile | 14
-rw-r--r--  arch/x86/mm/debug_pagetables.c | 32
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 33
-rw-r--r--  arch/x86/mm/fault.c | 4
-rw-r--r--  arch/x86/mm/ident_map.c | 2
-rw-r--r--  arch/x86/mm/init_64.c | 32
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 20
-rw-r--r--  arch/x86/mm/kaslr.c | 29
-rw-r--r--  arch/x86/mm/mem_encrypt.c | 578
-rw-r--r--  arch/x86/mm/mem_encrypt_identity.c | 564
-rw-r--r--  arch/x86/mm/tlb.c | 4
-rw-r--r--  arch/x86/platform/efi/efi_64.c | 10
-rw-r--r--  arch/x86/power/hibernate_64.c | 6
-rw-r--r--  arch/x86/xen/Kconfig | 5
-rw-r--r--  arch/x86/xen/mmu_pv.c | 21
-rw-r--r--  include/asm-generic/5level-fixup.h | 1
-rw-r--r--  include/asm-generic/pgtable-nop4d.h | 9
-rw-r--r--  include/linux/kasan.h | 2
-rw-r--r--  mm/kasan/kasan_init.c | 2
-rw-r--r--  mm/zsmalloc.c | 13
44 files changed, 952 insertions, 798 deletions
diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt
index 087251a..2432a5e 100644
--- a/Documentation/x86/x86_64/5level-paging.txt
+++ b/Documentation/x86/x86_64/5level-paging.txt
@@ -20,12 +20,9 @@ Documentation/x86/x86_64/mm.txt
CONFIG_X86_5LEVEL=y enables the feature.
-So far, a kernel compiled with the option enabled will be able to boot
-only on machines that supports the feature -- see for 'la57' flag in
-/proc/cpuinfo.
-
-The plan is to implement boot-time switching between 4- and 5-level paging
-in the future.
+A kernel with CONFIG_X86_5LEVEL=y is still able to boot on 4-level hardware.
+In this case, the additional page table level -- p4d -- will be folded at
+runtime.
== User-space and large virtual address space ==
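
To make the folded-vs-unfolded geometry concrete, here is a minimal standalone C sketch. It is a userspace model, not part of the patch; the constants mirror PGDIR_SHIFT, PTRS_PER_P4D and the virtual-address width (__VIRTUAL_MASK_SHIFT + 1) as they are set elsewhere in this series.

#include <stdio.h>

/* Userspace model: geometry for 4-level (folded p4d) vs 5-level paging. */
static void show(int la57)
{
	unsigned int pgdir_shift = la57 ? 48 : 39;   /* top-level index shift */
	unsigned int ptrs_per_p4d = la57 ? 512 : 1;  /* p4d folded when 1 */
	unsigned int va_bits = la57 ? 57 : 48;       /* implemented virtual bits */

	printf("la57=%d: PGDIR_SHIFT=%u PTRS_PER_P4D=%u VA bits=%u (user max %#llx)\n",
	       la57, pgdir_shift, ptrs_per_p4d, va_bits,
	       (1ULL << (va_bits - 1)) - 1);
}

int main(void)
{
	show(0);	/* 4-level hardware: p4d folded at runtime */
	show(1);	/* 5-level hardware: CR4.LA57 set */
	return 0;
}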
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c1236b1..552b3d0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1471,6 +1471,8 @@ config X86_PAE
config X86_5LEVEL
bool "Enable 5-level page tables support"
+ select DYNAMIC_MEMORY_LAYOUT
+ select SPARSEMEM_VMEMMAP
depends on X86_64
---help---
5-level paging enables access to larger address space:
@@ -1479,8 +1481,8 @@ config X86_5LEVEL
It will be supported by future Intel CPUs.
- Note: a kernel with this option enabled can only be booted
- on machines that support the feature.
+ A kernel with the option enabled can be booted on machines that
+ support 4- or 5-level paging.
See Documentation/x86/x86_64/5level-paging.txt for more
information.
@@ -2184,10 +2186,17 @@ config PHYSICAL_ALIGN
Don't change this unless you know what you are doing.
+config DYNAMIC_MEMORY_LAYOUT
+ bool
+ ---help---
+ This option makes the base addresses of vmalloc and vmemmap, as well as
+ __PAGE_OFFSET, movable during boot.
+
config RANDOMIZE_MEMORY
bool "Randomize the kernel memory sections"
depends on X86_64
depends on RANDOMIZE_BASE
+ select DYNAMIC_MEMORY_LAYOUT
default RANDOMIZE_BASE
---help---
Randomizes the base virtual address of kernel memory sections
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 8199a61..66e42a0 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -46,6 +46,12 @@
#define STATIC
#include <linux/decompress/mm.h>
+#ifdef CONFIG_X86_5LEVEL
+unsigned int pgtable_l5_enabled __ro_after_init;
+unsigned int pgdir_shift __ro_after_init = 39;
+unsigned int ptrs_per_p4d __ro_after_init = 1;
+#endif
+
extern unsigned long get_cmd_line_ptr(void);
/* Simplified build-specific string for starting entropy. */
@@ -723,6 +729,14 @@ void choose_random_location(unsigned long input,
return;
}
+#ifdef CONFIG_X86_5LEVEL
+ if (__read_cr4() & X86_CR4_LA57) {
+ pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
+ }
+#endif
+
boot_params->hdr.loadflags |= KASLR_FLAG;
/* Prepare to add new identity pagetables on demand. */
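
The decompressor above derives the paging mode from CR4.LA57, while the early kernel code later in this series checks the CPUID feature bit. The same CPUID check can be modelled from userspace; this sketch only reports what the CPU supports, not whether the running kernel enabled 5-level paging, and it assumes GCC/clang's <cpuid.h>.

#include <stdio.h>
#include <cpuid.h>

/* CPUID.(EAX=7,ECX=0):ECX bit 16 is the LA57 (5-level paging) feature flag,
 * i.e. X86_FEATURE_LA57 & 31 == 16 in the kernel's checks. */
int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 7 not supported");
		return 0;
	}
	printf("la57 %ssupported by this CPU\n", (ecx & (1u << 16)) ? "" : "not ");
	return 0;
}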
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 98761a1..b50c424 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -169,16 +169,6 @@ void __puthex(unsigned long value)
}
}
-static bool l5_supported(void)
-{
- /* Check if leaf 7 is supported. */
- if (native_cpuid_eax(0) < 7)
- return 0;
-
- /* Check if la57 is supported. */
- return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31));
-}
-
#if CONFIG_X86_NEED_RELOCS
static void handle_relocations(void *output, unsigned long output_len,
unsigned long virt_addr)
@@ -372,12 +362,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
console_init();
debug_putstr("early console in extract_kernel\n");
- if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) {
- error("This linux kernel as configured requires 5-level paging\n"
- "This CPU does not support the required 'cr4.la57' feature\n"
- "Unable to boot - please use a kernel appropriate for your CPU\n");
- }
-
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 9d323dc..4d369c3 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -12,6 +12,11 @@
#undef CONFIG_PARAVIRT_SPINLOCKS
#undef CONFIG_KASAN
+#ifdef CONFIG_X86_5LEVEL
+/* cpu_feature_enabled() cannot be used that early */
+#define pgtable_l5_enabled __pgtable_l5_enabled
+#endif
+
#include <linux/linkage.h>
#include <linux/screen_info.h>
#include <linux/elf.h>
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 8971bd6..c9e55b8 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -260,8 +260,13 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
* Change top bits to match most significant bit (47th or 56th bit
* depending on paging mode) in the address.
*/
+#ifdef CONFIG_X86_5LEVEL
+ ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
+ "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
+#else
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+#endif
/* If this changed %rcx, it was not canonical */
cmpq %rcx, %r11
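
The shl/sar pair sign-extends the return address from the topmost implemented virtual bit (47 for 4-level, 56 for 5-level) and the cmpq then rejects anything non-canonical. An equivalent standalone C formulation of that test, with illustrative inputs:

#include <stdio.h>
#include <stdint.h>

/* Canonical iff every bit above va_shift replicates bit va_shift; this is
 * what the shl/sar sign-extension plus compare checks. */
static int is_canonical(uint64_t addr, unsigned int va_shift /* 47 or 56 */)
{
	uint64_t high = addr >> va_shift;	/* bits va_shift..63 */

	return (addr & (1ULL << va_shift)) ? high == (UINT64_MAX >> va_shift)
					   : high == 0;
}

int main(void)
{
	printf("%d\n", is_canonical(0x00007fffffffffffULL, 47));	/* 1 */
	printf("%d\n", is_canonical(0x0000800000000000ULL, 47));	/* 0 */
	printf("%d\n", is_canonical(0x0000800000000000ULL, 56));	/* 1 */
	return 0;
}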
diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
index 460991e..db7ba2f 100644
--- a/arch/x86/include/asm/kaslr.h
+++ b/arch/x86/include/asm/kaslr.h
@@ -5,10 +5,6 @@
unsigned long kaslr_get_random_long(const char *purpose);
#ifdef CONFIG_RANDOMIZE_MEMORY
-extern unsigned long page_offset_base;
-extern unsigned long vmalloc_base;
-extern unsigned long vmemmap_base;
-
void kernel_randomize_memory(void);
#else
static inline void kernel_randomize_memory(void) { }
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 22c5f3e..8fe61ad 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -22,6 +22,7 @@
#ifdef CONFIG_AMD_MEM_ENCRYPT
extern u64 sme_me_mask;
+extern bool sev_enabled;
void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
unsigned long decrypted_kernel_vaddr,
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index d652a38..e7f7a4f 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -11,6 +11,10 @@
extern unsigned long max_pfn;
extern unsigned long phys_base;
+extern unsigned long page_offset_base;
+extern unsigned long vmalloc_base;
+extern unsigned long vmemmap_base;
+
static inline unsigned long __phys_addr_nodebug(unsigned long x)
{
unsigned long y = x - __START_KERNEL_map;
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index e140731..2c5a966 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -37,26 +37,24 @@
* hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
* what Xen requires.
*/
-#ifdef CONFIG_X86_5LEVEL
-#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL)
-#else
-#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL)
-#endif
+#define __PAGE_OFFSET_BASE_L5 _AC(0xff10000000000000, UL)
+#define __PAGE_OFFSET_BASE_L4 _AC(0xffff880000000000, UL)
-#ifdef CONFIG_RANDOMIZE_MEMORY
+#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
#define __PAGE_OFFSET page_offset_base
#else
-#define __PAGE_OFFSET __PAGE_OFFSET_BASE
-#endif /* CONFIG_RANDOMIZE_MEMORY */
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE_L4
+#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#ifdef CONFIG_X86_5LEVEL
+
#define __PHYSICAL_MASK_SHIFT 52
-#define __VIRTUAL_MASK_SHIFT 56
+
+#ifdef CONFIG_X86_5LEVEL
+#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled ? 56 : 47)
#else
-#define __PHYSICAL_MASK_SHIFT 46
#define __VIRTUAL_MASK_SHIFT 47
#endif
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 554841f..6d3b921 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -567,17 +567,22 @@ static inline p4dval_t p4d_val(p4d_t p4d)
return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
}
-static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- pgdval_t val = native_pgd_val(pgd);
-
- PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val);
+ PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd));
}
-static inline void pgd_clear(pgd_t *pgdp)
-{
- set_pgd(pgdp, __pgd(0));
-}
+#define set_pgd(pgdp, pgdval) do { \
+ if (pgtable_l5_enabled) \
+ __set_pgd(pgdp, pgdval); \
+ else \
+ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \
+} while (0)
+
+#define pgd_clear(pgdp) do { \
+ if (pgtable_l5_enabled) \
+ set_pgd(pgdp, __pgd(0)); \
+} while (0)
#endif /* CONFIG_PGTABLE_LEVELS == 5 */
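
The set_pgd()/pgd_clear() macros above only go through the pv_mmu_ops pgd hook when a real fifth level exists; with the p4d folded, the "pgd" slot is the top-level entry and writes are redirected to set_p4d(). A simplified userspace model of that dispatch (the types and values are illustrative, not the kernel's):

#include <stdio.h>

/* Model of p4d folding: with 4-level paging the "pgd" and the "p4d" are the
 * same slot, so pgd writes are routed through the p4d helper instead of the
 * (5-level only) pgd paravirt hook. */
typedef struct { unsigned long val; } pgd_t;
typedef struct { unsigned long val; } p4d_t;

static int pgtable_l5_enabled;	/* decided once at boot in the real kernel */

static void set_p4d(p4d_t *p4dp, unsigned long val)
{
	p4dp->val = val;
}

static void set_pgd(pgd_t *pgdp, unsigned long val)
{
	if (pgtable_l5_enabled)
		pgdp->val = val;		/* real top level (pv hook) */
	else
		set_p4d((p4d_t *)pgdp, val);	/* folded: same memory slot */
}

int main(void)
{
	pgd_t slot = { 0 };

	pgtable_l5_enabled = 0;
	set_pgd(&slot, 0x1000);
	printf("folded:   slot=%#lx\n", slot.val);

	pgtable_l5_enabled = 1;
	set_pgd(&slot, 0x2000);
	printf("unfolded: slot=%#lx\n", slot.val);
	return 0;
}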
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index aff42e1..263c142 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -167,6 +167,8 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
#if CONFIG_PGTABLE_LEVELS > 4
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
+ if (!pgtable_l5_enabled)
+ return;
paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
}
@@ -191,7 +193,8 @@ extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long address)
{
- ___p4d_free_tlb(tlb, p4d);
+ if (pgtable_l5_enabled)
+ ___p4d_free_tlb(tlb, p4d);
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 876b4c7..6a59a6d 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -44,5 +44,6 @@ typedef union {
*/
#define PTRS_PER_PTE 512
+#define MAX_POSSIBLE_PHYSMEM_BITS 36
#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 63c2552..c8baa7f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -65,7 +65,7 @@ extern pmdval_t early_pmd_flags;
#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
-#define pgd_clear(pgd) native_pgd_clear(pgd)
+#define pgd_clear(pgd) (pgtable_l5_enabled ? native_pgd_clear(pgd) : 0)
#endif
#ifndef set_p4d
@@ -859,6 +859,8 @@ static inline unsigned long p4d_index(unsigned long address)
#if CONFIG_PGTABLE_LEVELS > 4
static inline int pgd_present(pgd_t pgd)
{
+ if (!pgtable_l5_enabled)
+ return 1;
return pgd_flags(pgd) & _PAGE_PRESENT;
}
@@ -876,6 +878,8 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
/* to find an entry in a page-table-directory. */
static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
+ if (!pgtable_l5_enabled)
+ return (p4d_t *)pgd;
return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
}
@@ -883,6 +887,9 @@ static inline int pgd_bad(pgd_t pgd)
{
unsigned long ignore_flags = _PAGE_USER;
+ if (!pgtable_l5_enabled)
+ return 0;
+
if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
ignore_flags |= _PAGE_NX;
@@ -891,6 +898,8 @@ static inline int pgd_bad(pgd_t pgd)
static inline int pgd_none(pgd_t pgd)
{
+ if (!pgtable_l5_enabled)
+ return 0;
/*
* There is no need to do a workaround for the KNL stray
* A/D bit erratum here. PGDs only point to page tables
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index e554667..b838c51 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -33,6 +33,8 @@ static inline void pgtable_cache_init(void) { }
static inline void check_pgt_cache(void) { }
void paging_init(void);
+static inline int pgd_large(pgd_t pgd) { return 0; }
+
/*
* Define this if things work differently on an i386 and an i486:
* it will (on an i486) warn about kernel memory accesses that are
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 0777e18..e3225e8 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -15,6 +15,8 @@
# include <asm/pgtable-2level_types.h>
#endif
+#define pgtable_l5_enabled 0
+
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 81462e9..81dda8d 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -217,29 +217,26 @@ static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
-#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
- p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
-#else
- *p4dp = p4d;
-#endif
+ pgd_t pgd;
+
+ if (pgtable_l5_enabled || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
+ *p4dp = p4d;
+ return;
+ }
+
+ pgd = native_make_pgd(p4d_val(p4d));
+ pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd);
+ *p4dp = native_make_p4d(pgd_val(pgd));
}
static inline void native_p4d_clear(p4d_t *p4d)
{
-#ifdef CONFIG_X86_5LEVEL
native_set_p4d(p4d, native_make_p4d(0));
-#else
- native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
-#endif
}
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
*pgdp = pti_set_user_pgd(pgdp, pgd);
-#else
- *pgdp = pgd;
-#endif
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6b8f73d..d5c21a3 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,6 +20,18 @@ typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;
+#ifdef CONFIG_X86_5LEVEL
+extern unsigned int __pgtable_l5_enabled;
+#ifndef pgtable_l5_enabled
+#define pgtable_l5_enabled cpu_feature_enabled(X86_FEATURE_LA57)
+#endif
+#else
+#define pgtable_l5_enabled 0
+#endif
+
+extern unsigned int pgdir_shift;
+extern unsigned int ptrs_per_p4d;
+
#endif /* !__ASSEMBLY__ */
#define SHARED_KERNEL_PMD 0
@@ -29,24 +41,28 @@ typedef struct { pteval_t pte; } pte_t;
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
-#define PGDIR_SHIFT 48
+#define PGDIR_SHIFT pgdir_shift
#define PTRS_PER_PGD 512
/*
* 4th level page in 5-level paging case
*/
-#define P4D_SHIFT 39
-#define PTRS_PER_P4D 512
-#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
-#define P4D_MASK (~(P4D_SIZE - 1))
+#define P4D_SHIFT 39
+#define MAX_PTRS_PER_P4D 512
+#define PTRS_PER_P4D ptrs_per_p4d
+#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
+#define P4D_MASK (~(P4D_SIZE - 1))
+
+#define MAX_POSSIBLE_PHYSMEM_BITS 52
#else /* CONFIG_X86_5LEVEL */
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
-#define PGDIR_SHIFT 39
-#define PTRS_PER_PGD 512
+#define PGDIR_SHIFT 39
+#define PTRS_PER_PGD 512
+#define MAX_PTRS_PER_P4D 1
#endif /* CONFIG_X86_5LEVEL */
@@ -82,31 +98,33 @@ typedef struct { pteval_t pte; } pte_t;
* range must not overlap with anything except the KASAN shadow area, which
* is correct as KASAN disables KASLR.
*/
-#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define MAXMEM (1UL << MAX_PHYSMEM_BITS)
-#ifdef CONFIG_X86_5LEVEL
-# define VMALLOC_SIZE_TB _AC(12800, UL)
-# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
-# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
-# define LDT_PGD_ENTRY _AC(-112, UL)
-# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
-#else
-# define VMALLOC_SIZE_TB _AC(32, UL)
-# define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
-# define LDT_PGD_ENTRY _AC(-3, UL)
-# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
-#endif
+#define LDT_PGD_ENTRY_L4 -3UL
+#define LDT_PGD_ENTRY_L5 -112UL
+#define LDT_PGD_ENTRY (pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+
+#define __VMALLOC_BASE_L4 0xffffc90000000000
+#define __VMALLOC_BASE_L5 0xffa0000000000000
+
+#define VMALLOC_SIZE_TB_L4 32UL
+#define VMALLOC_SIZE_TB_L5 12800UL
+
+#define __VMEMMAP_BASE_L4 0xffffea0000000000
+#define __VMEMMAP_BASE_L5 0xffd4000000000000
-#ifdef CONFIG_RANDOMIZE_MEMORY
+#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
# define VMALLOC_START vmalloc_base
+# define VMALLOC_SIZE_TB (pgtable_l5_enabled ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
# define VMEMMAP_START vmemmap_base
#else
-# define VMALLOC_START __VMALLOC_BASE
-# define VMEMMAP_START __VMEMMAP_BASE
-#endif /* CONFIG_RANDOMIZE_MEMORY */
+# define VMALLOC_START __VMALLOC_BASE_L4
+# define VMALLOC_SIZE_TB VMALLOC_SIZE_TB_L4
+# define VMEMMAP_START __VMEMMAP_BASE_L4
+#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
-#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */
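
With the layout constants above now selected by pgtable_l5_enabled rather than by #ifdef, the derived addresses differ between the two modes. A standalone C sketch that evaluates the same expressions for both cases (values copied from the *_L4/*_L5 macros in this hunk):

#include <stdio.h>
#include <stdint.h>

/* Userspace model: compute the runtime-selected layout for 4- and 5-level. */
int main(void)
{
	for (int l5 = 0; l5 <= 1; l5++) {
		unsigned int pgdir_shift = l5 ? 48 : 39;
		uint64_t ldt_pgd_entry = l5 ? (uint64_t)-112 : (uint64_t)-3;
		uint64_t ldt_base = ldt_pgd_entry << pgdir_shift;
		uint64_t vmalloc_base = l5 ? 0xffa0000000000000ULL : 0xffffc90000000000ULL;
		uint64_t vmemmap_base = l5 ? 0xffd4000000000000ULL : 0xffffea0000000000ULL;

		printf("l5=%d: LDT_BASE_ADDR=%#llx VMALLOC_START=%#llx VMEMMAP_START=%#llx\n",
		       l5, (unsigned long long)ldt_base,
		       (unsigned long long)vmalloc_base,
		       (unsigned long long)vmemmap_base);
	}
	return 0;
}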
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index fb3a6de..6847d85 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -53,12 +53,6 @@
# define NEED_MOVBE 0
#endif
-#ifdef CONFIG_X86_5LEVEL
-# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31))
-#else
-# define NEED_LA57 0
-#endif
-
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT
/* Paravirtualized systems may not have PSE or PGE available */
@@ -104,7 +98,7 @@
#define REQUIRED_MASK13 0
#define REQUIRED_MASK14 0
#define REQUIRED_MASK15 0
-#define REQUIRED_MASK16 (NEED_LA57)
+#define REQUIRED_MASK16 0
#define REQUIRED_MASK17 0
#define REQUIRED_MASK18 0
#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 4fc1e9d..4617a2b 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,13 +27,8 @@
# endif
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
-# ifdef CONFIG_X86_5LEVEL
-# define MAX_PHYSADDR_BITS 52
-# define MAX_PHYSMEM_BITS 52
-# else
-# define MAX_PHYSADDR_BITS 44
-# define MAX_PHYSMEM_BITS 46
-# endif
+# define MAX_PHYSADDR_BITS (pgtable_l5_enabled ? 52 : 44)
+# define MAX_PHYSMEM_BITS (pgtable_l5_enabled ? 52 : 46)
#endif
#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8ff94d1..3d8c573 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1088,19 +1088,7 @@ static void mce_unmap_kpfn(unsigned long pfn)
* a legal address.
*/
-/*
- * Build time check to see if we have a spare virtual bit. Don't want
- * to leave this until run time because most developers don't have a
- * system that can exercise this code path. This will only become a
- * problem if/when we move beyond 5-level page tables.
- *
- * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
- */
-#if PGDIR_SHIFT + 9 < 63
decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
-#else
-#error "no unused virtual bit available"
-#endif
if (set_memory_np(decoy_addr, 1))
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
@@ -2333,6 +2321,12 @@ static __init int mcheck_init_device(void)
{
int err;
+ /*
+ * Check if we have a spare virtual bit. This will only become
+ * a problem if/when we move beyond 5-level page tables.
+ */
+ MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
+
if (!mce_available(&boot_cpu_data)) {
err = -EIO;
goto err_out;
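
The decoy address flips bit 63 of the direct-map alias of the poisoned page, which only works while the virtual address width leaves bit 63 spare; that is what the runtime MAYBE_BUILD_BUG_ON now asserts. A small userspace model with illustrative numbers (the pfn and the 5-level direct-map base are examples only):

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* Model of the decoy-address trick from mce_unmap_kpfn(). */
int main(void)
{
	unsigned int virtual_mask_shift = 56;		/* 47 on 4-level, 56 on 5-level */
	uint64_t page_offset = 0xff10000000000000ULL;	/* 5-level direct-map base */
	uint64_t pfn = 0x12345;				/* illustrative pfn */
	uint64_t decoy;

	assert(virtual_mask_shift < 63);	/* bit 63 must be unused */

	decoy = (pfn << 12) + (page_offset ^ (1ULL << 63));
	printf("decoy address: %#llx\n", (unsigned long long)decoy);
	return 0;
}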
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7ba5d81..0c855de 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -32,6 +32,11 @@
#include <asm/microcode.h>
#include <asm/kasan.h>
+#ifdef CONFIG_X86_5LEVEL
+#undef pgtable_l5_enabled
+#define pgtable_l5_enabled __pgtable_l5_enabled
+#endif
+
/*
* Manage page tables very early on.
*/
@@ -39,6 +44,24 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+#ifdef CONFIG_X86_5LEVEL
+unsigned int __pgtable_l5_enabled __ro_after_init;
+EXPORT_SYMBOL(__pgtable_l5_enabled);
+unsigned int pgdir_shift __ro_after_init = 39;
+EXPORT_SYMBOL(pgdir_shift);
+unsigned int ptrs_per_p4d __ro_after_init = 1;
+EXPORT_SYMBOL(ptrs_per_p4d);
+#endif
+
+#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
+unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
+EXPORT_SYMBOL(page_offset_base);
+unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
+EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
+EXPORT_SYMBOL(vmemmap_base);
+#endif
+
#define __head __section(.head.text)
static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
@@ -46,6 +69,41 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
return ptr - (void *)_text + (void *)physaddr;
}
+static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
+{
+ return fixup_pointer(ptr, physaddr);
+}
+
+#ifdef CONFIG_X86_5LEVEL
+static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
+{
+ return fixup_pointer(ptr, physaddr);
+}
+
+static bool __head check_la57_support(unsigned long physaddr)
+{
+ if (native_cpuid_eax(0) < 7)
+ return false;
+
+ if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+ return false;
+
+ *fixup_int(&pgtable_l5_enabled, physaddr) = 1;
+ *fixup_int(&pgdir_shift, physaddr) = 48;
+ *fixup_int(&ptrs_per_p4d, physaddr) = 512;
+ *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5;
+ *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5;
+ *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5;
+
+ return true;
+}
+#else
+static bool __head check_la57_support(unsigned long physaddr)
+{
+ return false;
+}
+#endif
+
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
@@ -55,9 +113,12 @@ unsigned long __head __startup_64(unsigned long physaddr,
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
+ bool la57;
int i;
unsigned int *next_pgt_ptr;
+ la57 = check_la57_support(physaddr);
+
/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
for (;;);
@@ -81,9 +142,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* Fixup the physical addresses in the page table */
pgd = fixup_pointer(&early_top_pgt, physaddr);
- pgd[pgd_index(__START_KERNEL_map)] += load_delta;
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p = pgd + pgd_index(__START_KERNEL_map);
+ if (la57)
+ *p = (unsigned long)level4_kernel_pgt;
+ else
+ *p = (unsigned long)level3_kernel_pgt;
+ *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
+
+ if (la57) {
p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
p4d[511] += load_delta;
}
@@ -108,7 +174,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (la57) {
p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
@@ -154,8 +220,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
* Fixup phys_base - remove the memory encryption mask to obtain
* the true physical address.
*/
- p = fixup_pointer(&phys_base, physaddr);
- *p += load_delta - sme_get_me_mask();
+ *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
/* Encrypt the kernel and related (if SME is active) */
sme_encrypt_kernel(bp);
@@ -206,7 +271,7 @@ again:
* critical -- __PAGE_OFFSET would point us back into the dynamic
* range and we might end up looping forever...
*/
- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (!pgtable_l5_enabled)
p4d_p = pgd_p;
else if (pgd)
p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
@@ -322,7 +387,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+ MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
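
check_la57_support() runs from the identity mapping at the physical load address, before relocations, so every global it touches has to be rebased through fixup_pointer()/fixup_int()/fixup_long(). A minimal sketch of that rebasing arithmetic, done on plain integers with illustrative addresses:

#include <stdio.h>
#include <stdint.h>

/* Model of fixup_pointer(): rebase a link-time address of a global onto the
 * physical address the kernel was actually loaded at, so it can be accessed
 * (and written) before paging/relocations are set up. */
static uint64_t fixup_addr(uint64_t link_addr, uint64_t text_base, uint64_t physaddr)
{
	return link_addr - text_base + physaddr;
}

int main(void)
{
	/* Illustrative numbers only. */
	uint64_t text_base = 0xffffffff81000000ULL;	/* link-time _text */
	uint64_t physaddr  = 0x0000000001000000ULL;	/* actual load address */
	uint64_t pgdir_shift_link = 0xffffffff82012345ULL;

	printf("access %#llx via %#llx\n",
	       (unsigned long long)pgdir_shift_link,
	       (unsigned long long)fixup_addr(pgdir_shift_link, text_base, physaddr));
	return 0;
}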
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 04a625f..326c631 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -38,12 +38,12 @@
*
*/
+#define l4_index(x) (((x) >> 39) & 511)
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
-PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
-#endif
+L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
+L4_START_KERNEL = l4_index(__START_KERNEL_map)
+
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
@@ -124,7 +124,10 @@ ENTRY(secondary_startup_64)
/* Enable PAE mode, PGE and LA57 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
+ testl $1, __pgtable_l5_enabled(%rip)
+ jz 1f
orl $X86_CR4_LA57, %ecx
+1:
#endif
movq %rcx, %cr4
@@ -372,12 +375,7 @@ GLOBAL(name)
__INITDATA
NEXT_PGD_PAGE(early_top_pgt)
- .fill 511,8,0
-#ifdef CONFIG_X86_5LEVEL
- .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
-#else
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
-#endif
+ .fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
@@ -388,9 +386,9 @@ NEXT_PAGE(early_dynamic_pgts)
#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PGD_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
- .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
+ .org init_top_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
- .org init_top_pgt + PGD_START_KERNEL*8, 0
+ .org init_top_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
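
The testl/jz/orl sequence added to secondary_startup_64 only sets CR4.LA57 when the boot CPU recorded 5-level support in __pgtable_l5_enabled. The same decision expressed as a small standalone C model (bit positions are the architectural CR4 bits):

#include <stdio.h>
#include <stdint.h>

#define X86_CR4_PAE	(1u << 5)
#define X86_CR4_PGE	(1u << 7)
#define X86_CR4_LA57	(1u << 12)

/* Model of the CR4 value built during early CPU bring-up. */
static uint32_t boot_cr4(int pgtable_l5_enabled)
{
	uint32_t cr4 = X86_CR4_PAE | X86_CR4_PGE;

	if (pgtable_l5_enabled)		/* the testl/jz/orl above */
		cr4 |= X86_CR4_LA57;
	return cr4;
}

int main(void)
{
	printf("cr4(l4)=%#x cr4(l5)=%#x\n", boot_cr4(0), boot_cr4(1));
	return 0;
}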
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1ae67e9..399d0f7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -189,9 +189,7 @@ struct ist_info ist_info;
#endif
#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly = {
- .x86_phys_bits = MAX_PHYSMEM_BITS,
-};
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
EXPORT_SYMBOL(boot_cpu_data);
#endif
@@ -851,6 +849,7 @@ void __init setup_arch(char **cmdline_p)
__flush_tlb_all();
#else
printk(KERN_INFO "Command line: %s\n", boot_command_line);
+ boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif
/*
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 27e9e90..03c6c85 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,12 +1,15 @@
# SPDX-License-Identifier: GPL-2.0
-# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
-KCOV_INSTRUMENT_tlb.o := n
-KCOV_INSTRUMENT_mem_encrypt.o := n
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c
+KCOV_INSTRUMENT_tlb.o := n
+KCOV_INSTRUMENT_mem_encrypt.o := n
+KCOV_INSTRUMENT_mem_encrypt_identity.o := n
-KASAN_SANITIZE_mem_encrypt.o := n
+KASAN_SANITIZE_mem_encrypt.o := n
+KASAN_SANITIZE_mem_encrypt_identity.o := n
ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_mem_encrypt.o = -pg
+CFLAGS_REMOVE_mem_encrypt.o = -pg
+CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
@@ -47,4 +50,5 @@ obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index 421f266..51a6f92 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -72,6 +72,31 @@ static const struct file_operations ptdump_curusr_fops = {
};
#endif
+#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
+extern pgd_t *efi_pgd;
+static struct dentry *pe_efi;
+
+static int ptdump_show_efi(struct seq_file *m, void *v)
+{
+ if (efi_pgd)
+ ptdump_walk_pgd_level_debugfs(m, efi_pgd, false);
+ return 0;
+}
+
+static int ptdump_open_efi(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show_efi, NULL);
+}
+
+static const struct file_operations ptdump_efi_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open_efi,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
static struct dentry *dir, *pe_knl, *pe_curknl;
static int __init pt_dump_debug_init(void)
@@ -96,6 +121,13 @@ static int __init pt_dump_debug_init(void)
if (!pe_curusr)
goto err;
#endif
+
+#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
+ pe_efi = debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops);
+ if (!pe_efi)
+ goto err;
+#endif
+
return 0;
err:
debugfs_remove_recursive(dir);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2a4849e..0d6d67d 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -85,11 +85,15 @@ static struct addr_marker address_markers[] = {
[VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
[VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
#ifdef CONFIG_KASAN
- [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
- [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
+ /*
+ * These fields get initialized with the (dynamic)
+ * KASAN_SHADOW_{START,END} values in pt_dump_init().
+ */
+ [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" },
+ [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
- [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
+ [LDT_NR] = { 0UL, "LDT remap" },
#endif
[CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
@@ -344,9 +348,7 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
void *pt)
{
if (__pa(pt) == __pa(kasan_zero_pmd) ||
-#ifdef CONFIG_X86_5LEVEL
- __pa(pt) == __pa(kasan_zero_p4d) ||
-#endif
+ (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) ||
__pa(pt) == __pa(kasan_zero_pud)) {
pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
note_page(m, st, __pgprot(prot), 5);
@@ -428,14 +430,15 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
#define p4d_none(a) pud_none(__pud(p4d_val(a)))
#endif
-#if PTRS_PER_P4D > 1
-
static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
{
int i;
p4d_t *start, *p4d_start;
pgprotval_t prot;
+ if (PTRS_PER_P4D == 1)
+ return walk_pud_level(m, st, __p4d(pgd_val(addr)), P);
+
p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_P4D; i++) {
@@ -455,11 +458,8 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
}
}
-#else
-#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
-#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
-#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
-#endif
+#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
+#define pgd_none(a) (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
static inline bool is_hypervisor_range(int idx)
{
@@ -570,6 +570,13 @@ static int __init pt_dump_init(void)
address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
+#endif
+#ifdef CONFIG_KASAN
+ address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
+ address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
+#endif
#endif
#ifdef CONFIG_X86_32
address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 800de81..321b780 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -439,7 +439,7 @@ static noinline int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd_ref))
return -1;
- if (CONFIG_PGTABLE_LEVELS > 4) {
+ if (pgtable_l5_enabled) {
if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
arch_flush_lazy_mmu_mode();
@@ -454,7 +454,7 @@ static noinline int vmalloc_fault(unsigned long address)
if (p4d_none(*p4d_ref))
return -1;
- if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) {
+ if (p4d_none(*p4d) && !pgtable_l5_enabled) {
set_p4d(p4d, *p4d_ref);
arch_flush_lazy_mmu_mode();
} else {
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index ab33a32..9aa22be 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -120,7 +120,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (pgtable_l5_enabled) {
set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
} else {
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 8b72923..9bbc51a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -88,12 +88,7 @@ static int __init nonx32_setup(char *str)
}
__setup("noexec32=", nonx32_setup);
-/*
- * When memory was added make sure all the processes MM have
- * suitable PGD entries in the local PGD level page.
- */
-#ifdef CONFIG_X86_5LEVEL
-void sync_global_pgds(unsigned long start, unsigned long end)
+static void sync_global_pgds_l5(unsigned long start, unsigned long end)
{
unsigned long addr;
@@ -129,8 +124,8 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock);
}
}
-#else
-void sync_global_pgds(unsigned long start, unsigned long end)
+
+static void sync_global_pgds_l4(unsigned long start, unsigned long end)
{
unsigned long addr;
@@ -143,7 +138,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
* With folded p4d, pgd_none() is always false, we need to
* handle synchonization on p4d level.
*/
- BUILD_BUG_ON(pgd_none(*pgd_ref));
+ MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
p4d_ref = p4d_offset(pgd_ref, addr);
if (p4d_none(*p4d_ref))
@@ -173,7 +168,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)
spin_unlock(&pgd_lock);
}
}
-#endif
+
+/*
+ * When memory was added make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+ if (pgtable_l5_enabled)
+ sync_global_pgds_l5(start, end);
+ else
+ sync_global_pgds_l4(start, end);
+}
/*
* NOTE: This function is marked __ref because it calls __init function
@@ -632,7 +638,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
unsigned long vaddr = (unsigned long)__va(paddr);
int i = p4d_index(vaddr);
- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (!pgtable_l5_enabled)
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
@@ -712,7 +718,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
page_size_mask);
spin_lock(&init_mm.page_table_lock);
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (pgtable_l5_enabled)
pgd_populate(&init_mm, pgd, p4d);
else
p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
@@ -1093,7 +1099,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
* 5-level case we should free them. This code will have to change
* to adapt for boot-time switching between 4 and 5 level page tables.
*/
- if (CONFIG_PGTABLE_LEVELS == 5)
+ if (pgtable_l5_enabled)
free_pud_table(pud_base, p4d, altmap);
}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index af6f2f9..d8ff013 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,6 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
#define DISABLE_BRANCH_PROFILING
#define pr_fmt(fmt) "kasan: " fmt
+
+#ifdef CONFIG_X86_5LEVEL
+/* Too early to use cpu_feature_enabled() */
+#define pgtable_l5_enabled __pgtable_l5_enabled
+#endif
+
#include <linux/bootmem.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
@@ -19,7 +25,7 @@
extern struct range pfn_mapped[E820_MAX_ENTRIES];
-static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
static __init void *early_alloc(size_t size, int nid, bool panic)
{
@@ -176,10 +182,10 @@ static void __init clear_pgds(unsigned long start,
* With folded p4d, pgd_clear() is nop, use p4d_clear()
* instead.
*/
- if (CONFIG_PGTABLE_LEVELS < 5)
- p4d_clear(p4d_offset(pgd, start));
- else
+ if (pgtable_l5_enabled)
pgd_clear(pgd);
+ else
+ p4d_clear(p4d_offset(pgd, start));
}
pgd = pgd_offset_k(start);
@@ -191,7 +197,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
{
unsigned long p4d;
- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (!pgtable_l5_enabled)
return (p4d_t *)pgd;
p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
@@ -272,7 +278,7 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
- for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
+ for (i = 0; pgtable_l5_enabled && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_top_pgt);
@@ -303,7 +309,7 @@ void __init kasan_init(void)
* bunch of things like kernel code, modules, EFI mapping, etc.
* We need to take extra steps to not overwrite them.
*/
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (pgtable_l5_enabled) {
void *ptr;
ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aedebd2..615cc03 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -34,23 +34,12 @@
#define TB_SHIFT 40
/*
- * Virtual address start and end range for randomization.
- *
* The end address could depend on more configuration options to make the
* highest amount of space for randomization available, but that's too hard
* to keep straight and caused issues already.
*/
-static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
-/* Default values */
-unsigned long page_offset_base = __PAGE_OFFSET_BASE;
-EXPORT_SYMBOL(page_offset_base);
-unsigned long vmalloc_base = __VMALLOC_BASE;
-EXPORT_SYMBOL(vmalloc_base);
-unsigned long vmemmap_base = __VMEMMAP_BASE;
-EXPORT_SYMBOL(vmemmap_base);
-
/*
* Memory regions randomized by KASLR (except modules that use a separate logic
* earlier during boot). The list is ordered based on virtual addresses. This
@@ -60,8 +49,8 @@ static __initdata struct kaslr_memory_region {
unsigned long *base;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
- { &vmalloc_base, VMALLOC_SIZE_TB },
+ { &page_offset_base, 0 },
+ { &vmalloc_base, 0 },
{ &vmemmap_base, 1 },
};
@@ -84,11 +73,14 @@ static inline bool kaslr_memory_enabled(void)
void __init kernel_randomize_memory(void)
{
size_t i;
- unsigned long vaddr = vaddr_start;
+ unsigned long vaddr_start, vaddr;
unsigned long rand, memory_tb;
struct rnd_state rand_state;
unsigned long remain_entropy;
+ vaddr_start = pgtable_l5_enabled ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
+ vaddr = vaddr_start;
+
/*
* These BUILD_BUG_ON checks ensure the memory layout is consistent
* with the vaddr_start/vaddr_end variables. These checks are very
@@ -101,6 +93,9 @@ void __init kernel_randomize_memory(void)
if (!kaslr_memory_enabled())
return;
+ kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT);
+ kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
+
/*
* Update Physical memory mapping to available and
* add padding if needed (especially for memory hotplug support).
@@ -129,7 +124,7 @@ void __init kernel_randomize_memory(void)
*/
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (pgtable_l5_enabled)
entropy = (rand % (entropy + 1)) & P4D_MASK;
else
entropy = (rand % (entropy + 1)) & PUD_MASK;
@@ -141,7 +136,7 @@ void __init kernel_randomize_memory(void)
* randomization alignment.
*/
vaddr += get_padding(&kaslr_regions[i]);
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (pgtable_l5_enabled)
vaddr = round_up(vaddr + 1, P4D_SIZE);
else
vaddr = round_up(vaddr + 1, PUD_SIZE);
@@ -217,7 +212,7 @@ void __meminit init_trampoline(void)
return;
}
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
+ if (pgtable_l5_enabled)
init_trampoline_p4d();
else
init_trampoline_pud();
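
After this change the KASLR region sizes are filled in at runtime, the remaining entropy is split evenly across the regions still to be placed, and each offset is truncated to PUD (4-level) or P4D (5-level) granularity. A userspace sketch of that loop; the "random" values, the 64 TB direct-map size and the end address are illustrative only:

#include <stdio.h>
#include <stdint.h>

/* Simplified model of kernel_randomize_memory() region placement. */
int main(void)
{
	int l5 = 0;						/* pgtable_l5_enabled */
	uint64_t tb = 1ULL << 40;
	uint64_t align = l5 ? (1ULL << 39) : (1ULL << 30);	/* P4D_SIZE : PUD_SIZE */
	uint64_t size_tb[3] = { 64,				/* direct map (example) */
				l5 ? 12800 : 32,		/* vmalloc */
				1 };				/* vmemmap */
	uint64_t vaddr = l5 ? 0xff10000000000000ULL : 0xffff880000000000ULL;
	uint64_t remain = 0xfffffe0000000000ULL - vaddr;	/* up to vaddr_end (example) */
	uint64_t rnd[3] = { 0x123456789abcdefULL, 0xfedcba987654321ULL, 0x55aa55aa55aaULL };

	for (int i = 0; i < 3; i++)
		remain -= size_tb[i] * tb;			/* entropy left to spend */

	for (int i = 0; i < 3; i++) {
		uint64_t entropy = remain / (3 - i);
		uint64_t off = (rnd[i] % (entropy + 1)) & ~(align - 1);

		vaddr += off;
		printf("region %d base %#llx\n", i, (unsigned long long)vaddr);
		remain -= off;
		vaddr += size_tb[i] * tb;
		vaddr = (vaddr + align) & ~(align - 1);		/* round_up(vaddr + 1, align) */
	}
	return 0;
}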
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 1a53071..3a1b5fe 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -25,17 +25,12 @@
#include <asm/bootparam.h>
#include <asm/set_memory.h>
#include <asm/cacheflush.h>
-#include <asm/sections.h>
#include <asm/processor-flags.h>
#include <asm/msr.h>
#include <asm/cmdline.h>
#include "mm_internal.h"
-static char sme_cmdline_arg[] __initdata = "mem_encrypt";
-static char sme_cmdline_on[] __initdata = "on";
-static char sme_cmdline_off[] __initdata = "off";
-
/*
* Since SME related variables are set early in the boot process they must
* reside in the .data section so as not to be zeroed out when the .bss
@@ -46,7 +41,7 @@ EXPORT_SYMBOL(sme_me_mask);
DEFINE_STATIC_KEY_FALSE(sev_enable_key);
EXPORT_SYMBOL_GPL(sev_enable_key);
-static bool sev_enabled __section(.data);
+bool sev_enabled __section(.data);
/* Buffer used for early in-place encryption by BSP, no locking needed */
static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
@@ -463,574 +458,3 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
/* Make the SWIOTLB buffer area decrypted */
set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
}
-
-struct sme_populate_pgd_data {
- void *pgtable_area;
- pgd_t *pgd;
-
- pmdval_t pmd_flags;
- pteval_t pte_flags;
- unsigned long paddr;
-
- unsigned long vaddr;
- unsigned long vaddr_end;
-};
-
-static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
-{
- unsigned long pgd_start, pgd_end, pgd_size;
- pgd_t *pgd_p;
-
- pgd_start = ppd->vaddr & PGDIR_MASK;
- pgd_end = ppd->vaddr_end & PGDIR_MASK;
-
- pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);
-
- pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
-
- memset(pgd_p, 0, pgd_size);
-}
-
-#define PGD_FLAGS _KERNPG_TABLE_NOENC
-#define P4D_FLAGS _KERNPG_TABLE_NOENC
-#define PUD_FLAGS _KERNPG_TABLE_NOENC
-#define PMD_FLAGS _KERNPG_TABLE_NOENC
-
-#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
-
-#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
-#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
- (_PAGE_PAT | _PAGE_PWT))
-
-#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
-
-#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
-
-#define PTE_FLAGS_DEC PTE_FLAGS
-#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
- (_PAGE_PAT | _PAGE_PWT))
-
-#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
-
-static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
-{
- pgd_t *pgd_p;
- p4d_t *p4d_p;
- pud_t *pud_p;
- pmd_t *pmd_p;
-
- pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
- if (native_pgd_val(*pgd_p)) {
- if (IS_ENABLED(CONFIG_X86_5LEVEL))
- p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
- else
- pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
- } else {
- pgd_t pgd;
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_p = ppd->pgtable_area;
- memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
- ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
-
- pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
- } else {
- pud_p = ppd->pgtable_area;
- memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
-
- pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
- }
- native_set_pgd(pgd_p, pgd);
- }
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_p += p4d_index(ppd->vaddr);
- if (native_p4d_val(*p4d_p)) {
- pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
- } else {
- p4d_t p4d;
-
- pud_p = ppd->pgtable_area;
- memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
- ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
-
- p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
- native_set_p4d(p4d_p, p4d);
- }
- }
-
- pud_p += pud_index(ppd->vaddr);
- if (native_pud_val(*pud_p)) {
- if (native_pud_val(*pud_p) & _PAGE_PSE)
- return NULL;
-
- pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
- } else {
- pud_t pud;
-
- pmd_p = ppd->pgtable_area;
- memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
- ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
-
- pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
- native_set_pud(pud_p, pud);
- }
-
- return pmd_p;
-}
-
-static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
-{
- pmd_t *pmd_p;
-
- pmd_p = sme_prepare_pgd(ppd);
- if (!pmd_p)
- return;
-
- pmd_p += pmd_index(ppd->vaddr);
- if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
- native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags));
-}
-
-static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
-{
- pmd_t *pmd_p;
- pte_t *pte_p;
-
- pmd_p = sme_prepare_pgd(ppd);
- if (!pmd_p)
- return;
-
- pmd_p += pmd_index(ppd->vaddr);
- if (native_pmd_val(*pmd_p)) {
- if (native_pmd_val(*pmd_p) & _PAGE_PSE)
- return;
-
- pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK);
- } else {
- pmd_t pmd;
-
- pte_p = ppd->pgtable_area;
- memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE);
- ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE;
-
- pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS);
- native_set_pmd(pmd_p, pmd);
- }
-
- pte_p += pte_index(ppd->vaddr);
- if (!native_pte_val(*pte_p))
- native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags));
-}
-
-static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
-{
- while (ppd->vaddr < ppd->vaddr_end) {
- sme_populate_pgd_large(ppd);
-
- ppd->vaddr += PMD_PAGE_SIZE;
- ppd->paddr += PMD_PAGE_SIZE;
- }
-}
-
-static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
-{
- while (ppd->vaddr < ppd->vaddr_end) {
- sme_populate_pgd(ppd);
-
- ppd->vaddr += PAGE_SIZE;
- ppd->paddr += PAGE_SIZE;
- }
-}
-
-static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
- pmdval_t pmd_flags, pteval_t pte_flags)
-{
- unsigned long vaddr_end;
-
- ppd->pmd_flags = pmd_flags;
- ppd->pte_flags = pte_flags;
-
- /* Save original end value since we modify the struct value */
- vaddr_end = ppd->vaddr_end;
-
- /* If start is not 2MB aligned, create PTE entries */
- ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
- __sme_map_range_pte(ppd);
-
- /* Create PMD entries */
- ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
- __sme_map_range_pmd(ppd);
-
- /* If end is not 2MB aligned, create PTE entries */
- ppd->vaddr_end = vaddr_end;
- __sme_map_range_pte(ppd);
-}
-
-static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
-{
- __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
-}
-
-static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
-{
- __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
-}
-
-static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
-{
- __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
-}
-
-static unsigned long __init sme_pgtable_calc(unsigned long len)
-{
- unsigned long p4d_size, pud_size, pmd_size, pte_size;
- unsigned long total;
-
- /*
- * Perform a relatively simplistic calculation of the pagetable
- * entries that are needed. Those mappings will be covered mostly
- * by 2MB PMD entries so we can conservatively calculate the required
- * number of P4D, PUD and PMD structures needed to perform the
- * mappings. For mappings that are not 2MB aligned, PTE mappings
- * would be needed for the start and end portion of the address range
- * that fall outside of the 2MB alignment. This results in, at most,
- * two extra pages to hold PTE entries for each range that is mapped.
- * Incrementing the count for each covers the case where the addresses
- * cross entries.
- */
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
- p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
- pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- } else {
- p4d_size = 0;
- pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- }
- pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
- pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
- pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE;
-
- total = p4d_size + pud_size + pmd_size + pte_size;
-
- /*
- * Now calculate the added pagetable structures needed to populate
- * the new pagetables.
- */
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
- p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
- p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
- pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- } else {
- p4d_size = 0;
- pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
- pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
- }
- pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
- pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
-
- total += p4d_size + pud_size + pmd_size;
-
- return total;
-}
-
-void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp)
-{
- unsigned long workarea_start, workarea_end, workarea_len;
- unsigned long execute_start, execute_end, execute_len;
- unsigned long kernel_start, kernel_end, kernel_len;
- unsigned long initrd_start, initrd_end, initrd_len;
- struct sme_populate_pgd_data ppd;
- unsigned long pgtable_area_len;
- unsigned long decrypted_base;
-
- if (!sme_active())
- return;
-
- /*
- * Prepare for encrypting the kernel and initrd by building new
- * pagetables with the necessary attributes needed to encrypt the
- * kernel in place.
- *
- * One range of virtual addresses will map the memory occupied
- * by the kernel and initrd as encrypted.
- *
- * Another range of virtual addresses will map the memory occupied
- * by the kernel and initrd as decrypted and write-protected.
- *
- * The use of write-protect attribute will prevent any of the
- * memory from being cached.
- */
-
- /* Physical addresses gives us the identity mapped virtual addresses */
- kernel_start = __pa_symbol(_text);
- kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
- kernel_len = kernel_end - kernel_start;
-
- initrd_start = 0;
- initrd_end = 0;
- initrd_len = 0;
-#ifdef CONFIG_BLK_DEV_INITRD
- initrd_len = (unsigned long)bp->hdr.ramdisk_size |
- ((unsigned long)bp->ext_ramdisk_size << 32);
- if (initrd_len) {
- initrd_start = (unsigned long)bp->hdr.ramdisk_image |
- ((unsigned long)bp->ext_ramdisk_image << 32);
- initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
- initrd_len = initrd_end - initrd_start;
- }
-#endif
-
- /* Set the encryption workarea to be immediately after the kernel */
- workarea_start = kernel_end;
-
- /*
- * Calculate required number of workarea bytes needed:
- * executable encryption area size:
- * stack page (PAGE_SIZE)
- * encryption routine page (PAGE_SIZE)
- * intermediate copy buffer (PMD_PAGE_SIZE)
- * pagetable structures for the encryption of the kernel
- * pagetable structures for workarea (in case not currently mapped)
- */
- execute_start = workarea_start;
- execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
- execute_len = execute_end - execute_start;
-
- /*
- * One PGD for both encrypted and decrypted mappings and a set of
- * PUDs and PMDs for each of the encrypted and decrypted mappings.
- */
- pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
- pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
- if (initrd_len)
- pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
-
- /* PUDs and PMDs needed in the current pagetables for the workarea */
- pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
-
- /*
- * The total workarea includes the executable encryption area and
- * the pagetable area. The start of the workarea is already 2MB
- * aligned, align the end of the workarea on a 2MB boundary so that
- * we don't try to create/allocate PTE entries from the workarea
- * before it is mapped.
- */
- workarea_len = execute_len + pgtable_area_len;
- workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
-
- /*
- * Set the address to the start of where newly created pagetable
- * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
- * structures are created when the workarea is added to the current
- * pagetables and when the new encrypted and decrypted kernel
- * mappings are populated.
- */
- ppd.pgtable_area = (void *)execute_end;
-
- /*
- * Make sure the current pagetable structure has entries for
- * addressing the workarea.
- */
- ppd.pgd = (pgd_t *)native_read_cr3_pa();
- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start;
- ppd.vaddr_end = workarea_end;
- sme_map_range_decrypted(&ppd);
-
- /* Flush the TLB - no globals so cr3 is enough */
- native_write_cr3(__native_read_cr3());
-
- /*
- * A new pagetable structure is being built to allow for the kernel
- * and initrd to be encrypted. It starts with an empty PGD that will
- * then be populated with new PUDs and PMDs as the encrypted and
- * decrypted kernel mappings are created.
- */
- ppd.pgd = ppd.pgtable_area;
- memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
- ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
-
- /*
- * A different PGD index/entry must be used to get different
- * pagetable entries for the decrypted mapping. Choose the next
- * PGD index and convert it to a virtual address to be used as
- * the base of the mapping.
- */
- decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
- if (initrd_len) {
- unsigned long check_base;
-
- check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
- decrypted_base = max(decrypted_base, check_base);
- }
- decrypted_base <<= PGDIR_SHIFT;
-
- /* Add encrypted kernel (identity) mappings */
- ppd.paddr = kernel_start;
- ppd.vaddr = kernel_start;
- ppd.vaddr_end = kernel_end;
- sme_map_range_encrypted(&ppd);
-
- /* Add decrypted, write-protected kernel (non-identity) mappings */
- ppd.paddr = kernel_start;
- ppd.vaddr = kernel_start + decrypted_base;
- ppd.vaddr_end = kernel_end + decrypted_base;
- sme_map_range_decrypted_wp(&ppd);
-
- if (initrd_len) {
- /* Add encrypted initrd (identity) mappings */
- ppd.paddr = initrd_start;
- ppd.vaddr = initrd_start;
- ppd.vaddr_end = initrd_end;
- sme_map_range_encrypted(&ppd);
- /*
- * Add decrypted, write-protected initrd (non-identity) mappings
- */
- ppd.paddr = initrd_start;
- ppd.vaddr = initrd_start + decrypted_base;
- ppd.vaddr_end = initrd_end + decrypted_base;
- sme_map_range_decrypted_wp(&ppd);
- }
-
- /* Add decrypted workarea mappings to both kernel mappings */
- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start;
- ppd.vaddr_end = workarea_end;
- sme_map_range_decrypted(&ppd);
-
- ppd.paddr = workarea_start;
- ppd.vaddr = workarea_start + decrypted_base;
- ppd.vaddr_end = workarea_end + decrypted_base;
- sme_map_range_decrypted(&ppd);
-
- /* Perform the encryption */
- sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
- kernel_len, workarea_start, (unsigned long)ppd.pgd);
-
- if (initrd_len)
- sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
- initrd_len, workarea_start,
- (unsigned long)ppd.pgd);
-
- /*
- * At this point we are running encrypted. Remove the mappings for
- * the decrypted areas - all that is needed for this is to remove
- * the PGD entry/entries.
- */
- ppd.vaddr = kernel_start + decrypted_base;
- ppd.vaddr_end = kernel_end + decrypted_base;
- sme_clear_pgd(&ppd);
-
- if (initrd_len) {
- ppd.vaddr = initrd_start + decrypted_base;
- ppd.vaddr_end = initrd_end + decrypted_base;
- sme_clear_pgd(&ppd);
- }
-
- ppd.vaddr = workarea_start + decrypted_base;
- ppd.vaddr_end = workarea_end + decrypted_base;
- sme_clear_pgd(&ppd);
-
- /* Flush the TLB - no globals so cr3 is enough */
- native_write_cr3(__native_read_cr3());
-}
-
-void __init __nostackprotector sme_enable(struct boot_params *bp)
-{
- const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
- unsigned int eax, ebx, ecx, edx;
- unsigned long feature_mask;
- bool active_by_default;
- unsigned long me_mask;
- char buffer[16];
- u64 msr;
-
- /* Check for the SME/SEV support leaf */
- eax = 0x80000000;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- if (eax < 0x8000001f)
- return;
-
-#define AMD_SME_BIT BIT(0)
-#define AMD_SEV_BIT BIT(1)
- /*
- * Set the feature mask (SME or SEV) based on whether we are
- * running under a hypervisor.
- */
- eax = 1;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
-
- /*
- * Check for the SME/SEV feature:
- * CPUID Fn8000_001F[EAX]
- * - Bit 0 - Secure Memory Encryption support
- * - Bit 1 - Secure Encrypted Virtualization support
- * CPUID Fn8000_001F[EBX]
- * - Bits 5:0 - Pagetable bit position used to indicate encryption
- */
- eax = 0x8000001f;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- if (!(eax & feature_mask))
- return;
-
- me_mask = 1UL << (ebx & 0x3f);
-
- /* Check if memory encryption is enabled */
- if (feature_mask == AMD_SME_BIT) {
- /* For SME, check the SYSCFG MSR */
- msr = __rdmsr(MSR_K8_SYSCFG);
- if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
- return;
- } else {
- /* For SEV, check the SEV MSR */
- msr = __rdmsr(MSR_AMD64_SEV);
- if (!(msr & MSR_AMD64_SEV_ENABLED))
- return;
-
- /* SEV state cannot be controlled by a command line option */
- sme_me_mask = me_mask;
- sev_enabled = true;
- return;
- }
-
- /*
- * Fixups have not been applied to phys_base yet and we're running
- * identity mapped, so we must obtain the address to the SME command
- * line argument data using rip-relative addressing.
- */
- asm ("lea sme_cmdline_arg(%%rip), %0"
- : "=r" (cmdline_arg)
- : "p" (sme_cmdline_arg));
- asm ("lea sme_cmdline_on(%%rip), %0"
- : "=r" (cmdline_on)
- : "p" (sme_cmdline_on));
- asm ("lea sme_cmdline_off(%%rip), %0"
- : "=r" (cmdline_off)
- : "p" (sme_cmdline_off));
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
- active_by_default = true;
- else
- active_by_default = false;
-
- cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
- ((u64)bp->ext_cmd_line_ptr << 32));
-
- cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
-
- if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
- sme_me_mask = me_mask;
- else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
- sme_me_mask = 0;
- else
- sme_me_mask = active_by_default ? me_mask : 0;
-}
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
new file mode 100644
index 0000000..b4139c5
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -0,0 +1,564 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+/*
+ * Since we're dealing with identity mappings, physical and virtual
+ * addresses are the same, so override these defines which are ultimately
+ * used by the headers in misc.h.
+ */
+#define __pa(x) ((unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x)))
+
+/*
+ * Special hack: we have to be careful, because no indirections are
+ * allowed here, and paravirt_ops is a kind of one. As it will only run in
+ * baremetal anyway, we just keep it from happening. (This list needs to
+ * be extended when new paravirt and debugging variants are added.)
+ */
+#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_SPINLOCKS
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/cmdline.h>
+
+#include "mm_internal.h"
+
+#define PGD_FLAGS _KERNPG_TABLE_NOENC
+#define P4D_FLAGS _KERNPG_TABLE_NOENC
+#define PUD_FLAGS _KERNPG_TABLE_NOENC
+#define PMD_FLAGS _KERNPG_TABLE_NOENC
+
+#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
+#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
+ (_PAGE_PAT | _PAGE_PWT))
+
+#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
+
+#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
+
+#define PTE_FLAGS_DEC PTE_FLAGS
+#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
+ (_PAGE_PAT | _PAGE_PWT))
+
+#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
+
+struct sme_populate_pgd_data {
+ void *pgtable_area;
+ pgd_t *pgd;
+
+ pmdval_t pmd_flags;
+ pteval_t pte_flags;
+ unsigned long paddr;
+
+ unsigned long vaddr;
+ unsigned long vaddr_end;
+};
+
+static char sme_cmdline_arg[] __initdata = "mem_encrypt";
+static char sme_cmdline_on[] __initdata = "on";
+static char sme_cmdline_off[] __initdata = "off";
+
+static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+{
+ unsigned long pgd_start, pgd_end, pgd_size;
+ pgd_t *pgd_p;
+
+ pgd_start = ppd->vaddr & PGDIR_MASK;
+ pgd_end = ppd->vaddr_end & PGDIR_MASK;
+
+ pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);
+
+ pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
+
+ memset(pgd_p, 0, pgd_size);
+}
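
Note on the arithmetic above: sme_clear_pgd() zeroes every PGD slot spanned by the alias range, counting both endpoints. A minimal stand-alone sketch of that slot count, using 4-level constants and made-up addresses purely for illustration (not taken from the patch):

#include <stdio.h>

#define PGDIR_SHIFT  39
#define PGDIR_SIZE   (1UL << PGDIR_SHIFT)
#define PGDIR_MASK   (~(PGDIR_SIZE - 1))

int main(void)
{
	unsigned long vaddr     = 3UL << PGDIR_SHIFT;            /* alias start */
	unsigned long vaddr_end = (5UL << PGDIR_SHIFT) + 4096;   /* alias end */
	unsigned long pgd_start = vaddr & PGDIR_MASK;
	unsigned long pgd_end   = vaddr_end & PGDIR_MASK;
	unsigned long slots     = (pgd_end - pgd_start) / PGDIR_SIZE + 1;

	/* The range spans PGD slots 3, 4 and 5, so three entries get cleared. */
	printf("clearing %lu PGD entries\n", slots);
	return 0;
}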
+
+static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = ppd->pgd + pgd_index(ppd->vaddr);
+ if (pgd_none(*pgd)) {
+ p4d = ppd->pgtable_area;
+ memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D);
+ ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D;
+ set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d)));
+ }
+
+ p4d = p4d_offset(pgd, ppd->vaddr);
+ if (p4d_none(*p4d)) {
+ pud = ppd->pgtable_area;
+ memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
+ ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD;
+ set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud)));
+ }
+
+ pud = pud_offset(p4d, ppd->vaddr);
+ if (pud_none(*pud)) {
+ pmd = ppd->pgtable_area;
+ memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
+ ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD;
+ set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
+ }
+
+ if (pud_large(*pud))
+ return NULL;
+
+ return pud;
+}
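
sme_prepare_pgd() walks the hierarchy and carves any missing P4D/PUD/PMD table out of the preallocated ppd->pgtable_area, bump-allocator style, because no memory allocator is available this early. A hedged user-space sketch of that allocation pattern; table_area, alloc_table() and the page-sized granularity are illustrative assumptions, not kernel API:

#include <stdio.h>
#include <string.h>

#define TABLE_BYTES 4096                            /* one page per pagetable */

static unsigned char table_area[8 * TABLE_BYTES];   /* preallocated workarea */
static unsigned char *next_table = table_area;      /* bump pointer */

static void *alloc_table(void)
{
	void *t = next_table;

	memset(t, 0, TABLE_BYTES);          /* a new level starts out empty */
	next_table += TABLE_BYTES;          /* advance; nothing is ever freed */
	return t;
}

int main(void)
{
	/* One allocation per level that turned out to be missing. */
	void *p4d = alloc_table();
	void *pud = alloc_table();
	void *pmd = alloc_table();

	printf("used %zu of %zu workarea bytes\n",
	       (size_t)(next_table - table_area), sizeof(table_area));
	return p4d && pud && pmd ? 0 : 1;
}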
+
+static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pud = sme_prepare_pgd(ppd);
+ if (!pud)
+ return;
+
+ pmd = pmd_offset(pud, ppd->vaddr);
+ if (pmd_large(*pmd))
+ return;
+
+ set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
+}
+
+static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pud = sme_prepare_pgd(ppd);
+ if (!pud)
+ return;
+
+ pmd = pmd_offset(pud, ppd->vaddr);
+ if (pmd_none(*pmd)) {
+ pte = ppd->pgtable_area;
+ memset(pte, 0, sizeof(pte) * PTRS_PER_PTE);
+ ppd->pgtable_area += sizeof(pte) * PTRS_PER_PTE;
+ set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte)));
+ }
+
+ if (pmd_large(*pmd))
+ return;
+
+ pte = pte_offset_map(pmd, ppd->vaddr);
+ if (pte_none(*pte))
+ set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
+}
+
+static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
+{
+ while (ppd->vaddr < ppd->vaddr_end) {
+ sme_populate_pgd_large(ppd);
+
+ ppd->vaddr += PMD_PAGE_SIZE;
+ ppd->paddr += PMD_PAGE_SIZE;
+ }
+}
+
+static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
+{
+ while (ppd->vaddr < ppd->vaddr_end) {
+ sme_populate_pgd(ppd);
+
+ ppd->vaddr += PAGE_SIZE;
+ ppd->paddr += PAGE_SIZE;
+ }
+}
+
+static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
+ pmdval_t pmd_flags, pteval_t pte_flags)
+{
+ unsigned long vaddr_end;
+
+ ppd->pmd_flags = pmd_flags;
+ ppd->pte_flags = pte_flags;
+
+ /* Save original end value since we modify the struct value */
+ vaddr_end = ppd->vaddr_end;
+
+ /* If start is not 2MB aligned, create PTE entries */
+ ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
+ __sme_map_range_pte(ppd);
+
+ /* Create PMD entries */
+ ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
+ __sme_map_range_pmd(ppd);
+
+ /* If end is not 2MB aligned, create PTE entries */
+ ppd->vaddr_end = vaddr_end;
+ __sme_map_range_pte(ppd);
+}
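
__sme_map_range() splits a possibly unaligned range into three pieces: an unaligned head mapped with 4K PTEs, a 2MB-aligned body mapped with PMD large pages, and an unaligned tail mapped with PTEs again. A small sketch of just that split, with made-up addresses:

#include <stdio.h>

#define PMD_PAGE_SIZE  (2UL << 20)                        /* 2MB */
#define PMD_PAGE_MASK  (~(PMD_PAGE_SIZE - 1))
#define ALIGN(x, a)    (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long vaddr = 0x1ff000, vaddr_end = 0x1001000;   /* made-up range */
	unsigned long head_end = ALIGN(vaddr, PMD_PAGE_SIZE);    /* PTEs up to here */
	unsigned long body_end = vaddr_end & PMD_PAGE_MASK;      /* 2MB PMDs up to here */

	printf("PTE head: %#lx-%#lx\n", vaddr, head_end);
	printf("PMD body: %#lx-%#lx\n", head_end, body_end);
	printf("PTE tail: %#lx-%#lx\n", body_end, vaddr_end);
	return 0;
}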
+
+static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
+}
+
+static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
+}
+
+static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long len)
+{
+ unsigned long entries = 0, tables = 0;
+
+ /*
+ * Perform a relatively simplistic calculation of the pagetable
+ * entries that are needed. Those mappings will be covered mostly
+ * by 2MB PMD entries so we can conservatively calculate the required
+ * number of P4D, PUD and PMD structures needed to perform the
+ * mappings. For mappings that are not 2MB aligned, PTE mappings
+ * would be needed for the start and end portion of the address range
+ * that fall outside of the 2MB alignment. This results in, at most,
+ * two extra pages to hold PTE entries for each range that is mapped.
+ * Incrementing the count for each covers the case where the addresses
+ * cross entries.
+ */
+
+ /* PGDIR_SIZE is equal to P4D_SIZE on 4-level machine. */
+ if (PTRS_PER_P4D > 1)
+ entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D;
+ entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD;
+ entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD;
+ entries += 2 * sizeof(pte_t) * PTRS_PER_PTE;
+
+ /*
+ * Now calculate the added pagetable structures needed to populate
+ * the new pagetables.
+ */
+
+ if (PTRS_PER_P4D > 1)
+ tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D;
+ tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD;
+ tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD;
+
+ return entries + tables;
+}
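
sme_pgtable_calc() deliberately over-estimates: one extra table per level per range, the P4D term only when the level is not folded, plus the tables needed to map the new tables themselves. A hedged worked example with 4-level constants; the 64MB length and the sizes below are illustrative:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
#define P4D_SIZE   (1UL << 39)   /* == PGDIR_SIZE with the p4d level folded */
#define PUD_SIZE   (1UL << 30)
#define ENTRY      8UL           /* sizeof(pud_t) == sizeof(pmd_t) == sizeof(pte_t) */
#define PTRS       512UL         /* entries per table */

int main(void)
{
	unsigned long len = 64UL << 20;          /* illustrative 64MB range */
	unsigned long entries = 0, tables = 0;

	entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * ENTRY * PTRS;   /* PUD pages */
	entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * ENTRY * PTRS;   /* PMD pages */
	entries += 2 * ENTRY * PTRS;                       /* head/tail PTE pages */

	tables += DIV_ROUND_UP(entries, P4D_SIZE) * ENTRY * PTRS;
	tables += DIV_ROUND_UP(entries, PUD_SIZE) * ENTRY * PTRS;

	printf("conservative pagetable estimate: %lu bytes\n", entries + tables);
	return 0;
}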
+
+void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp)
+{
+ unsigned long workarea_start, workarea_end, workarea_len;
+ unsigned long execute_start, execute_end, execute_len;
+ unsigned long kernel_start, kernel_end, kernel_len;
+ unsigned long initrd_start, initrd_end, initrd_len;
+ struct sme_populate_pgd_data ppd;
+ unsigned long pgtable_area_len;
+ unsigned long decrypted_base;
+
+ if (!sme_active())
+ return;
+
+ /*
+ * Prepare for encrypting the kernel and initrd by building new
+ * pagetables with the necessary attributes needed to encrypt the
+ * kernel in place.
+ *
+ * One range of virtual addresses will map the memory occupied
+ * by the kernel and initrd as encrypted.
+ *
+ * Another range of virtual addresses will map the memory occupied
+ * by the kernel and initrd as decrypted and write-protected.
+ *
+ * The use of write-protect attribute will prevent any of the
+ * memory from being cached.
+ */
+
+	/* Physical addresses give us the identity-mapped virtual addresses */
+ kernel_start = __pa_symbol(_text);
+ kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+ kernel_len = kernel_end - kernel_start;
+
+ initrd_start = 0;
+ initrd_end = 0;
+ initrd_len = 0;
+#ifdef CONFIG_BLK_DEV_INITRD
+ initrd_len = (unsigned long)bp->hdr.ramdisk_size |
+ ((unsigned long)bp->ext_ramdisk_size << 32);
+ if (initrd_len) {
+ initrd_start = (unsigned long)bp->hdr.ramdisk_image |
+ ((unsigned long)bp->ext_ramdisk_image << 32);
+ initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
+ initrd_len = initrd_end - initrd_start;
+ }
+#endif
+
+ /* Set the encryption workarea to be immediately after the kernel */
+ workarea_start = kernel_end;
+
+ /*
+ * Calculate required number of workarea bytes needed:
+ * executable encryption area size:
+ * stack page (PAGE_SIZE)
+ * encryption routine page (PAGE_SIZE)
+ * intermediate copy buffer (PMD_PAGE_SIZE)
+ * pagetable structures for the encryption of the kernel
+ * pagetable structures for workarea (in case not currently mapped)
+ */
+ execute_start = workarea_start;
+ execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+ execute_len = execute_end - execute_start;
+
+ /*
+ * One PGD for both encrypted and decrypted mappings and a set of
+ * PUDs and PMDs for each of the encrypted and decrypted mappings.
+ */
+ pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+ pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+ if (initrd_len)
+ pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
+
+ /* PUDs and PMDs needed in the current pagetables for the workarea */
+ pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+ /*
+ * The total workarea includes the executable encryption area and
+ * the pagetable area. The start of the workarea is already 2MB
+ * aligned, align the end of the workarea on a 2MB boundary so that
+ * we don't try to create/allocate PTE entries from the workarea
+ * before it is mapped.
+ */
+ workarea_len = execute_len + pgtable_area_len;
+ workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
+
+ /*
+ * Set the address to the start of where newly created pagetable
+ * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+ * structures are created when the workarea is added to the current
+ * pagetables and when the new encrypted and decrypted kernel
+ * mappings are populated.
+ */
+ ppd.pgtable_area = (void *)execute_end;
+
+ /*
+ * Make sure the current pagetable structure has entries for
+ * addressing the workarea.
+ */
+ ppd.pgd = (pgd_t *)native_read_cr3_pa();
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start;
+ ppd.vaddr_end = workarea_end;
+ sme_map_range_decrypted(&ppd);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+
+ /*
+ * A new pagetable structure is being built to allow for the kernel
+ * and initrd to be encrypted. It starts with an empty PGD that will
+ * then be populated with new PUDs and PMDs as the encrypted and
+ * decrypted kernel mappings are created.
+ */
+ ppd.pgd = ppd.pgtable_area;
+ memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
+ ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
+
+ /*
+ * A different PGD index/entry must be used to get different
+ * pagetable entries for the decrypted mapping. Choose the next
+ * PGD index and convert it to a virtual address to be used as
+ * the base of the mapping.
+ */
+ decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+ if (initrd_len) {
+ unsigned long check_base;
+
+ check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
+ decrypted_base = max(decrypted_base, check_base);
+ }
+ decrypted_base <<= PGDIR_SHIFT;
+
+ /* Add encrypted kernel (identity) mappings */
+ ppd.paddr = kernel_start;
+ ppd.vaddr = kernel_start;
+ ppd.vaddr_end = kernel_end;
+ sme_map_range_encrypted(&ppd);
+
+ /* Add decrypted, write-protected kernel (non-identity) mappings */
+ ppd.paddr = kernel_start;
+ ppd.vaddr = kernel_start + decrypted_base;
+ ppd.vaddr_end = kernel_end + decrypted_base;
+ sme_map_range_decrypted_wp(&ppd);
+
+ if (initrd_len) {
+ /* Add encrypted initrd (identity) mappings */
+ ppd.paddr = initrd_start;
+ ppd.vaddr = initrd_start;
+ ppd.vaddr_end = initrd_end;
+ sme_map_range_encrypted(&ppd);
+ /*
+ * Add decrypted, write-protected initrd (non-identity) mappings
+ */
+ ppd.paddr = initrd_start;
+ ppd.vaddr = initrd_start + decrypted_base;
+ ppd.vaddr_end = initrd_end + decrypted_base;
+ sme_map_range_decrypted_wp(&ppd);
+ }
+
+ /* Add decrypted workarea mappings to both kernel mappings */
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start;
+ ppd.vaddr_end = workarea_end;
+ sme_map_range_decrypted(&ppd);
+
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start + decrypted_base;
+ ppd.vaddr_end = workarea_end + decrypted_base;
+ sme_map_range_decrypted(&ppd);
+
+ /* Perform the encryption */
+ sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+ kernel_len, workarea_start, (unsigned long)ppd.pgd);
+
+ if (initrd_len)
+ sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
+ initrd_len, workarea_start,
+ (unsigned long)ppd.pgd);
+
+ /*
+ * At this point we are running encrypted. Remove the mappings for
+ * the decrypted areas - all that is needed for this is to remove
+ * the PGD entry/entries.
+ */
+ ppd.vaddr = kernel_start + decrypted_base;
+ ppd.vaddr_end = kernel_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+
+ if (initrd_len) {
+ ppd.vaddr = initrd_start + decrypted_base;
+ ppd.vaddr_end = initrd_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+ }
+
+ ppd.vaddr = workarea_start + decrypted_base;
+ ppd.vaddr_end = workarea_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+}
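
The decrypted_base logic above picks the first PGD slot past both the workarea and the initrd, so the decrypted alias never shares a PGD entry with the identity mappings and can later be torn down simply by clearing whole PGD entries. A sketch of that index choice in isolation; the addresses and the 4-level PGDIR_SHIFT are illustrative:

#include <stdio.h>

#define PGDIR_SHIFT   39
#define PTRS_PER_PGD  512UL
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

int main(void)
{
	unsigned long workarea_end = 0x8000000UL;   /* made-up identity addresses */
	unsigned long initrd_end   = 0x4000000UL;
	unsigned long base, check;

	base  = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
	check = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
	if (check > base)
		base = check;                        /* max() of the candidates */

	printf("decrypted alias base: %#lx (PGD slot %lu)\n",
	       base << PGDIR_SHIFT, base);
	return 0;
}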
+
+void __init __nostackprotector sme_enable(struct boot_params *bp)
+{
+ const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned long feature_mask;
+ bool active_by_default;
+ unsigned long me_mask;
+ char buffer[16];
+ u64 msr;
+
+ /* Check for the SME/SEV support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return;
+
+#define AMD_SME_BIT BIT(0)
+#define AMD_SEV_BIT BIT(1)
+ /*
+ * Set the feature mask (SME or SEV) based on whether we are
+ * running under a hypervisor.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (!(eax & feature_mask))
+ return;
+
+ me_mask = 1UL << (ebx & 0x3f);
+
+ /* Check if memory encryption is enabled */
+ if (feature_mask == AMD_SME_BIT) {
+ /* For SME, check the SYSCFG MSR */
+ msr = __rdmsr(MSR_K8_SYSCFG);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+ } else {
+ /* For SEV, check the SEV MSR */
+ msr = __rdmsr(MSR_AMD64_SEV);
+ if (!(msr & MSR_AMD64_SEV_ENABLED))
+ return;
+
+ /* SEV state cannot be controlled by a command line option */
+ sme_me_mask = me_mask;
+ sev_enabled = true;
+ return;
+ }
+
+ /*
+ * Fixups have not been applied to phys_base yet and we're running
+ * identity mapped, so we must obtain the address to the SME command
+ * line argument data using rip-relative addressing.
+ */
+ asm ("lea sme_cmdline_arg(%%rip), %0"
+ : "=r" (cmdline_arg)
+ : "p" (sme_cmdline_arg));
+ asm ("lea sme_cmdline_on(%%rip), %0"
+ : "=r" (cmdline_on)
+ : "p" (sme_cmdline_on));
+ asm ("lea sme_cmdline_off(%%rip), %0"
+ : "=r" (cmdline_off)
+ : "p" (sme_cmdline_off));
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
+ active_by_default = true;
+ else
+ active_by_default = false;
+
+ cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+ ((u64)bp->ext_cmd_line_ptr << 32));
+
+ cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+
+ if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
+ sme_me_mask = me_mask;
+ else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
+ sme_me_mask = 0;
+ else
+ sme_me_mask = active_by_default ? me_mask : 0;
+}
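
sme_enable() keys off CPUID leaf 1 ECX[31] (the hypervisor bit) to decide whether to test for SME on bare metal or SEV in a guest, then confirms the state via the corresponding MSR. The CPUID half of that probe can be reproduced from user space; a hedged sketch for GCC/clang on x86 (the MSR reads need ring 0 and are omitted):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) ||
	    eax < 0x8000001f) {
		puts("no SME/SEV support leaf");
		return 0;
	}

	__get_cpuid(1, &eax, &ebx, &ecx, &edx);
	printf("hypervisor bit (ECX[31]): %s\n",
	       (ecx & (1U << 31)) ? "set, would check SEV" : "clear, would check SME");

	__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
	printf("SME: %u  SEV: %u  encryption bit position: %u\n",
	       eax & 1, (eax >> 1) & 1, ebx & 0x3f);
	return 0;
}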
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 7f1a513..e055d1a 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -157,7 +157,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
unsigned long sp = current_stack_pointer;
pgd_t *pgd = pgd_offset(mm, sp);
- if (CONFIG_PGTABLE_LEVELS > 4) {
+ if (pgtable_l5_enabled) {
if (unlikely(pgd_none(*pgd))) {
pgd_t *pgd_ref = pgd_offset_k(sp);
@@ -613,7 +613,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
{
int cpu;
- struct flush_tlb_info info = {
+ struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
};
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index c310a82..4845871 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -27,6 +27,7 @@
#include <linux/ioport.h>
#include <linux/mc146818rtc.h>
#include <linux/efi.h>
+#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/reboot.h>
@@ -190,7 +191,8 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
early_code_mapping_set_exec(0);
}
-static pgd_t *efi_pgd;
+pgd_t *efi_pgd;
+EXPORT_SYMBOL_GPL(efi_pgd);
/*
* We need our own copy of the higher levels of the page tables
@@ -225,7 +227,7 @@ int __init efi_alloc_page_tables(void)
pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
if (!pud) {
- if (CONFIG_PGTABLE_LEVELS > 4)
+ if (pgtable_l5_enabled)
free_page((unsigned long) pgd_page_vaddr(*pgd));
free_page((unsigned long)efi_pgd);
return -ENOMEM;
@@ -255,8 +257,8 @@ void efi_sync_low_kernel_mappings(void)
* only span a single PGD entry and that the entry also maps
* other important kernel regions.
*/
- BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
- BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
+ MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
+ MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
(EFI_VA_END & PGDIR_MASK));
pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET);
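
The BUILD_BUG_ON -> MAYBE_BUILD_BUG_ON change reflects that pgd_index() and PGDIR_MASK are no longer compile-time constants once the paging mode is chosen at boot, so the sanity check has to be able to run at runtime. A hedged sketch of that compile-time/runtime split, built for x86-64; the macro bodies are illustrative, not the kernel's definitions:

#include <assert.h>
#include <stdio.h>

#define STATIC_ASSERT(cond)  _Static_assert(cond, #cond)    /* compile-time */
#define MAYBE_ASSERT(cond)   assert(cond)                    /* runtime */

static int pgdir_shift = 39;             /* chosen at "boot": 39 or 48 */

int main(void)
{
	STATIC_ASSERT(sizeof(void *) == 8);                      /* still a constant */
	MAYBE_ASSERT(pgdir_shift == 39 || pgdir_shift == 48);    /* now dynamic */
	puts("layout checks passed");
	return 0;
}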
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 0ef5e520..74a5329 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -50,7 +50,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
{
pmd_t *pmd;
pud_t *pud;
- p4d_t *p4d;
+ p4d_t *p4d = NULL;
/*
* The new mapping only has to cover the page containing the image
@@ -66,7 +66,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
* tables used by the image kernel.
*/
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (pgtable_l5_enabled) {
p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
if (!p4d)
return -ENOMEM;
@@ -84,7 +84,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
__pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC));
set_pud(pud + pud_index(restore_jump_address),
__pud(__pa(pmd) | _KERNPG_TABLE));
- if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ if (p4d) {
set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE));
set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE));
} else {
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index f605825..c1f98f3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -18,9 +18,6 @@ config XEN_PV
bool "Xen PV guest support"
default y
depends on XEN
- # XEN_PV is not ready to work with 5-level paging.
- # Changes to hypervisor are also required.
- depends on !X86_5LEVEL
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
help
@@ -79,6 +76,4 @@ config XEN_DEBUG_FS
config XEN_PVH
bool "Support for running as a PVH guest"
depends on XEN && XEN_PVHVM && ACPI
- # Pre-built page tables are not ready to handle 5-level paging.
- depends on !X86_5LEVEL
def_bool n
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index aae88fe..d207634 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -538,6 +538,22 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+__visible p4dval_t xen_p4d_val(p4d_t p4d)
+{
+ return pte_mfn_to_pfn(p4d.p4d);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val);
+
+__visible p4d_t xen_make_p4d(p4dval_t p4d)
+{
+ p4d = pte_pfn_to_mfn(p4d);
+
+ return native_make_p4d(p4d);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d);
+#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
#endif /* CONFIG_X86_64 */
static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
@@ -2411,6 +2427,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+ .p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
+ .make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
+#endif
#endif /* CONFIG_X86_64 */
.activate_mm = xen_activate_mm,
diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
index dfbd9d9..9c2e070 100644
--- a/include/asm-generic/5level-fixup.h
+++ b/include/asm-generic/5level-fixup.h
@@ -8,6 +8,7 @@
#define P4D_SHIFT PGDIR_SHIFT
#define P4D_SIZE PGDIR_SIZE
#define P4D_MASK PGDIR_MASK
+#define MAX_PTRS_PER_P4D 1
#define PTRS_PER_P4D 1
#define p4d_t pgd_t
diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h
index 8f22f55..1a29b2a 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -8,10 +8,11 @@
typedef struct { pgd_t pgd; } p4d_t;
-#define P4D_SHIFT PGDIR_SHIFT
-#define PTRS_PER_P4D 1
-#define P4D_SIZE (1UL << P4D_SHIFT)
-#define P4D_MASK (~(P4D_SIZE-1))
+#define P4D_SHIFT PGDIR_SHIFT
+#define MAX_PTRS_PER_P4D 1
+#define PTRS_PER_P4D 1
+#define P4D_SIZE (1UL << P4D_SHIFT)
+#define P4D_MASK (~(P4D_SIZE-1))
/*
* The "pgd_xxx()" functions here are trivial for a folded two-level
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index adc1347..d6459bd 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -18,7 +18,7 @@ extern unsigned char kasan_zero_page[PAGE_SIZE];
extern pte_t kasan_zero_pte[PTRS_PER_PTE];
extern pmd_t kasan_zero_pmd[PTRS_PER_PMD];
extern pud_t kasan_zero_pud[PTRS_PER_PUD];
-extern p4d_t kasan_zero_p4d[PTRS_PER_P4D];
+extern p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D];
void kasan_populate_zero_shadow(const void *shadow_start,
const void *shadow_end);
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index 554e4c0..f436246 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -31,7 +31,7 @@
unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
#if CONFIG_PGTABLE_LEVELS > 4
-p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss;
+p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
#endif
#if CONFIG_PGTABLE_LEVELS > 3
pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
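
Because PTRS_PER_P4D becomes a boot-time value with dynamic paging, statically sized arrays such as kasan_zero_p4d switch to the compile-time maximum MAX_PTRS_PER_P4D, while only the runtime number of entries is actually used. A hedged sketch of the pattern with stand-in names:

#include <stdio.h>

#define MAX_ENTRIES 512                       /* compile-time worst case (5-level) */

static unsigned long zero_table[MAX_ENTRIES]; /* sized like kasan_zero_p4d */

int main(void)
{
	int l5_enabled = 0;                       /* decided at "boot" */
	int in_use = l5_enabled ? 512 : 1;        /* folded level has one entry */
	int i;

	for (i = 0; i < in_use; i++)
		zero_table[i] = 0;                /* only the live entries matter */

	printf("array holds %d entries, %d in use\n", MAX_ENTRIES, in_use);
	return 0;
}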
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c301350..b7f61cd 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -84,18 +84,19 @@
* This is made more complicated by various memory models and PAE.
*/
-#ifndef MAX_PHYSMEM_BITS
-#ifdef CONFIG_HIGHMEM64G
-#define MAX_PHYSMEM_BITS 36
-#else /* !CONFIG_HIGHMEM64G */
+#ifndef MAX_POSSIBLE_PHYSMEM_BITS
+#ifdef MAX_PHYSMEM_BITS
+#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
+#else
/*
* If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
* be PAGE_SHIFT
*/
-#define MAX_PHYSMEM_BITS BITS_PER_LONG
+#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif
-#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
+
+#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
/*
* Memory for allocating for handle keeps object position by