From fa2bbce985ca97943305cdc81d9626e6810ed7f2 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 24 Jan 2013 12:19:49 -0800
Subject: x86, 64bit: Copy struct boot_params early

We want to support struct boot_params (formerly known as the zero-page,
or real-mode data) above the 4 GiB mark.  We will have a #PF handler
that sets up page tables early for not-yet-accessible RAM, but we want
to confine its use to the code before x86_64_start_reservations, to
limit the change to the native path only.

Also, we will need the ramdisk information in struct boot_params to
access the microcode blob in the ramdisk from x86_64_start_kernel, so
copying struct boot_params early keeps that access simple.

Signed-off-by: Yinghai Lu
Link: http://lkml.kernel.org/r/1359058816-7615-9-git-send-email-yinghai@kernel.org
Cc: Alexander Duyck
Cc: Fenghua Yu
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/head64.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel/head64.c')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 849fc9e..7785e668 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -89,6 +89,8 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	}
 	load_idt((const struct desc_ptr *)&idt_descr);
 
+	copy_bootdata(__va(real_mode_data));
+
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
@@ -97,7 +99,9 @@ void __init x86_64_start_kernel(char * real_mode_data)
 
 void __init x86_64_start_reservations(char *real_mode_data)
 {
-	copy_bootdata(__va(real_mode_data));
+	/* version is always not zero if it is copied */
+	if (!boot_params.hdr.version)
+		copy_bootdata(__va(real_mode_data));
 
 	memblock_reserve(__pa_symbol(&_text),
 			 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
--
cgit v1.1
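[ Editor's note: a minimal sketch of the control flow this patch creates.
  The function names are taken from the patch; the elided bodies and the
  remark about non-native entry are assumptions for illustration, not a
  quote of the kernel source. ]

	/* Native 64-bit entry path: copies boot_params as early as possible. */
	void __init x86_64_start_kernel(char *real_mode_data)
	{
		/* ... IDT and BSS setup elided ... */
		copy_bootdata(__va(real_mode_data));	/* hdr.version becomes non-zero */
		x86_64_start_reservations(real_mode_data);
	}

	/* Also entered directly by non-native (e.g. paravirt) boot paths. */
	void __init x86_64_start_reservations(char *real_mode_data)
	{
		/* version is always non-zero once boot_params has been copied */
		if (!boot_params.hdr.version)
			copy_bootdata(__va(real_mode_data));
		/* ... memblock reservations, then start_kernel() ... */
	}

The version check makes the copy idempotent: whichever entry point runs
first performs the copy exactly once, so the native path gets boot_params
early without breaking callers that enter at x86_64_start_reservations.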
From 8170e6bed465b4b0c7687f93e9948aca4358a33b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin"
Date: Thu, 24 Jan 2013 12:19:52 -0800
Subject: x86, 64bit: Use a #PF handler to materialize early mappings on demand

Linear mode (CR0.PG = 0) is mutually exclusive with 64-bit mode; all
64-bit code has to use page tables.  This makes it awkward, before we
have set up properly all-covering page tables, to access objects that
are outside the static kernel range.

So far we have dealt with that simply by mapping a fixed amount of low
memory, but that fails in at least two upcoming use cases:

1. We will support loading and running the kernel, struct boot_params,
   ramdisk, command line, etc. above the 4 GiB mark.
2. We need to access the ramdisk early to get the microcode blob and
   apply that update as early as possible.

We could use early_ioremap to access them too, but that would make the
code messy and hard to unify with 32-bit.  Hence, set up a #PF handler
and use a fixed number of buffers to build page tables on demand.  If
the buffers fill up, we simply flush them and start over.  These
buffers are all in __initdata, so they do not increase RAM usage at
runtime.

Thus, with the help of the #PF handler, we can set up the final kernel
mapping from a blank slate, and switch to init_level4_pgt later.

During the switchover in head_64.S, before the #PF handler is
available, we use three pages to handle the kernel crossing the 1G and
512G boundaries with a shared page, by playing games with page
aliasing: the same page is mapped twice in the higher-level tables with
appropriate wraparound.  The kernel region itself will be properly
mapped; other mappings may be spurious.

early_make_pgtable uses the kernel's high-mapping addresses to access
the pages it uses to build the page tables.

-v4: Add phys_base offset to make kexec happy, and add
     init_mapping_kernel(). - Yinghai
-v5: Fix compiling with Xen, and add back the ident level3 and level2
     for Xen; also move init_level4_pgt back from BSS to DATA again,
     because we have to clear it anyway. - Yinghai
-v6: Switch to init_level4_pgt in init_mem_mapping. - Yinghai
-v7: Remove the unneeded clear_page for init_level4_pgt; it is already
     cleared with .fill 512,8,0 in head_64.S. - Yinghai
-v8: We need to keep the handler alive until init_mem_mapping, and must
     not let early_trap_init trash the early #PF handler; so split
     early_trap_pf_init out and move it down. - Yinghai
-v9: Make the switchover cover only the kernel space instead of 1G, to
     avoid touching possible memory holes. - Yinghai
-v11: Change the far jmp back to a far return to initial_code; this is
      needed to fix the failure reported by Konrad on AMD systems.
      - Yinghai

Signed-off-by: Yinghai Lu
Link: http://lkml.kernel.org/r/1359058816-7615-12-git-send-email-yinghai@kernel.org
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/head64.c | 81 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 74 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel/head64.c')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7785e668..f57df05 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,11 +27,73 @@
 #include
 #include
 
-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
 {
-	pgd_t *pgd = pgd_offset_k(0UL);
-	pgd_clear(pgd);
-	__flush_tlb_all();
+	unsigned long i;
+
+	for (i = 0; i < PTRS_PER_PGD-1; i++)
+		early_level4_pgt[i].pgd = 0;
+
+	next_early_pgt = 0;
+
+	write_cr3(__pa(early_level4_pgt));
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
+{
+	unsigned long physaddr = address - __PAGE_OFFSET;
+	unsigned long i;
+	pgdval_t pgd, *pgd_p;
+	pudval_t *pud_p;
+	pmdval_t pmd, *pmd_p;
+
+	/* Invalid address or early pgt is done ? */
+	if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+		return -1;
+
+	i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
+	pgd_p = &early_level4_pgt[i].pgd;
+	pgd = *pgd_p;
+
+	/*
+	 * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+	 * critical -- __PAGE_OFFSET would point us back into the dynamic
+	 * range and we might end up looping forever...
+	 */
+	if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) {
+		pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+	} else {
+		if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1)
+			reset_early_page_tables();
+
+		pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+		for (i = 0; i < PTRS_PER_PUD; i++)
+			pud_p[i] = 0;
+
+		*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+	}
+	i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+	pud_p += i;
+
+	pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+	pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd_p[i] = pmd;
+		pmd += PMD_SIZE;
+	}
+
+	*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+
+	return 0;
 }
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -72,12 +134,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
 				(__START_KERNEL & PGDIR_MASK)));
 	BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
+	/* Kill off the identity-map trampoline */
+	reset_early_page_tables();
+
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
-	/* Make NULL pointers segfault */
-	zap_identity_mappings();
-
+	/* XXX - this is wrong... we need to build page tables from scratch */
 	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
@@ -94,6 +157,10 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
+	clear_page(init_level4_pgt);
+	/* set init_level4_pgt kernel high mapping*/
+	init_level4_pgt[511] = early_level4_pgt[511];
+
 	x86_64_start_reservations(real_mode_data);
 }
 
--
cgit v1.1
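[ Editor's note: the early #PF path itself lives in head_64.S assembly;
  the C below is only a conceptual sketch of what it does.
  early_make_pgtable() and read_cr2() are real kernel symbols;
  early_pf_fixup() and early_fault_die() are invented names for
  illustration. ]

	/* Conceptual shape of the early page-fault path set up by this patch. */
	static void early_pf_fixup(unsigned long error_code)
	{
		unsigned long address = read_cr2();	/* faulting linear address */

		/*
		 * Try to materialize a mapping covering the faulting address
		 * out of the fixed pool of __initdata page-table buffers;
		 * on success, returning retries the faulting instruction.
		 */
		if (early_make_pgtable(address) == 0)
			return;

		/* Not a fixable early fault: fall through to the die path */
		early_fault_die(address, error_code);	/* invented name */
	}

If the buffer pool is exhausted, reset_early_page_tables() wipes
everything except the kernel symbol map and starts over; subsequent
faults then repopulate whatever mappings are still needed.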
From 6b9c75aca6cba4d99a6e8d8274b1788d4d4b50d9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 24 Jan 2013 12:19:53 -0800
Subject: x86, 64bit: #PF handler set page to cover only 2M per #PF

Only map a single 2 MiB page per #PF, even though we should be able to
do a full gigabyte at a time with no additional memory cost.

This is a workaround for a broken AMD reference BIOS (and its
derivatives in shipping systems) which maps a large chunk of memory as
WB in the MTRRs, but which will #MC if the processor wanders off and
tries to prefetch that memory; this can happen any time the memory is
mapped in the TLB.

Signed-off-by: Yinghai Lu
Link: http://lkml.kernel.org/r/1359058816-7615-13-git-send-email-yinghai@kernel.org
Cc: Alexander Duyck
[ hpa: rewrote the patch description ]
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/head64.c | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kernel/head64.c')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f57df05..816fc85 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -53,15 +53,15 @@ int __init early_make_pgtable(unsigned long address)
 	unsigned long physaddr = address - __PAGE_OFFSET;
 	unsigned long i;
 	pgdval_t pgd, *pgd_p;
-	pudval_t *pud_p;
+	pudval_t pud, *pud_p;
 	pmdval_t pmd, *pmd_p;
 
 	/* Invalid address or early pgt is done ? */
 	if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
 		return -1;
 
-	i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
-	pgd_p = &early_level4_pgt[i].pgd;
+again:
+	pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
 	pgd = *pgd_p;
 
 	/*
@@ -69,29 +69,37 @@ int __init early_make_pgtable(unsigned long address)
 	 * critical -- __PAGE_OFFSET would point us back into the dynamic
 	 * range and we might end up looping forever...
 	 */
-	if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) {
+	if (pgd)
 		pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
-	} else {
-		if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1)
+	else {
+		if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
 			reset_early_page_tables();
+			goto again;
+		}
 
 		pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
 		for (i = 0; i < PTRS_PER_PUD; i++)
 			pud_p[i] = 0;
-
 		*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
 	}
-	i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
-	pud_p += i;
-
-	pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
-	pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
-	for (i = 0; i < PTRS_PER_PMD; i++) {
-		pmd_p[i] = pmd;
-		pmd += PMD_SIZE;
-	}
+	pud_p += pud_index(address);
+	pud = *pud_p;
 
-	*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+	if (pud)
+		pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+	else {
+		if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+			reset_early_page_tables();
+			goto again;
+		}
+
+		pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+		for (i = 0; i < PTRS_PER_PMD; i++)
+			pmd_p[i] = 0;
+		*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+	}
+	pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+	pmd_p[pmd_index(address)] = pmd;
 
 	return 0;
 }
--
cgit v1.1
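[ Editor's note: a self-contained demonstration of the granularity this
  patch settles on: each #PF now fills exactly one 2 MiB PMD slot
  covering the faulting address. The constants mirror x86-64 paging;
  this is illustrative user-space C, not kernel code. ]

	#include <stdio.h>

	#define PMD_SHIFT	21
	#define PMD_SIZE	(1UL << PMD_SHIFT)	/* 2 MiB */
	#define PMD_MASK	(~(PMD_SIZE - 1))
	#define PTRS_PER_PMD	512

	/* Same index computation as pmd_index() in the kernel */
	static unsigned long pmd_index(unsigned long address)
	{
		return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
	}

	int main(void)
	{
		unsigned long address = 0xffff880012345678UL;	/* arbitrary example */

		printf("fault at %#lx -> PMD slot %lu, maps [%#lx, %#lx)\n",
		       address, pmd_index(address),
		       address & PMD_MASK, (address & PMD_MASK) + PMD_SIZE);
		return 0;
	}

Mapping one 2 MiB page per fault costs more faults than filling a whole
PUD's worth at once, but it never creates TLB entries for memory outside
the 2 MiB region that actually faulted, which is what keeps the
prefetcher away from the BIOS's broken WB-mapped ranges.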
From 100542306f644fc580857a8ca4896fb12b794d41 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 24 Jan 2013 12:19:54 -0800
Subject: x86, 64bit: Don't set max_pfn_mapped wrong value early on native path

max_pfn_mapped is not set correctly until init_memory_mapping, so don't
set (or print) a bogus initial value for it on 64-bit.

We also need to use KERNEL_IMAGE_SIZE directly for the highmap cleanup.

-v2: Update the comments about max_pfn_mapped according to Stefano
     Stabellini.

Signed-off-by: Yinghai Lu
Link: http://lkml.kernel.org/r/1359058816-7615-14-git-send-email-yinghai@kernel.org
Acked-by: Borislav Petkov
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/head64.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/kernel/head64.c')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 816fc85..f3b1968 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -148,9 +148,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
-	/* XXX - this is wrong... we need to build page tables from scratch */
-	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
-
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
 #ifdef CONFIG_EARLY_PRINTK
 		set_intr_gate(i, &early_idt_handlers[i]);
--
cgit v1.1

From 1b8c78be01203e1c95ec5dfef6db307796fe0bc7 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 24 Jan 2013 12:19:55 -0800
Subject: x86: Merge early_reserve_initrd for 32bit and 64bit

The 32-bit and 64-bit versions are the same, so move them out of
head32.c/head64.c and into setup.c.

We are using memblock, which handles overlapping ranges properly, so we
don't need an early reservation just to hold the location; we only need
to make sure the range is reserved before we use memblock to find free
memory.

Signed-off-by: Yinghai Lu
Link: http://lkml.kernel.org/r/1359058816-7615-15-git-send-email-yinghai@kernel.org
Reviewed-by: Pekka Enberg
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/head64.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'arch/x86/kernel/head64.c')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f3b1968..b88a1fa 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -178,17 +178,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
 	memblock_reserve(__pa_symbol(&_text),
 			 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
 
-#ifdef CONFIG_BLK_DEV_INITRD
-	/* Reserve INITRD */
-	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-		/* Assume only end is not page aligned */
-		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-		unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-		unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-		memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
-	}
-#endif
-
 	reserve_ebda_region();
 
 	/*
--
cgit v1.1
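[ Editor's note: a sketch of the unified helper this patch implies on
  the setup.c side. The function name early_reserve_initrd and its
  placement are assumptions; the body mirrors the #ifdef block deleted
  from head64.c above. ]

	#ifdef CONFIG_BLK_DEV_INITRD
	static void __init early_reserve_initrd(void)
	{
		/* Assume only the end is not page aligned */
		u64 ramdisk_image = boot_params.hdr.ramdisk_image;
		u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
		u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);

		if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size)
			return;		/* No initrd provided by the bootloader */

		memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
	}
	#endif

Because memblock tolerates overlapping reservations, it is harmless if
this early reservation and a later, relocated one cover the same range;
the only hard requirement is that the reservation happens before the
first memblock allocation.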
From f1da834cd902f5e5df0b11a3948fc43c6071b590 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 24 Jan 2013 12:19:57 -0800
Subject: x86, boot: Add get_cmd_line_ptr()

Add an accessor function for the command line address.  Later we will
add support for holding a 64-bit address via ext_cmd_line_ptr.

Signed-off-by: Yinghai Lu
Link: http://lkml.kernel.org/r/1359058816-7615-17-git-send-email-yinghai@kernel.org
Cc: Gokul Caushik
Cc: Josh Triplett
Cc: Joe Millenbach
Cc: Alexander Duyck
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/head64.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel/head64.c')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b88a1fa..62c8ce4 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -112,14 +112,23 @@ static void __init clear_bss(void)
 			(unsigned long) __bss_stop - (unsigned long) __bss_start);
 }
 
+static unsigned long get_cmd_line_ptr(void)
+{
+	unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+	return cmd_line_ptr;
+}
+
 static void __init copy_bootdata(char *real_mode_data)
 {
 	char * command_line;
+	unsigned long cmd_line_ptr;
 
 	memcpy(&boot_params, real_mode_data, sizeof boot_params);
 	sanitize_boot_params(&boot_params);
-	if (boot_params.hdr.cmd_line_ptr) {
-		command_line = __va(boot_params.hdr.cmd_line_ptr);
+	cmd_line_ptr = get_cmd_line_ptr();
+	if (cmd_line_ptr) {
+		command_line = __va(cmd_line_ptr);
 		memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
 	}
 }
--
cgit v1.1

From ee92d815027a76ef92f3ec7b155b0c8aa345f239 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Mon, 28 Jan 2013 20:16:44 -0800
Subject: x86, boot: Support loading bzImage, boot_params and ramdisk above 4G

xloadflags bit 1 indicates that the kernel and all its data structures
can be loaded above the 4G mark; it is set if the kernel is relocatable
and 64-bit.

The bootloader checks whether xloadflags bit 1 is set to decide whether
it may load the ramdisk and kernel high, above 4G.

When the bootloader loads the ramdisk above 4G, it fills in
ext_ramdisk_image/size with the high 32 bits.  The kernel uses
get_ramdisk_image/size, which incorporate ext_ramdisk_image/size, to
get the right position for the ramdisk.

Signed-off-by: Yinghai Lu
Cc: Rob Landley
Cc: Matt Fleming
Cc: Gokul Caushik
Cc: Josh Triplett
Cc: Joe Millenbach
Link: http://lkml.kernel.org/r/1359058816-7615-26-git-send-email-yinghai@kernel.org
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/head64.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel/head64.c')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 62c8ce4..6873b07 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -116,6 +116,8 @@ static unsigned long get_cmd_line_ptr(void)
 {
 	unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
 
+	cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
+
 	return cmd_line_ptr;
 }
 
--
cgit v1.1
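[ Editor's note: the get_ramdisk_image/size accessors named above live
  in setup.c; the bodies below follow the same low/high split that this
  patch applies to the command line, and should be read as a sketch of
  that pattern rather than a verbatim quote. ]

	static u64 __init get_ramdisk_image(void)
	{
		u64 ramdisk_image = boot_params.hdr.ramdisk_image;

		/* ext_ramdisk_image holds the high 32 bits for loads above 4G */
		ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;

		return ramdisk_image;
	}

	static u64 __init get_ramdisk_size(void)
	{
		u64 ramdisk_size = boot_params.hdr.ramdisk_size;

		ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;

		return ramdisk_size;
	}

Old bootloaders never set the ext_* fields (they are zero in a freshly
sanitized boot_params), so the OR is a no-op below 4G and the scheme
stays backward compatible.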
From 6c902b656c4a808d9c6f40a387b166455efecd62 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 24 Jan 2013 12:20:12 -0800
Subject: x86: Merge early kernel reserve for 32bit and 64bit

The 32-bit and 64-bit versions are the same, so move them out of
head32.c/head64.c and into setup.c.

We are using memblock, which handles overlapping ranges properly, so we
don't need an early reservation just to hold the location; we only need
to make sure the range is reserved before we use memblock to find free
memory.

Signed-off-by: Yinghai Lu
Link: http://lkml.kernel.org/r/1359058816-7615-32-git-send-email-yinghai@kernel.org
Cc: Alexander Duyck
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/head64.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'arch/x86/kernel/head64.c')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 6873b07..57334f4c 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -186,16 +186,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
 	if (!boot_params.hdr.version)
 		copy_bootdata(__va(real_mode_data));
 
-	memblock_reserve(__pa_symbol(&_text),
-			 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
-
 	reserve_ebda_region();
 
-	/*
-	 * At this point everything still needed from the boot loader
-	 * or BIOS or kernel text should be early reserved or marked not
-	 * RAM in e820. All other memory is free game.
-	 */
-
 	start_kernel();
 }
--
cgit v1.1
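[ Editor's note: a sketch of where the deleted reservation plausibly
  lands after this merge; its exact position inside setup_arch() is an
  assumption. The reserved range is unchanged: the kernel image from
  _text to __bss_stop. ]

	void __init setup_arch(char **cmdline_p)
	{
		/* Reserve the kernel image itself before any memblock allocation */
		memblock_reserve(__pa_symbol(&_text),
				 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));

		early_reserve_initrd();	/* see the sketch after the initrd merge above */

		/* ... the rest of setup_arch() ... */
	}

As with the initrd merge, memblock's tolerance of overlapping ranges is
what lets the 32-bit and 64-bit paths share this single reservation
without special-casing.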