diff options
Diffstat (limited to 'arch')
76 files changed, 2574 insertions, 993 deletions
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 3850701..738d5ee 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -265,6 +265,15 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } +static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr, + phys_addr_t phys_idmap_start) +{ + /* + * TODO + * kvm_call_reset(boot_pgd_ptr, phys_idmap_start); + */ +} + static inline int kvm_arch_dev_ioctl_check_extension(long ext) { return 0; @@ -277,7 +286,6 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index da44be9..f17a8d4 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -66,6 +66,7 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_mmu_get_boot_httbr(void); phys_addr_t kvm_get_idmap_vector(void); +phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dded1b7..9ef013d 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -16,7 +16,6 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ -#include <linux/cpu.h> #include <linux/cpu_pm.h> #include <linux/errno.h> #include <linux/err.h> @@ -66,6 +65,8 @@ static DEFINE_SPINLOCK(kvm_vmid_lock); static bool vgic_present; +static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); + static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) { BUG_ON(preemptible()); @@ -90,11 +91,6 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) return &kvm_arm_running_vcpu; } -int kvm_arch_hardware_enable(void) -{ - return 0; -} - int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; @@ -1033,11 +1029,6 @@ long kvm_arch_vm_ioctl(struct file *filp, } } -static void cpu_init_stage2(void *dummy) -{ - __cpu_init_stage2(); -} - static void cpu_init_hyp_mode(void *dummy) { phys_addr_t boot_pgd_ptr; @@ -1065,43 +1056,87 @@ static void cpu_hyp_reinit(void) { if (is_kernel_in_hyp_mode()) { /* - * cpu_init_stage2() is safe to call even if the PM + * __cpu_init_stage2() is safe to call even if the PM * event was cancelled before the CPU was reset. */ - cpu_init_stage2(NULL); + __cpu_init_stage2(); } else { if (__hyp_get_vectors() == hyp_default_vectors) cpu_init_hyp_mode(NULL); } } -static int hyp_init_cpu_notify(struct notifier_block *self, - unsigned long action, void *cpu) +static void cpu_hyp_reset(void) +{ + phys_addr_t boot_pgd_ptr; + phys_addr_t phys_idmap_start; + + if (!is_kernel_in_hyp_mode()) { + boot_pgd_ptr = kvm_mmu_get_boot_httbr(); + phys_idmap_start = kvm_get_idmap_start(); + + __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start); + } +} + +static void _kvm_arch_hardware_enable(void *discard) { - switch (action) { - case CPU_STARTING: - case CPU_STARTING_FROZEN: + if (!__this_cpu_read(kvm_arm_hardware_enabled)) { cpu_hyp_reinit(); + __this_cpu_write(kvm_arm_hardware_enabled, 1); } +} + +int kvm_arch_hardware_enable(void) +{ + _kvm_arch_hardware_enable(NULL); + return 0; +} - return NOTIFY_OK; +static void _kvm_arch_hardware_disable(void *discard) +{ + if (__this_cpu_read(kvm_arm_hardware_enabled)) { + cpu_hyp_reset(); + __this_cpu_write(kvm_arm_hardware_enabled, 0); + } } -static struct notifier_block hyp_init_cpu_nb = { - .notifier_call = hyp_init_cpu_notify, -}; +void kvm_arch_hardware_disable(void) +{ + _kvm_arch_hardware_disable(NULL); +} #ifdef CONFIG_CPU_PM static int hyp_init_cpu_pm_notifier(struct notifier_block *self, unsigned long cmd, void *v) { - if (cmd == CPU_PM_EXIT) { - cpu_hyp_reinit(); + /* + * kvm_arm_hardware_enabled is left with its old value over + * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should + * re-enable hyp. + */ + switch (cmd) { + case CPU_PM_ENTER: + if (__this_cpu_read(kvm_arm_hardware_enabled)) + /* + * don't update kvm_arm_hardware_enabled here + * so that the hardware will be re-enabled + * when we resume. See below. + */ + cpu_hyp_reset(); + + return NOTIFY_OK; + case CPU_PM_EXIT: + if (__this_cpu_read(kvm_arm_hardware_enabled)) + /* The hardware was enabled before suspend. */ + cpu_hyp_reinit(); + return NOTIFY_OK; - } - return NOTIFY_DONE; + default: + return NOTIFY_DONE; + } } static struct notifier_block hyp_init_cpu_pm_nb = { @@ -1143,16 +1178,12 @@ static int init_common_resources(void) static int init_subsystems(void) { - int err; + int err = 0; /* - * Register CPU Hotplug notifier + * Enable hardware so that subsystem initialisation can access EL2. */ - err = register_cpu_notifier(&hyp_init_cpu_nb); - if (err) { - kvm_err("Cannot register KVM init CPU notifier (%d)\n", err); - return err; - } + on_each_cpu(_kvm_arch_hardware_enable, NULL, 1); /* * Register CPU lower-power notifier @@ -1170,9 +1201,10 @@ static int init_subsystems(void) case -ENODEV: case -ENXIO: vgic_present = false; + err = 0; break; default: - return err; + goto out; } /* @@ -1180,12 +1212,15 @@ static int init_subsystems(void) */ err = kvm_timer_hyp_init(); if (err) - return err; + goto out; kvm_perf_init(); kvm_coproc_table_init(); - return 0; +out: + on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); + + return err; } static void teardown_hyp_mode(void) @@ -1198,17 +1233,11 @@ static void teardown_hyp_mode(void) free_hyp_pgds(); for_each_possible_cpu(cpu) free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); - unregister_cpu_notifier(&hyp_init_cpu_nb); hyp_cpu_pm_exit(); } static int init_vhe_mode(void) { - /* - * Execute the init code on each CPU. - */ - on_each_cpu(cpu_init_stage2, NULL, 1); - /* set size of VMID supported by CPU */ kvm_vmid_bits = kvm_get_vmid_bits(); kvm_info("%d-bit VMID\n", kvm_vmid_bits); @@ -1295,11 +1324,6 @@ static int init_hyp_mode(void) } } - /* - * Execute the init code on each CPU. - */ - on_each_cpu(cpu_init_hyp_mode, NULL, 1); - #ifndef CONFIG_HOTPLUG_CPU free_boot_hyp_pgd(); #endif diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index d6d4191..be30212 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1666,6 +1666,11 @@ phys_addr_t kvm_get_idmap_vector(void) return hyp_idmap_vector; } +phys_addr_t kvm_get_idmap_start(void) +{ + return hyp_idmap_start; +} + int kvm_mmu_init(void) { int err; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4f43622..8845c0d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -11,6 +11,7 @@ config ARM64 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_USE_CMPXCHG_LOCKREF select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_COMPAT_IPC_PARSE_VERSION select ARCH_WANT_FRAME_POINTERS @@ -58,11 +59,14 @@ config ARM64 select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARM_SMCCC select HAVE_BPF_JIT select HAVE_C_RECORDMCOUNT select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL + select HAVE_CONTEXT_TRACKING select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG @@ -76,6 +80,7 @@ config ARM64 select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP if NUMA select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@ -89,15 +94,13 @@ config ARM64 select NO_BOOTMEM select OF select OF_EARLY_FLATTREE + select OF_NUMA if NUMA && OF select OF_RESERVED_MEM select PERF_USE_VMALLOC select POWER_RESET select POWER_SUPPLY - select RTC_LIB select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE - select HAVE_CONTEXT_TRACKING - select HAVE_ARM_SMCCC help ARM 64-bit (AArch64) Linux support. @@ -546,10 +549,35 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. +# Common NUMA Features +config NUMA + bool "Numa Memory Allocation and Scheduler Support" + depends on SMP + help + Enable NUMA (Non Uniform Memory Access) support. + + The kernel will try to allocate memory used by a CPU on the + local memory of the CPU and add some more + NUMA awareness to the kernel. + +config NODES_SHIFT + int "Maximum NUMA Nodes (as a power of 2)" + range 1 10 + default "2" + depends on NEED_MULTIPLE_NODES + help + Specify the maximum number of NUMA Nodes available on the target + system. Increases memory reserved to accommodate various tables. + +config USE_PERCPU_NUMA_NODE_ID + def_bool y + depends on NUMA + source kernel/Kconfig.preempt source kernel/Kconfig.hz config ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION def_bool y config ARCH_HAS_HOLES_MEMORYMODEL @@ -578,9 +606,6 @@ config SYS_SUPPORTS_HUGETLBFS config ARCH_WANT_HUGE_PMD_SHARE def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) -config HAVE_ARCH_TRANSPARENT_HUGEPAGE - def_bool y - config ARCH_HAS_CACHE_LINE_SIZE def_bool y @@ -953,6 +978,14 @@ menu "Power management options" source "kernel/power/Kconfig" +config ARCH_HIBERNATION_POSSIBLE + def_bool y + depends on CPU_PM + +config ARCH_HIBERNATION_HEADER + def_bool y + depends on HIBERNATION + config ARCH_SUSPEND_POSSIBLE def_bool y diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index 7e76845..710fde4 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -59,7 +59,7 @@ config DEBUG_RODATA If in doubt, say Y config DEBUG_ALIGN_RODATA - depends on DEBUG_RODATA && ARM64_4K_PAGES + depends on DEBUG_RODATA bool "Align linker sections up to SECTION_SIZE" help If this option is enabled, sections that may potentially be marked as diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 70f7b9e..10b017c 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -1,5 +1,5 @@ /* - * Based on arch/arm/include/asm/assembler.h + * Based on arch/arm/include/asm/assembler.h, arch/arm/mm/proc-macros.S * * Copyright (C) 1996-2000 Russell King * Copyright (C) 2012 ARM Ltd. @@ -23,22 +23,13 @@ #ifndef __ASM_ASSEMBLER_H #define __ASM_ASSEMBLER_H +#include <asm/asm-offsets.h> +#include <asm/page.h> +#include <asm/pgtable-hwdef.h> #include <asm/ptrace.h> #include <asm/thread_info.h> /* - * Stack pushing/popping (register pairs only). Equivalent to store decrement - * before, load increment after. - */ - .macro push, xreg1, xreg2 - stp \xreg1, \xreg2, [sp, #-16]! - .endm - - .macro pop, xreg1, xreg2 - ldp \xreg1, \xreg2, [sp], #16 - .endm - -/* * Enable and disable interrupts. */ .macro disable_irq @@ -212,6 +203,102 @@ lr .req x30 // link register .endm /* + * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) + */ + .macro vma_vm_mm, rd, rn + ldr \rd, [\rn, #VMA_VM_MM] + .endm + +/* + * mmid - get context id from mm pointer (mm->context.id) + */ + .macro mmid, rd, rn + ldr \rd, [\rn, #MM_CONTEXT_ID] + .endm + +/* + * dcache_line_size - get the minimum D-cache line size from the CTR register. + */ + .macro dcache_line_size, reg, tmp + mrs \tmp, ctr_el0 // read CTR + ubfm \tmp, \tmp, #16, #19 // cache line size encoding + mov \reg, #4 // bytes per word + lsl \reg, \reg, \tmp // actual cache line size + .endm + +/* + * icache_line_size - get the minimum I-cache line size from the CTR register. + */ + .macro icache_line_size, reg, tmp + mrs \tmp, ctr_el0 // read CTR + and \tmp, \tmp, #0xf // cache line size encoding + mov \reg, #4 // bytes per word + lsl \reg, \reg, \tmp // actual cache line size + .endm + +/* + * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map + */ + .macro tcr_set_idmap_t0sz, valreg, tmpreg +#ifndef CONFIG_ARM64_VA_BITS_48 + ldr_l \tmpreg, idmap_t0sz + bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH +#endif + .endm + +/* + * Macro to perform a data cache maintenance for the interval + * [kaddr, kaddr + size) + * + * op: operation passed to dc instruction + * domain: domain used in dsb instruciton + * kaddr: starting virtual address of the region + * size: size of the region + * Corrupts: kaddr, size, tmp1, tmp2 + */ + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 + dcache_line_size \tmp1, \tmp2 + add \size, \kaddr, \size + sub \tmp2, \tmp1, #1 + bic \kaddr, \kaddr, \tmp2 +9998: dc \op, \kaddr + add \kaddr, \kaddr, \tmp1 + cmp \kaddr, \size + b.lo 9998b + dsb \domain + .endm + +/* + * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present + */ + .macro reset_pmuserenr_el0, tmpreg + mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx \tmpreg, \tmpreg, #8, #4 + cmp \tmpreg, #1 // Skip if no PMU present + b.lt 9000f + msr pmuserenr_el0, xzr // Disable PMU access from EL0 +9000: + .endm + +/* + * copy_page - copy src to dest using temp registers t1-t8 + */ + .macro copy_page dest:req src:req t1:req t2:req t3:req t4:req t5:req t6:req t7:req t8:req +9998: ldp \t1, \t2, [\src] + ldp \t3, \t4, [\src, #16] + ldp \t5, \t6, [\src, #32] + ldp \t7, \t8, [\src, #48] + add \src, \src, #64 + stnp \t1, \t2, [\dest] + stnp \t3, \t4, [\dest, #16] + stnp \t5, \t6, [\dest, #32] + stnp \t7, \t8, [\dest, #48] + add \dest, \dest, #64 + tst \src, #(PAGE_SIZE - 1) + b.ne 9998b + .endm + +/* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. */ @@ -233,4 +320,24 @@ lr .req x30 // link register .long \sym\()_hi32 .endm + /* + * mov_q - move an immediate constant into a 64-bit register using + * between 2 and 4 movz/movk instructions (depending on the + * magnitude and sign of the operand) + */ + .macro mov_q, reg, val + .if (((\val) >> 31) == 0 || ((\val) >> 31) == 0x1ffffffff) + movz \reg, :abs_g1_s:\val + .else + .if (((\val) >> 47) == 0 || ((\val) >> 47) == 0x1ffff) + movz \reg, :abs_g2_s:\val + .else + movz \reg, :abs_g3:\val + movk \reg, :abs_g2_nc:\val + .endif + movk \reg, :abs_g1_nc:\val + .endif + movk \reg, :abs_g0_nc:\val + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index b9b6494..224efe7 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -35,8 +35,9 @@ #define ARM64_ALT_PAN_NOT_UAO 10 #define ARM64_HAS_VIRT_HOST_EXTN 11 #define ARM64_WORKAROUND_CAVIUM_27456 12 +#define ARM64_HAS_32BIT_EL0 13 -#define ARM64_NCAPS 13 +#define ARM64_NCAPS 14 #ifndef __ASSEMBLY__ @@ -77,10 +78,17 @@ struct arm64_ftr_reg { struct arm64_ftr_bits *ftr_bits; }; +/* scope of capability check */ +enum { + SCOPE_SYSTEM, + SCOPE_LOCAL_CPU, +}; + struct arm64_cpu_capabilities { const char *desc; u16 capability; - bool (*matches)(const struct arm64_cpu_capabilities *); + int def_scope; /* default scope */ + bool (*matches)(const struct arm64_cpu_capabilities *caps, int scope); void (*enable)(void *); /* Called on all active CPUs */ union { struct { /* To be used for erratum handling only */ @@ -101,6 +109,8 @@ struct arm64_cpu_capabilities { extern DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); +bool this_cpu_has_cap(unsigned int cap); + static inline bool cpu_have_feature(unsigned int num) { return elf_hwcap & (1UL << num); @@ -170,12 +180,20 @@ static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0) cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_BIGENDEL0_SHIFT) == 0x1; } +static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) +{ + u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT); + + return val == ID_AA64PFR0_EL0_32BIT_64BIT; +} + void __init setup_cpu_features(void); void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, const char *info); void check_local_cpu_errata(void); +void verify_local_cpu_errata(void); void verify_local_cpu_capabilities(void); u64 read_system_reg(u32 id); @@ -185,6 +203,11 @@ static inline bool cpu_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); } +static inline bool system_supports_32bit_el0(void) +{ + return cpus_have_cap(ARM64_HAS_32BIT_EL0); +} + static inline bool system_supports_mixed_endian_el0(void) { return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 24ed037..7a09c48 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -177,7 +177,8 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; /* AArch32 EABI. */ #define EF_ARM_EABI_MASK 0xff000000 -#define compat_elf_check_arch(x) (((x)->e_machine == EM_ARM) && \ +#define compat_elf_check_arch(x) (system_supports_32bit_el0() && \ + ((x)->e_machine == EM_ARM) && \ ((x)->e_flags & EF_ARM_EABI_MASK)) #define compat_start_thread compat_start_thread diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 5c6375d..7e51d1b 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include <asm/sparsemem.h> /* * The linear mapping and the start of memory are both 2M aligned (per @@ -86,10 +87,24 @@ * (64k granule), or a multiple that can be mapped using contiguous bits * in the page tables: 32 * PMD_SIZE (16k granule) */ -#ifdef CONFIG_ARM64_64K_PAGES -#define ARM64_MEMSTART_ALIGN SZ_512M +#if defined(CONFIG_ARM64_4K_PAGES) +#define ARM64_MEMSTART_SHIFT PUD_SHIFT +#elif defined(CONFIG_ARM64_16K_PAGES) +#define ARM64_MEMSTART_SHIFT (PMD_SHIFT + 5) #else -#define ARM64_MEMSTART_ALIGN SZ_1G +#define ARM64_MEMSTART_SHIFT PMD_SHIFT +#endif + +/* + * sparsemem vmemmap imposes an additional requirement on the alignment of + * memstart_addr, due to the fact that the base of the vmemmap region + * has a direct correspondence, and needs to appear sufficiently aligned + * in the virtual address space. + */ +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS +#define ARM64_MEMSTART_ALIGN (1UL << SECTION_SIZE_BITS) +#else +#define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT) #endif #endif /* __ASM_KERNEL_PGTABLE_H */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 3f29887..1b3dc9df 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -84,17 +84,6 @@ #define HCR_INT_OVERRIDE (HCR_FMO | HCR_IMO) #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) -/* Hyp System Control Register (SCTLR_EL2) bits */ -#define SCTLR_EL2_EE (1 << 25) -#define SCTLR_EL2_WXN (1 << 19) -#define SCTLR_EL2_I (1 << 12) -#define SCTLR_EL2_SA (1 << 3) -#define SCTLR_EL2_C (1 << 2) -#define SCTLR_EL2_A (1 << 1) -#define SCTLR_EL2_M 1 -#define SCTLR_EL2_FLAGS (SCTLR_EL2_M | SCTLR_EL2_A | SCTLR_EL2_C | \ - SCTLR_EL2_SA | SCTLR_EL2_I) - /* TCR_EL2 Registers bits */ #define TCR_EL2_RES1 ((1 << 31) | (1 << 23)) #define TCR_EL2_TBI (1 << 20) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 40a0a24..7561f63 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -22,6 +22,8 @@ #define ARM_EXCEPTION_IRQ 0 #define ARM_EXCEPTION_TRAP 1 +/* The hyp-stub will return this for any kvm_call_hyp() call */ +#define ARM_EXCEPTION_HYP_GONE 2 #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) @@ -40,6 +42,7 @@ struct kvm_vcpu; extern char __kvm_hyp_init[]; extern char __kvm_hyp_init_end[]; +extern char __kvm_hyp_reset[]; extern char __kvm_hyp_vector[]; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f5c6bd2..90a8d23 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -46,6 +46,8 @@ int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); int kvm_arch_dev_ioctl_check_extension(long ext); +unsigned long kvm_hyp_reset_entry(void); +void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); struct kvm_arch { /* The VMID generation used for the virt. memory system */ @@ -352,7 +354,17 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, hyp_stack_ptr, vector_ptr); } -static inline void kvm_arch_hardware_disable(void) {} +static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr, + phys_addr_t phys_idmap_start) +{ + /* + * Call reset code, and switch back to stub hyp vectors. + * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation. + */ + __kvm_call_hyp((void *)kvm_hyp_reset_entry(), + boot_pgd_ptr, phys_idmap_start); +} + static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 22732a5..e8d39d4 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -109,6 +109,7 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_mmu_get_boot_httbr(void); phys_addr_t kvm_get_idmap_vector(void); +phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 12f8a00..72a3025 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -40,6 +40,21 @@ #define PCI_IO_SIZE SZ_16M /* + * Log2 of the upper bound of the size of a struct page. Used for sizing + * the vmemmap region only, does not affect actual memory footprint. + * We don't use sizeof(struct page) directly since taking its size here + * requires its definition to be available at this point in the inclusion + * chain, and it may not be a power of 2 in the first place. + */ +#define STRUCT_PAGE_MAX_SHIFT 6 + +/* + * VMEMMAP_SIZE - allows the whole linear region to be covered by + * a struct page array + */ +#define VMEMMAP_SIZE (UL(1) << (VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)) + +/* * PAGE_OFFSET - the virtual address of the start of the kernel image (top * (VA_BITS - 1)) * VA_BITS - the maximum number of bits for virtual addresses. @@ -54,7 +69,8 @@ #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) #define MODULES_VSIZE (SZ_128M) -#define PCI_IO_END (PAGE_OFFSET - SZ_2M) +#define VMEMMAP_START (PAGE_OFFSET - VMEMMAP_SIZE) +#define PCI_IO_END (VMEMMAP_START - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) #define TASK_SIZE_64 (UL(1) << VA_BITS) @@ -71,6 +87,9 @@ #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) +#define KERNEL_START _text +#define KERNEL_END _end + /* * The size of the KASAN shadow region. This should be 1/8th of the * size of the entire kernel virtual address space. @@ -192,9 +211,19 @@ static inline void *phys_to_virt(phys_addr_t x) */ #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) +#ifndef CONFIG_SPARSEMEM_VMEMMAP #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#else +#define __virt_to_pgoff(kaddr) (((u64)(kaddr) & ~PAGE_OFFSET) / PAGE_SIZE * sizeof(struct page)) +#define __page_to_voff(kaddr) (((u64)(page) & ~VMEMMAP_START) * PAGE_SIZE / sizeof(struct page)) +#define page_to_virt(page) ((void *)((__page_to_voff(page)) | PAGE_OFFSET)) +#define virt_to_page(vaddr) ((struct page *)((__virt_to_pgoff(vaddr)) | VMEMMAP_START)) + +#define virt_addr_valid(kaddr) pfn_valid((((u64)(kaddr) & ~PAGE_OFFSET) \ + + PHYS_OFFSET) >> PAGE_SHIFT) +#endif #endif #include <asm-generic/memory_model.h> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 990124a..97b1d8f 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -29,6 +29,7 @@ typedef struct { #define ASID(mm) ((mm)->context.id.counter & 0xffff) extern void paging_init(void); +extern void bootmem_init(void); extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); extern void init_mem_pgprot(void); extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, diff --git a/arch/arm64/include/asm/mmzone.h b/arch/arm64/include/asm/mmzone.h new file mode 100644 index 0000000..a0de9e6 --- /dev/null +++ b/arch/arm64/include/asm/mmzone.h @@ -0,0 +1,12 @@ +#ifndef __ASM_MMZONE_H +#define __ASM_MMZONE_H + +#ifdef CONFIG_NUMA + +#include <asm/numa.h> + +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[(nid)]) + +#endif /* CONFIG_NUMA */ +#endif /* __ASM_MMZONE_H */ diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h new file mode 100644 index 0000000..e9b4f29 --- /dev/null +++ b/arch/arm64/include/asm/numa.h @@ -0,0 +1,45 @@ +#ifndef __ASM_NUMA_H +#define __ASM_NUMA_H + +#include <asm/topology.h> + +#ifdef CONFIG_NUMA + +/* currently, arm64 implements flat NUMA topology */ +#define parent_node(node) (node) + +int __node_distance(int from, int to); +#define node_distance(a, b) __node_distance(a, b) + +extern nodemask_t numa_nodes_parsed __initdata; + +/* Mappings between node number and cpus on that node. */ +extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +void numa_clear_node(unsigned int cpu); + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +const struct cpumask *cpumask_of_node(int node); +#else +/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ +static inline const struct cpumask *cpumask_of_node(int node) +{ + return node_to_cpumask_map[node]; +} +#endif + +void __init arm64_numa_init(void); +int __init numa_add_memblk(int nodeid, u64 start, u64 end); +void __init numa_set_distance(int from, int to, int distance); +void __init numa_free_distance(void); +void __init early_map_cpu_to_node(unsigned int cpu, int nid); +void numa_store_cpu_info(unsigned int cpu); + +#else /* CONFIG_NUMA */ + +static inline void numa_store_cpu_info(unsigned int cpu) { } +static inline void arm64_numa_init(void) { } +static inline void early_map_cpu_to_node(unsigned int cpu, int nid) { } + +#endif /* CONFIG_NUMA */ + +#endif /* __ASM_NUMA_H */ diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index ae615b9..17b45f7 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -19,6 +19,8 @@ #ifndef __ASM_PAGE_H #define __ASM_PAGE_H +#include <linux/const.h> + /* PAGE_SHIFT determines the page size */ /* CONT_SHIFT determines the number of pages which can be tracked together */ #ifdef CONFIG_ARM64_64K_PAGES diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 5c25b83..9786f77 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -133,7 +133,6 @@ * Section */ #define PMD_SECT_VALID (_AT(pmdval_t, 1) << 0) -#define PMD_SECT_PROT_NONE (_AT(pmdval_t, 1) << 58) #define PMD_SECT_USER (_AT(pmdval_t, 1) << 6) /* AP[1] */ #define PMD_SECT_RDONLY (_AT(pmdval_t, 1) << 7) /* AP[2] */ #define PMD_SECT_S (_AT(pmdval_t, 3) << 8) diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h index 2b1bd7e..69b2fd4 100644 --- a/arch/arm64/include/asm/pgtable-types.h +++ b/arch/arm64/include/asm/pgtable-types.h @@ -27,10 +27,6 @@ typedef u64 pmdval_t; typedef u64 pudval_t; typedef u64 pgdval_t; -#undef STRICT_MM_TYPECHECKS - -#ifdef STRICT_MM_TYPECHECKS - /* * These are used to make use of C type-checking.. */ @@ -58,34 +54,6 @@ typedef struct { pteval_t pgprot; } pgprot_t; #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) -#else /* !STRICT_MM_TYPECHECKS */ - -typedef pteval_t pte_t; -#define pte_val(x) (x) -#define __pte(x) (x) - -#if CONFIG_PGTABLE_LEVELS > 2 -typedef pmdval_t pmd_t; -#define pmd_val(x) (x) -#define __pmd(x) (x) -#endif - -#if CONFIG_PGTABLE_LEVELS > 3 -typedef pudval_t pud_t; -#define pud_val(x) (x) -#define __pud(x) (x) -#endif - -typedef pgdval_t pgd_t; -#define pgd_val(x) (x) -#define __pgd(x) (x) - -typedef pteval_t pgprot_t; -#define pgprot_val(x) (x) -#define __pgprot(x) (x) - -#endif /* STRICT_MM_TYPECHECKS */ - #if CONFIG_PGTABLE_LEVELS == 2 #include <asm-generic/pgtable-nopmd.h> #elif CONFIG_PGTABLE_LEVELS == 3 diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 989fef1..2da46ae 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -24,22 +24,16 @@ #include <asm/pgtable-prot.h> /* - * VMALLOC and SPARSEMEM_VMEMMAP ranges. + * VMALLOC range. * - * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array - * (rounded up to PUD_SIZE). * VMALLOC_START: beginning of the kernel vmalloc space - * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, - * fixed mappings and modules + * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space + * and fixed mappings */ -#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) - #define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) -#define VMEMMAP_START (VMALLOC_END + SZ_64K) -#define vmemmap ((struct page *)VMEMMAP_START - \ - SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT)) +#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) #define FIRST_USER_ADDRESS 0UL @@ -58,7 +52,7 @@ extern void __pgd_error(const char *file, int line, unsigned long val); * for zero-mapped memory areas etc.. */ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) +#define ZERO_PAGE(vaddr) pfn_to_page(PHYS_PFN(__pa(empty_zero_page))) #define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte)) @@ -272,6 +266,21 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot) return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT); } +#ifdef CONFIG_NUMA_BALANCING +/* + * See the comment in include/asm-generic/pgtable.h + */ +static inline int pte_protnone(pte_t pte) +{ + return (pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)) == PTE_PROT_NONE; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return pte_protnone(pmd_pte(pmd)); +} +#endif + /* * THP definitions. */ @@ -280,15 +289,16 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot) #define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT)) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define pmd_present(pmd) pte_present(pmd_pte(pmd)) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) -#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) +#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) -#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK)) +#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID)) #define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) pte_write(pmd_pte(pmd)) @@ -327,9 +337,8 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); #define pmd_none(pmd) (!pmd_val(pmd)) -#define pmd_present(pmd) (pmd_val(pmd)) -#define pmd_bad(pmd) (!(pmd_val(pmd) & 2)) +#define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT)) #define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_TABLE) @@ -394,7 +403,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) (!(pud_val(pud) & 2)) +#define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) #define pud_present(pud) (pud_val(pud)) static inline void set_pud(pud_t *pudp, pud_t pud) @@ -526,6 +535,21 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #ifdef CONFIG_ARM64_HW_AFDBM +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +static inline int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); +} +#endif + /* * Atomic pte/pmd modifications. */ @@ -578,9 +602,9 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long address, pmd_t *pmdp) +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) { return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp)); } diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 817a067..433e504 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -113,6 +113,17 @@ static inline void update_cpu_boot_status(int val) dsb(ishst); } +/* + * The calling secondary CPU has detected serious configuration mismatch, + * which calls for a kernel panic. Update the boot status and park the calling + * CPU. + */ +static inline void cpu_panic_kernel(void) +{ + update_cpu_boot_status(CPU_PANIC_KERNEL); + cpu_park_loop(); +} + #endif /* ifndef __ASSEMBLY__ */ #endif /* ifndef __ASM_SMP_H */ diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index 59a5b0f1..024d623 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -1,7 +1,8 @@ #ifndef __ASM_SUSPEND_H #define __ASM_SUSPEND_H -#define NR_CTX_REGS 11 +#define NR_CTX_REGS 10 +#define NR_CALLEE_SAVED_REGS 12 /* * struct cpu_suspend_ctx must be 16-byte aligned since it is allocated on @@ -16,11 +17,34 @@ struct cpu_suspend_ctx { u64 sp; } __aligned(16); -struct sleep_save_sp { - phys_addr_t *save_ptr_stash; - phys_addr_t save_ptr_stash_phys; +/* + * Memory to save the cpu state is allocated on the stack by + * __cpu_suspend_enter()'s caller, and populated by __cpu_suspend_enter(). + * This data must survive until cpu_resume() is called. + * + * This struct desribes the size and the layout of the saved cpu state. + * The layout of the callee_saved_regs is defined by the implementation + * of __cpu_suspend_enter(), and cpu_resume(). This struct must be passed + * in by the caller as __cpu_suspend_enter()'s stack-frame is gone once it + * returns, and the data would be subsequently corrupted by the call to the + * finisher. + */ +struct sleep_stack_data { + struct cpu_suspend_ctx system_regs; + unsigned long callee_saved_regs[NR_CALLEE_SAVED_REGS]; }; +extern unsigned long *sleep_save_stash; + extern int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)); extern void cpu_resume(void); +int __cpu_suspend_enter(struct sleep_stack_data *state); +void __cpu_suspend_exit(void); +void _cpu_resume(void); + +int swsusp_arch_suspend(void); +int swsusp_arch_resume(void); +int arch_hibernation_header_save(void *addr, unsigned int max_size); +int arch_hibernation_header_restore(void *addr); + #endif diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 1287416..751e901 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -86,10 +86,21 @@ #define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\ (!!x)<<8 | 0x1f) -/* SCTLR_EL1 */ -#define SCTLR_EL1_CP15BEN (0x1 << 5) -#define SCTLR_EL1_SED (0x1 << 8) -#define SCTLR_EL1_SPAN (0x1 << 23) +/* Common SCTLR_ELx flags. */ +#define SCTLR_ELx_EE (1 << 25) +#define SCTLR_ELx_I (1 << 12) +#define SCTLR_ELx_SA (1 << 3) +#define SCTLR_ELx_C (1 << 2) +#define SCTLR_ELx_A (1 << 1) +#define SCTLR_ELx_M 1 + +#define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ + SCTLR_ELx_SA | SCTLR_ELx_I) + +/* SCTLR_EL1 specific flags. */ +#define SCTLR_EL1_SPAN (1 << 23) +#define SCTLR_EL1_SED (1 << 8) +#define SCTLR_EL1_CP15BEN (1 << 5) /* id_aa64isar0 */ @@ -115,6 +126,7 @@ #define ID_AA64PFR0_ASIMD_SUPPORTED 0x0 #define ID_AA64PFR0_EL1_64BIT_ONLY 0x1 #define ID_AA64PFR0_EL0_64BIT_ONLY 0x1 +#define ID_AA64PFR0_EL0_32BIT_64BIT 0x2 /* id_aa64mmfr0 */ #define ID_AA64MMFR0_TGRAN4_SHIFT 28 @@ -145,7 +157,11 @@ #define ID_AA64MMFR1_VMIDBITS_16 2 /* id_aa64mmfr2 */ +#define ID_AA64MMFR2_LVA_SHIFT 16 +#define ID_AA64MMFR2_IESB_SHIFT 12 +#define ID_AA64MMFR2_LSM_SHIFT 8 #define ID_AA64MMFR2_UAO_SHIFT 4 +#define ID_AA64MMFR2_CNP_SHIFT 0 /* id_aa64dfr0 */ #define ID_AA64DFR0_CTX_CMPS_SHIFT 28 diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index a3e9d6f..8b57339 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -22,6 +22,16 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_NUMA + +struct pci_bus; +int pcibus_to_node(struct pci_bus *bus); +#define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \ + cpu_all_mask : \ + cpumask_of_node(pcibus_to_node(bus))) + +#endif /* CONFIG_NUMA */ + #include <asm-generic/topology.h> #endif /* _ASM_ARM_TOPOLOGY_H */ diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 9f22dd6..dcbcf8d 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -18,6 +18,22 @@ #ifndef __ASM__VIRT_H #define __ASM__VIRT_H +/* + * The arm64 hcall implementation uses x0 to specify the hcall type. A value + * less than 0xfff indicates a special hcall, such as get/set vector. + * Any other value is used as a pointer to the function to call. + */ + +/* HVC_GET_VECTORS - Return the value of the vbar_el2 register. */ +#define HVC_GET_VECTORS 0 + +/* + * HVC_SET_VECTORS - Set the value of the vbar_el2 register. + * + * @x1: Physical address of the new vector table. + */ +#define HVC_SET_VECTORS 1 + #define BOOT_CPU_MODE_EL1 (0xe11) #define BOOT_CPU_MODE_EL2 (0xe12) @@ -60,6 +76,12 @@ static inline bool is_kernel_in_hyp_mode(void) return el == CurrentEL_EL2; } +#ifdef CONFIG_ARM64_VHE +extern void verify_cpu_run_el(void); +#else +static inline void verify_cpu_run_el(void) {} +#endif + /* The section containing the hypervisor text */ extern char __hyp_text_start[]; extern char __hyp_text_end[]; diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 3793003..2173149 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -45,6 +45,7 @@ arm64-obj-$(CONFIG_ACPI) += acpi.o arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o arm64-obj-$(CONFIG_PARAVIRT) += paravirt.o arm64-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +arm64-obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index d1ce8e2..3e4f1a4 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -42,6 +42,7 @@ int acpi_pci_disabled = 1; /* skip ACPI PCI scan and IRQ initialization */ EXPORT_SYMBOL(acpi_pci_disabled); static bool param_acpi_off __initdata; +static bool param_acpi_on __initdata; static bool param_acpi_force __initdata; static int __init parse_acpi(char *arg) @@ -52,6 +53,8 @@ static int __init parse_acpi(char *arg) /* "acpi=off" disables both ACPI table parsing and interpreter */ if (strcmp(arg, "off") == 0) param_acpi_off = true; + else if (strcmp(arg, "on") == 0) /* prefer ACPI over DT */ + param_acpi_on = true; else if (strcmp(arg, "force") == 0) /* force ACPI to be enabled */ param_acpi_force = true; else @@ -66,12 +69,24 @@ static int __init dt_scan_depth1_nodes(unsigned long node, void *data) { /* - * Return 1 as soon as we encounter a node at depth 1 that is - * not the /chosen node. + * Ignore anything not directly under the root node; we'll + * catch its parent instead. */ - if (depth == 1 && (strcmp(uname, "chosen") != 0)) - return 1; - return 0; + if (depth != 1) + return 0; + + if (strcmp(uname, "chosen") == 0) + return 0; + + if (strcmp(uname, "hypervisor") == 0 && + of_flat_dt_is_compatible(node, "xen,xen")) + return 0; + + /* + * This node at depth 1 is neither a chosen node nor a xen node, + * which we do not expect. + */ + return 1; } /* @@ -184,11 +199,13 @@ void __init acpi_boot_table_init(void) /* * Enable ACPI instead of device tree unless * - ACPI has been disabled explicitly (acpi=off), or - * - the device tree is not empty (it has more than just a /chosen node) - * and ACPI has not been force enabled (acpi=force) + * - the device tree is not empty (it has more than just a /chosen node, + * and a /hypervisor node when running on Xen) + * and ACPI has not been [force] enabled (acpi=on|force) */ if (param_acpi_off || - (!param_acpi_force && of_scan_flat_dt(dt_scan_depth1_nodes, NULL))) + (!param_acpi_on && !param_acpi_force && + of_scan_flat_dt(dt_scan_depth1_nodes, NULL))) return; /* diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 3ae6b31..f8e5d47 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/dma-mapping.h> #include <linux/kvm_host.h> +#include <linux/suspend.h> #include <asm/thread_info.h> #include <asm/memory.h> #include <asm/smp_plat.h> @@ -119,11 +120,14 @@ int main(void) DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); DEFINE(MPIDR_HASH_MASK, offsetof(struct mpidr_hash, mask)); DEFINE(MPIDR_HASH_SHIFTS, offsetof(struct mpidr_hash, shift_aff)); - DEFINE(SLEEP_SAVE_SP_SZ, sizeof(struct sleep_save_sp)); - DEFINE(SLEEP_SAVE_SP_PHYS, offsetof(struct sleep_save_sp, save_ptr_stash_phys)); - DEFINE(SLEEP_SAVE_SP_VIRT, offsetof(struct sleep_save_sp, save_ptr_stash)); + DEFINE(SLEEP_STACK_DATA_SYSTEM_REGS, offsetof(struct sleep_stack_data, system_regs)); + DEFINE(SLEEP_STACK_DATA_CALLEE_REGS, offsetof(struct sleep_stack_data, callee_saved_regs)); #endif DEFINE(ARM_SMCCC_RES_X0_OFFS, offsetof(struct arm_smccc_res, a0)); DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); + BLANK(); + DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address)); + DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address)); + DEFINE(HIBERN_PBE_NEXT, offsetof(struct pbe, next)); return 0; } diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 06afd04..d427894 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -22,14 +22,16 @@ #include <asm/cpufeature.h> static bool __maybe_unused -is_affected_midr_range(const struct arm64_cpu_capabilities *entry) +is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) { + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); return MIDR_IS_CPU_MODEL_RANGE(read_cpuid_id(), entry->midr_model, entry->midr_range_min, entry->midr_range_max); } #define MIDR_RANGE(model, min, max) \ + .def_scope = SCOPE_LOCAL_CPU, \ .matches = is_affected_midr_range, \ .midr_model = model, \ .midr_range_min = min, \ @@ -101,6 +103,26 @@ const struct arm64_cpu_capabilities arm64_errata[] = { } }; +/* + * The CPU Errata work arounds are detected and applied at boot time + * and the related information is freed soon after. If the new CPU requires + * an errata not detected at boot, fail this CPU. + */ +void verify_local_cpu_errata(void) +{ + const struct arm64_cpu_capabilities *caps = arm64_errata; + + for (; caps->matches; caps++) + if (!cpus_have_cap(caps->capability) && + caps->matches(caps, SCOPE_LOCAL_CPU)) { + pr_crit("CPU%d: Requires work around for %s, not detected" + " at boot time\n", + smp_processor_id(), + caps->desc ? : "an erratum"); + cpu_die_early(); + } +} + void check_local_cpu_errata(void) { update_cpu_capabilities(arm64_errata, "enabling workaround for"); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 943f514..811773d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -71,7 +71,8 @@ DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); /* meta feature for alternatives */ static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry); +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused); + static struct arm64_ftr_bits ftr_id_aa64isar0[] = { ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0), @@ -130,7 +131,11 @@ static struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { }; static struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_LVA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_IESB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_LSM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_UAO_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_CNP_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -435,22 +440,26 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); - init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); - init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); - init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); - init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); - init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); - init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); - init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); - init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); - init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); - init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); - init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); - init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); - init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); - init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); - init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); - init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); + + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); + init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); + init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); + init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); + init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); + init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); + init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); + init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); + init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); + init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); + init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); + init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); + init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); + init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); + init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); + init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); + } + } static void update_cpu_ftr_reg(struct arm64_ftr_reg *reg, u64 new) @@ -555,47 +564,51 @@ void update_cpu_features(int cpu, info->reg_id_aa64pfr1, boot->reg_id_aa64pfr1); /* - * If we have AArch32, we care about 32-bit features for compat. These - * registers should be RES0 otherwise. + * If we have AArch32, we care about 32-bit features for compat. + * If the system doesn't support AArch32, don't update them. */ - taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, + if (id_aa64pfr0_32bit_el0(read_system_reg(SYS_ID_AA64PFR0_EL1)) && + id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + + taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, info->reg_id_dfr0, boot->reg_id_dfr0); - taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, info->reg_id_isar0, boot->reg_id_isar0); - taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, info->reg_id_isar1, boot->reg_id_isar1); - taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, info->reg_id_isar2, boot->reg_id_isar2); - taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, info->reg_id_isar3, boot->reg_id_isar3); - taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, info->reg_id_isar4, boot->reg_id_isar4); - taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, info->reg_id_isar5, boot->reg_id_isar5); - /* - * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and - * ACTLR formats could differ across CPUs and therefore would have to - * be trapped for virtualization anyway. - */ - taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, + /* + * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and + * ACTLR formats could differ across CPUs and therefore would have to + * be trapped for virtualization anyway. + */ + taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, info->reg_id_mmfr0, boot->reg_id_mmfr0); - taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, info->reg_id_mmfr1, boot->reg_id_mmfr1); - taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, info->reg_id_mmfr2, boot->reg_id_mmfr2); - taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, info->reg_id_mmfr3, boot->reg_id_mmfr3); - taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, info->reg_id_pfr0, boot->reg_id_pfr0); - taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, + taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, info->reg_id_pfr1, boot->reg_id_pfr1); - taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, + taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, info->reg_mvfr0, boot->reg_mvfr0); - taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, + taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, info->reg_mvfr1, boot->reg_mvfr1); - taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, + taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, info->reg_mvfr2, boot->reg_mvfr2); + } /* * Mismatched CPU features are a recipe for disaster. Don't even @@ -614,6 +627,49 @@ u64 read_system_reg(u32 id) return regp->sys_val; } +/* + * __raw_read_system_reg() - Used by a STARTING cpu before cpuinfo is populated. + * Read the system register on the current CPU + */ +static u64 __raw_read_system_reg(u32 sys_id) +{ + switch (sys_id) { + case SYS_ID_PFR0_EL1: return read_cpuid(ID_PFR0_EL1); + case SYS_ID_PFR1_EL1: return read_cpuid(ID_PFR1_EL1); + case SYS_ID_DFR0_EL1: return read_cpuid(ID_DFR0_EL1); + case SYS_ID_MMFR0_EL1: return read_cpuid(ID_MMFR0_EL1); + case SYS_ID_MMFR1_EL1: return read_cpuid(ID_MMFR1_EL1); + case SYS_ID_MMFR2_EL1: return read_cpuid(ID_MMFR2_EL1); + case SYS_ID_MMFR3_EL1: return read_cpuid(ID_MMFR3_EL1); + case SYS_ID_ISAR0_EL1: return read_cpuid(ID_ISAR0_EL1); + case SYS_ID_ISAR1_EL1: return read_cpuid(ID_ISAR1_EL1); + case SYS_ID_ISAR2_EL1: return read_cpuid(ID_ISAR2_EL1); + case SYS_ID_ISAR3_EL1: return read_cpuid(ID_ISAR3_EL1); + case SYS_ID_ISAR4_EL1: return read_cpuid(ID_ISAR4_EL1); + case SYS_ID_ISAR5_EL1: return read_cpuid(ID_ISAR4_EL1); + case SYS_MVFR0_EL1: return read_cpuid(MVFR0_EL1); + case SYS_MVFR1_EL1: return read_cpuid(MVFR1_EL1); + case SYS_MVFR2_EL1: return read_cpuid(MVFR2_EL1); + + case SYS_ID_AA64PFR0_EL1: return read_cpuid(ID_AA64PFR0_EL1); + case SYS_ID_AA64PFR1_EL1: return read_cpuid(ID_AA64PFR0_EL1); + case SYS_ID_AA64DFR0_EL1: return read_cpuid(ID_AA64DFR0_EL1); + case SYS_ID_AA64DFR1_EL1: return read_cpuid(ID_AA64DFR0_EL1); + case SYS_ID_AA64MMFR0_EL1: return read_cpuid(ID_AA64MMFR0_EL1); + case SYS_ID_AA64MMFR1_EL1: return read_cpuid(ID_AA64MMFR1_EL1); + case SYS_ID_AA64MMFR2_EL1: return read_cpuid(ID_AA64MMFR2_EL1); + case SYS_ID_AA64ISAR0_EL1: return read_cpuid(ID_AA64ISAR0_EL1); + case SYS_ID_AA64ISAR1_EL1: return read_cpuid(ID_AA64ISAR1_EL1); + + case SYS_CNTFRQ_EL0: return read_cpuid(CNTFRQ_EL0); + case SYS_CTR_EL0: return read_cpuid(CTR_EL0); + case SYS_DCZID_EL0: return read_cpuid(DCZID_EL0); + default: + BUG(); + return 0; + } +} + #include <linux/irqchip/arm-gic-v3.h> static bool @@ -625,19 +681,24 @@ feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry) } static bool -has_cpuid_feature(const struct arm64_cpu_capabilities *entry) +has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) { u64 val; - val = read_system_reg(entry->sys_reg); + WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible()); + if (scope == SCOPE_SYSTEM) + val = read_system_reg(entry->sys_reg); + else + val = __raw_read_system_reg(entry->sys_reg); + return feature_matches(val, entry); } -static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry) +static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, int scope) { bool has_sre; - if (!has_cpuid_feature(entry)) + if (!has_cpuid_feature(entry, scope)) return false; has_sre = gic_enable_sre(); @@ -648,7 +709,7 @@ static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry) return has_sre; } -static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry) +static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry, int __unused) { u32 midr = read_cpuid_id(); u32 rv_min, rv_max; @@ -660,7 +721,7 @@ static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry) return MIDR_IS_CPU_MODEL_RANGE(midr, MIDR_THUNDERX, rv_min, rv_max); } -static bool runs_at_el2(const struct arm64_cpu_capabilities *entry) +static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused) { return is_kernel_in_hyp_mode(); } @@ -669,6 +730,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", .capability = ARM64_HAS_SYSREG_GIC_CPUIF, + .def_scope = SCOPE_SYSTEM, .matches = has_useable_gicv3_cpuif, .sys_reg = SYS_ID_AA64PFR0_EL1, .field_pos = ID_AA64PFR0_GIC_SHIFT, @@ -679,6 +741,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "Privileged Access Never", .capability = ARM64_HAS_PAN, + .def_scope = SCOPE_SYSTEM, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64MMFR1_EL1, .field_pos = ID_AA64MMFR1_PAN_SHIFT, @@ -691,6 +754,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "LSE atomic instructions", .capability = ARM64_HAS_LSE_ATOMICS, + .def_scope = SCOPE_SYSTEM, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64ISAR0_EL1, .field_pos = ID_AA64ISAR0_ATOMICS_SHIFT, @@ -701,12 +765,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "Software prefetching using PRFM", .capability = ARM64_HAS_NO_HW_PREFETCH, + .def_scope = SCOPE_SYSTEM, .matches = has_no_hw_prefetch, }, #ifdef CONFIG_ARM64_UAO { .desc = "User Access Override", .capability = ARM64_HAS_UAO, + .def_scope = SCOPE_SYSTEM, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64MMFR2_EL1, .field_pos = ID_AA64MMFR2_UAO_SHIFT, @@ -717,20 +783,33 @@ static const struct arm64_cpu_capabilities arm64_features[] = { #ifdef CONFIG_ARM64_PAN { .capability = ARM64_ALT_PAN_NOT_UAO, + .def_scope = SCOPE_SYSTEM, .matches = cpufeature_pan_not_uao, }, #endif /* CONFIG_ARM64_PAN */ { .desc = "Virtualization Host Extensions", .capability = ARM64_HAS_VIRT_HOST_EXTN, + .def_scope = SCOPE_SYSTEM, .matches = runs_at_el2, }, + { + .desc = "32-bit EL0 Support", + .capability = ARM64_HAS_32BIT_EL0, + .def_scope = SCOPE_SYSTEM, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR0_EL0_SHIFT, + .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, + }, {}, }; #define HWCAP_CAP(reg, field, s, min_value, type, cap) \ { \ .desc = #cap, \ + .def_scope = SCOPE_SYSTEM, \ .matches = has_cpuid_feature, \ .sys_reg = reg, \ .field_pos = field, \ @@ -740,7 +819,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .hwcap = cap, \ } -static const struct arm64_cpu_capabilities arm64_hwcaps[] = { +static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_PMULL), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_AES), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA1), @@ -751,6 +830,10 @@ static const struct arm64_cpu_capabilities arm64_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP), + {}, +}; + +static const struct arm64_cpu_capabilities compat_elf_hwcaps[] = { #ifdef CONFIG_COMPAT HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL), HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES), @@ -761,7 +844,7 @@ static const struct arm64_cpu_capabilities arm64_hwcaps[] = { {}, }; -static void __init cap_set_hwcap(const struct arm64_cpu_capabilities *cap) +static void __init cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case CAP_HWCAP: @@ -782,7 +865,7 @@ static void __init cap_set_hwcap(const struct arm64_cpu_capabilities *cap) } /* Check if we have a particular HWCAP enabled */ -static bool __maybe_unused cpus_have_hwcap(const struct arm64_cpu_capabilities *cap) +static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) { bool rc; @@ -806,28 +889,23 @@ static bool __maybe_unused cpus_have_hwcap(const struct arm64_cpu_capabilities * return rc; } -static void __init setup_cpu_hwcaps(void) +static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) { - int i; - const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps; - - for (i = 0; hwcaps[i].matches; i++) - if (hwcaps[i].matches(&hwcaps[i])) - cap_set_hwcap(&hwcaps[i]); + for (; hwcaps->matches; hwcaps++) + if (hwcaps->matches(hwcaps, hwcaps->def_scope)) + cap_set_elf_hwcap(hwcaps); } void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, const char *info) { - int i; - - for (i = 0; caps[i].matches; i++) { - if (!caps[i].matches(&caps[i])) + for (; caps->matches; caps++) { + if (!caps->matches(caps, caps->def_scope)) continue; - if (!cpus_have_cap(caps[i].capability) && caps[i].desc) - pr_info("%s %s\n", info, caps[i].desc); - cpus_set_cap(caps[i].capability); + if (!cpus_have_cap(caps->capability) && caps->desc) + pr_info("%s %s\n", info, caps->desc); + cpus_set_cap(caps->capability); } } @@ -838,11 +916,9 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, static void __init enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) { - int i; - - for (i = 0; caps[i].matches; i++) - if (caps[i].enable && cpus_have_cap(caps[i].capability)) - on_each_cpu(caps[i].enable, NULL, true); + for (; caps->matches; caps++) + if (caps->enable && cpus_have_cap(caps->capability)) + on_each_cpu(caps->enable, NULL, true); } /* @@ -861,54 +937,45 @@ static inline void set_sys_caps_initialised(void) } /* - * __raw_read_system_reg() - Used by a STARTING cpu before cpuinfo is populated. + * Check for CPU features that are used in early boot + * based on the Boot CPU value. */ -static u64 __raw_read_system_reg(u32 sys_id) +static void check_early_cpu_features(void) { - switch (sys_id) { - case SYS_ID_PFR0_EL1: return read_cpuid(ID_PFR0_EL1); - case SYS_ID_PFR1_EL1: return read_cpuid(ID_PFR1_EL1); - case SYS_ID_DFR0_EL1: return read_cpuid(ID_DFR0_EL1); - case SYS_ID_MMFR0_EL1: return read_cpuid(ID_MMFR0_EL1); - case SYS_ID_MMFR1_EL1: return read_cpuid(ID_MMFR1_EL1); - case SYS_ID_MMFR2_EL1: return read_cpuid(ID_MMFR2_EL1); - case SYS_ID_MMFR3_EL1: return read_cpuid(ID_MMFR3_EL1); - case SYS_ID_ISAR0_EL1: return read_cpuid(ID_ISAR0_EL1); - case SYS_ID_ISAR1_EL1: return read_cpuid(ID_ISAR1_EL1); - case SYS_ID_ISAR2_EL1: return read_cpuid(ID_ISAR2_EL1); - case SYS_ID_ISAR3_EL1: return read_cpuid(ID_ISAR3_EL1); - case SYS_ID_ISAR4_EL1: return read_cpuid(ID_ISAR4_EL1); - case SYS_ID_ISAR5_EL1: return read_cpuid(ID_ISAR4_EL1); - case SYS_MVFR0_EL1: return read_cpuid(MVFR0_EL1); - case SYS_MVFR1_EL1: return read_cpuid(MVFR1_EL1); - case SYS_MVFR2_EL1: return read_cpuid(MVFR2_EL1); + verify_cpu_run_el(); + verify_cpu_asid_bits(); +} - case SYS_ID_AA64PFR0_EL1: return read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64PFR1_EL1: return read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64DFR0_EL1: return read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64DFR1_EL1: return read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64MMFR0_EL1: return read_cpuid(ID_AA64MMFR0_EL1); - case SYS_ID_AA64MMFR1_EL1: return read_cpuid(ID_AA64MMFR1_EL1); - case SYS_ID_AA64MMFR2_EL1: return read_cpuid(ID_AA64MMFR2_EL1); - case SYS_ID_AA64ISAR0_EL1: return read_cpuid(ID_AA64ISAR0_EL1); - case SYS_ID_AA64ISAR1_EL1: return read_cpuid(ID_AA64ISAR1_EL1); +static void +verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) +{ - case SYS_CNTFRQ_EL0: return read_cpuid(CNTFRQ_EL0); - case SYS_CTR_EL0: return read_cpuid(CTR_EL0); - case SYS_DCZID_EL0: return read_cpuid(DCZID_EL0); - default: - BUG(); - return 0; - } + for (; caps->matches; caps++) + if (cpus_have_elf_hwcap(caps) && !caps->matches(caps, SCOPE_LOCAL_CPU)) { + pr_crit("CPU%d: missing HWCAP: %s\n", + smp_processor_id(), caps->desc); + cpu_die_early(); + } } -/* - * Check for CPU features that are used in early boot - * based on the Boot CPU value. - */ -static void check_early_cpu_features(void) +static void +verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) { - verify_cpu_asid_bits(); + for (; caps->matches; caps++) { + if (!cpus_have_cap(caps->capability)) + continue; + /* + * If the new CPU misses an advertised feature, we cannot proceed + * further, park the cpu. + */ + if (!caps->matches(caps, SCOPE_LOCAL_CPU)) { + pr_crit("CPU%d: missing feature: %s\n", + smp_processor_id(), caps->desc); + cpu_die_early(); + } + if (caps->enable) + caps->enable(NULL); + } } /* @@ -921,8 +988,6 @@ static void check_early_cpu_features(void) */ void verify_local_cpu_capabilities(void) { - int i; - const struct arm64_cpu_capabilities *caps; check_early_cpu_features(); @@ -933,32 +998,11 @@ void verify_local_cpu_capabilities(void) if (!sys_caps_initialised) return; - caps = arm64_features; - for (i = 0; caps[i].matches; i++) { - if (!cpus_have_cap(caps[i].capability) || !caps[i].sys_reg) - continue; - /* - * If the new CPU misses an advertised feature, we cannot proceed - * further, park the cpu. - */ - if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i])) { - pr_crit("CPU%d: missing feature: %s\n", - smp_processor_id(), caps[i].desc); - cpu_die_early(); - } - if (caps[i].enable) - caps[i].enable(NULL); - } - - for (i = 0, caps = arm64_hwcaps; caps[i].matches; i++) { - if (!cpus_have_hwcap(&caps[i])) - continue; - if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i])) { - pr_crit("CPU%d: missing HWCAP: %s\n", - smp_processor_id(), caps[i].desc); - cpu_die_early(); - } - } + verify_local_cpu_errata(); + verify_local_cpu_features(arm64_features); + verify_local_elf_hwcaps(arm64_elf_hwcaps); + if (system_supports_32bit_el0()) + verify_local_elf_hwcaps(compat_elf_hwcaps); } static void __init setup_feature_capabilities(void) @@ -967,6 +1011,24 @@ static void __init setup_feature_capabilities(void) enable_cpu_capabilities(arm64_features); } +/* + * Check if the current CPU has a given feature capability. + * Should be called from non-preemptible context. + */ +bool this_cpu_has_cap(unsigned int cap) +{ + const struct arm64_cpu_capabilities *caps; + + if (WARN_ON(preemptible())) + return false; + + for (caps = arm64_features; caps->desc; caps++) + if (caps->capability == cap && caps->matches) + return caps->matches(caps, SCOPE_LOCAL_CPU); + + return false; +} + void __init setup_cpu_features(void) { u32 cwg; @@ -974,7 +1036,10 @@ void __init setup_cpu_features(void) /* Set the CPU feature capabilies */ setup_feature_capabilities(); - setup_cpu_hwcaps(); + setup_elf_hwcaps(arm64_elf_hwcaps); + + if (system_supports_32bit_el0()) + setup_elf_hwcaps(compat_elf_hwcaps); /* Advertise that we have computed the system capabilities */ set_sys_caps_initialised(); @@ -993,7 +1058,7 @@ void __init setup_cpu_features(void) } static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry) +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) { return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO)); } diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c index 9047cab6..e11857f 100644 --- a/arch/arm64/kernel/cpuidle.c +++ b/arch/arm64/kernel/cpuidle.c @@ -19,7 +19,8 @@ int __init arm_cpuidle_init(unsigned int cpu) { int ret = -EOPNOTSUPP; - if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_init_idle) + if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_suspend && + cpu_ops[cpu]->cpu_init_idle) ret = cpu_ops[cpu]->cpu_init_idle(cpu); return ret; @@ -36,11 +37,5 @@ int arm_cpuidle_suspend(int index) { int cpu = smp_processor_id(); - /* - * If cpu_ops have not been registered or suspend - * has not been initialized, cpu_suspend call fails early. - */ - if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_suspend) - return -EOPNOTSUPP; return cpu_ops[cpu]->cpu_suspend(index); } diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 84c8684..3808470 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -87,7 +87,8 @@ static const char *const compat_hwcap_str[] = { "idivt", "vfpd32", "lpae", - "evtstrm" + "evtstrm", + NULL }; static const char *const compat_hwcap2_str[] = { @@ -216,23 +217,26 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); - info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); - info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); - info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); - info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); - info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); - info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); - info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); - info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); - info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); - info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); - info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); - info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); - info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); - - info->reg_mvfr0 = read_cpuid(MVFR0_EL1); - info->reg_mvfr1 = read_cpuid(MVFR1_EL1); - info->reg_mvfr2 = read_cpuid(MVFR2_EL1); + /* Update the 32bit ID registers only if AArch32 is implemented */ + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { + info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); + info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); + info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); + info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); + info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); + info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); + info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); + info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); + info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); + info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); + info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); + info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); + info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); + + info->reg_mvfr0 = read_cpuid(MVFR0_EL1); + info->reg_mvfr1 = read_cpuid(MVFR1_EL1); + info->reg_mvfr2 = read_cpuid(MVFR2_EL1); + } cpuinfo_detect_icache_policy(info); diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index c45f296..4fbf3c5 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -135,9 +135,8 @@ static void clear_os_lock(void *unused) static int os_lock_notify(struct notifier_block *self, unsigned long action, void *data) { - int cpu = (unsigned long)data; if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE) - smp_call_function_single(cpu, clear_os_lock, NULL, 1); + clear_os_lock(NULL); return NOTIFY_OK; } diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index cae3112..e88c064 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -62,7 +62,7 @@ ENTRY(entry) */ mov x20, x0 // DTB address ldr x0, [sp, #16] // relocated _text address - movz x21, #:abs_g0:stext_offset + ldr w21, =stext_offset add x21, x0, x21 /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 85da0f5..2c6e598 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -25,6 +25,7 @@ #include <linux/irqchip/arm-gic-v3.h> #include <asm/assembler.h> +#include <asm/boot.h> #include <asm/ptrace.h> #include <asm/asm-offsets.h> #include <asm/cache.h> @@ -51,9 +52,6 @@ #error TEXT_OFFSET must be less than 2MB #endif -#define KERNEL_START _text -#define KERNEL_END _end - /* * Kernel startup entry point. * --------------------------- @@ -102,8 +100,6 @@ _head: #endif #ifdef CONFIG_EFI - .globl __efistub_stext_offset - .set __efistub_stext_offset, stext - _head .align 3 pe_header: .ascii "PE" @@ -123,11 +119,11 @@ optional_header: .short 0x20b // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion - .long _end - stext // SizeOfCode + .long _end - efi_header_end // SizeOfCode .long 0 // SizeOfInitializedData .long 0 // SizeOfUninitializedData .long __efistub_entry - _head // AddressOfEntryPoint - .long __efistub_stext_offset // BaseOfCode + .long efi_header_end - _head // BaseOfCode extra_header_fields: .quad 0 // ImageBase @@ -144,7 +140,7 @@ extra_header_fields: .long _end - _head // SizeOfImage // Everything before the kernel image is considered part of the header - .long __efistub_stext_offset // SizeOfHeaders + .long efi_header_end - _head // SizeOfHeaders .long 0 // CheckSum .short 0xa // Subsystem (EFI application) .short 0 // DllCharacteristics @@ -188,10 +184,10 @@ section_table: .byte 0 .byte 0 .byte 0 // end of 0 padding of section name - .long _end - stext // VirtualSize - .long __efistub_stext_offset // VirtualAddress - .long _edata - stext // SizeOfRawData - .long __efistub_stext_offset // PointerToRawData + .long _end - efi_header_end // VirtualSize + .long efi_header_end - _head // VirtualAddress + .long _edata - efi_header_end // SizeOfRawData + .long efi_header_end - _head // PointerToRawData .long 0 // PointerToRelocations (0 for executables) .long 0 // PointerToLineNumbers (0 for executables) @@ -200,20 +196,23 @@ section_table: .long 0xe0500020 // Characteristics (section flags) /* - * EFI will load stext onwards at the 4k section alignment + * EFI will load .text onwards at the 4k section alignment * described in the PE/COFF header. To ensure that instruction * sequences using an adrp and a :lo12: immediate will function - * correctly at this alignment, we must ensure that stext is + * correctly at this alignment, we must ensure that .text is * placed at a 4k boundary in the Image to begin with. */ .align 12 +efi_header_end: #endif + __INIT + ENTRY(stext) bl preserve_boot_args bl el2_setup // Drop to EL1, w20=cpu_boot_mode - mov x23, xzr // KASLR offset, defaults to 0 adrp x24, __PHYS_OFFSET + and x23, x24, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl set_cpu_boot_mode_flag bl __create_page_tables // x25=TTBR0, x26=TTBR1 /* @@ -222,13 +221,11 @@ ENTRY(stext) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ - ldr x27, 0f // address to jump to after + bl __cpu_setup // initialise processor + adr_l x27, __primary_switch // address to jump to after // MMU has been enabled - adr_l lr, __enable_mmu // return (PIC) address - b __cpu_setup // initialise processor + b __enable_mmu ENDPROC(stext) - .align 3 -0: .quad __mmap_switched - (_head - TEXT_OFFSET) + KIMAGE_VADDR /* * Preserve the arguments passed by the bootloader in x0 .. x3 @@ -338,7 +335,7 @@ __create_page_tables: cmp x0, x6 b.lo 1b - ldr x7, =SWAPPER_MM_MMUFLAGS + mov x7, SWAPPER_MM_MMUFLAGS /* * Create the identity mapping. @@ -394,12 +391,13 @@ __create_page_tables: * Map the kernel image (starting with PHYS_OFFSET). */ mov x0, x26 // swapper_pg_dir - ldr x5, =KIMAGE_VADDR + mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text) add x5, x5, x23 // add KASLR displacement create_pgd_entry x0, x5, x3, x6 - ldr w6, kernel_img_size - add x6, x6, x5 - mov x3, x24 // phys offset + adrp x6, _end // runtime __pa(_end) + adrp x3, _text // runtime __pa(_text) + sub x6, x6, x3 // _end - _text + add x6, x6, x5 // runtime __va(_end) create_block_map x0, x7, x3, x5, x6 /* @@ -414,16 +412,13 @@ __create_page_tables: ret x28 ENDPROC(__create_page_tables) - -kernel_img_size: - .long _end - (_head - TEXT_OFFSET) .ltorg /* * The following fragment of code is executed with the MMU enabled. */ .set initial_sp, init_thread_union + THREAD_START_SP -__mmap_switched: +__primary_switched: mov x28, lr // preserve LR adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address @@ -437,44 +432,6 @@ __mmap_switched: bl __pi_memset dsb ishst // Make zero page visible to PTW -#ifdef CONFIG_RELOCATABLE - - /* - * Iterate over each entry in the relocation table, and apply the - * relocations in place. - */ - adr_l x8, __dynsym_start // start of symbol table - adr_l x9, __reloc_start // start of reloc table - adr_l x10, __reloc_end // end of reloc table - -0: cmp x9, x10 - b.hs 2f - ldp x11, x12, [x9], #24 - ldr x13, [x9, #-8] - cmp w12, #R_AARCH64_RELATIVE - b.ne 1f - add x13, x13, x23 // relocate - str x13, [x11, x23] - b 0b - -1: cmp w12, #R_AARCH64_ABS64 - b.ne 0b - add x12, x12, x12, lsl #1 // symtab offset: 24x top word - add x12, x8, x12, lsr #(32 - 3) // ... shifted into bottom word - ldrsh w14, [x12, #6] // Elf64_Sym::st_shndx - ldr x15, [x12, #8] // Elf64_Sym::st_value - cmp w14, #-0xf // SHN_ABS (0xfff1) ? - add x14, x15, x23 // relocate - csel x15, x14, x15, ne - add x15, x13, x15 - str x15, [x11, x23] - b 0b - -2: adr_l x8, kimage_vaddr // make relocated kimage_vaddr - dc cvac, x8 // value visible to secondaries - dsb sy // with MMU off -#endif - adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) @@ -490,17 +447,19 @@ __mmap_switched: bl kasan_early_init #endif #ifdef CONFIG_RANDOMIZE_BASE - cbnz x23, 0f // already running randomized? + tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? + b.ne 0f mov x0, x21 // pass FDT address in x0 + mov x1, x23 // pass modulo offset in x1 bl kaslr_early_init // parse FDT for KASLR options cbz x0, 0f // KASLR disabled? just proceed - mov x23, x0 // record KASLR offset + orr x23, x23, x0 // record KASLR offset ret x28 // we must enable KASLR, return // to __enable_mmu() 0: #endif b start_kernel -ENDPROC(__mmap_switched) +ENDPROC(__primary_switched) /* * end early head section, begin head code that is also used for @@ -650,7 +609,7 @@ ENDPROC(el2_setup) * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed * in x20. See arch/arm64/include/asm/virt.h for more info. */ -ENTRY(set_cpu_boot_mode_flag) +set_cpu_boot_mode_flag: adr_l x1, __boot_cpu_mode cmp w20, #BOOT_CPU_MODE_EL2 b.ne 1f @@ -683,7 +642,7 @@ ENTRY(secondary_holding_pen) bl el2_setup // Drop to EL1, w20=cpu_boot_mode bl set_cpu_boot_mode_flag mrs x0, mpidr_el1 - ldr x1, =MPIDR_HWID_BITMASK + mov_q x1, MPIDR_HWID_BITMASK and x0, x0, x1 adr_l x3, secondary_holding_pen_release pen: ldr x4, [x3] @@ -703,7 +662,7 @@ ENTRY(secondary_entry) b secondary_startup ENDPROC(secondary_entry) -ENTRY(secondary_startup) +secondary_startup: /* * Common entry point for secondary CPUs. */ @@ -711,14 +670,11 @@ ENTRY(secondary_startup) adrp x26, swapper_pg_dir bl __cpu_setup // initialise processor - ldr x8, kimage_vaddr - ldr w9, 0f - sub x27, x8, w9, sxtw // address to jump to after enabling the MMU + adr_l x27, __secondary_switch // address to jump to after enabling the MMU b __enable_mmu ENDPROC(secondary_startup) -0: .long (_text - TEXT_OFFSET) - __secondary_switched -ENTRY(__secondary_switched) +__secondary_switched: adr_l x5, vectors msr vbar_el1, x5 isb @@ -768,7 +724,7 @@ ENTRY(__early_cpu_boot_status) * If it isn't, park the CPU */ .section ".idmap.text", "ax" -__enable_mmu: +ENTRY(__enable_mmu) mrs x22, sctlr_el1 // preserve old SCTLR_EL1 value mrs x1, ID_AA64MMFR0_EL1 ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4 @@ -806,7 +762,6 @@ __enable_mmu: ic iallu // flush instructions fetched dsb nsh // via old mapping isb - add x27, x27, x23 // relocated __mmap_switched #endif br x27 ENDPROC(__enable_mmu) @@ -819,3 +774,53 @@ __no_granule_support: wfi b 1b ENDPROC(__no_granule_support) + +__primary_switch: +#ifdef CONFIG_RELOCATABLE + /* + * Iterate over each entry in the relocation table, and apply the + * relocations in place. + */ + ldr w8, =__dynsym_offset // offset to symbol table + ldr w9, =__rela_offset // offset to reloc table + ldr w10, =__rela_size // size of reloc table + + mov_q x11, KIMAGE_VADDR // default virtual offset + add x11, x11, x23 // actual virtual offset + add x8, x8, x11 // __va(.dynsym) + add x9, x9, x11 // __va(.rela) + add x10, x9, x10 // __va(.rela) + sizeof(.rela) + +0: cmp x9, x10 + b.hs 2f + ldp x11, x12, [x9], #24 + ldr x13, [x9, #-8] + cmp w12, #R_AARCH64_RELATIVE + b.ne 1f + add x13, x13, x23 // relocate + str x13, [x11, x23] + b 0b + +1: cmp w12, #R_AARCH64_ABS64 + b.ne 0b + add x12, x12, x12, lsl #1 // symtab offset: 24x top word + add x12, x8, x12, lsr #(32 - 3) // ... shifted into bottom word + ldrsh w14, [x12, #6] // Elf64_Sym::st_shndx + ldr x15, [x12, #8] // Elf64_Sym::st_value + cmp w14, #-0xf // SHN_ABS (0xfff1) ? + add x14, x15, x23 // relocate + csel x15, x14, x15, ne + add x15, x13, x15 + str x15, [x11, x23] + b 0b + +2: +#endif + ldr x8, =__primary_switched + br x8 +ENDPROC(__primary_switch) + +__secondary_switch: + ldr x8, =__secondary_switched + br x8 +ENDPROC(__secondary_switch) diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S new file mode 100644 index 0000000..46f29b6 --- /dev/null +++ b/arch/arm64/kernel/hibernate-asm.S @@ -0,0 +1,176 @@ +/* + * Hibernate low-level support + * + * Copyright (C) 2016 ARM Ltd. + * Author: James Morse <james.morse@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#include <linux/linkage.h> +#include <linux/errno.h> + +#include <asm/asm-offsets.h> +#include <asm/assembler.h> +#include <asm/cputype.h> +#include <asm/memory.h> +#include <asm/page.h> +#include <asm/virt.h> + +/* + * To prevent the possibility of old and new partial table walks being visible + * in the tlb, switch the ttbr to a zero page when we invalidate the old + * records. D4.7.1 'General TLB maintenance requirements' in ARM DDI 0487A.i + * Even switching to our copied tables will cause a changed output address at + * each stage of the walk. + */ +.macro break_before_make_ttbr_switch zero_page, page_table + msr ttbr1_el1, \zero_page + isb + tlbi vmalle1is + dsb ish + msr ttbr1_el1, \page_table + isb +.endm + + +/* + * Resume from hibernate + * + * Loads temporary page tables then restores the memory image. + * Finally branches to cpu_resume() to restore the state saved by + * swsusp_arch_suspend(). + * + * Because this code has to be copied to a 'safe' page, it can't call out to + * other functions by PC-relative address. Also remember that it may be + * mid-way through over-writing other functions. For this reason it contains + * code from flush_icache_range() and uses the copy_page() macro. + * + * This 'safe' page is mapped via ttbr0, and executed from there. This function + * switches to a copy of the linear map in ttbr1, performs the restore, then + * switches ttbr1 to the original kernel's swapper_pg_dir. + * + * All of memory gets written to, including code. We need to clean the kernel + * text to the Point of Coherence (PoC) before secondary cores can be booted. + * Because the kernel modules and executable pages mapped to user space are + * also written as data, we clean all pages we touch to the Point of + * Unification (PoU). + * + * x0: physical address of temporary page tables + * x1: physical address of swapper page tables + * x2: address of cpu_resume + * x3: linear map address of restore_pblist in the current kernel + * x4: physical address of __hyp_stub_vectors, or 0 + * x5: physical address of a zero page that remains zero after resume + */ +.pushsection ".hibernate_exit.text", "ax" +ENTRY(swsusp_arch_suspend_exit) + /* + * We execute from ttbr0, change ttbr1 to our copied linear map tables + * with a break-before-make via the zero page + */ + break_before_make_ttbr_switch x5, x0 + + mov x21, x1 + mov x30, x2 + mov x24, x4 + mov x25, x5 + + /* walk the restore_pblist and use copy_page() to over-write memory */ + mov x19, x3 + +1: ldr x10, [x19, #HIBERN_PBE_ORIG] + mov x0, x10 + ldr x1, [x19, #HIBERN_PBE_ADDR] + + copy_page x0, x1, x2, x3, x4, x5, x6, x7, x8, x9 + + add x1, x10, #PAGE_SIZE + /* Clean the copied page to PoU - based on flush_icache_range() */ + dcache_line_size x2, x3 + sub x3, x2, #1 + bic x4, x10, x3 +2: dc cvau, x4 /* clean D line / unified line */ + add x4, x4, x2 + cmp x4, x1 + b.lo 2b + + ldr x19, [x19, #HIBERN_PBE_NEXT] + cbnz x19, 1b + dsb ish /* wait for PoU cleaning to finish */ + + /* switch to the restored kernels page tables */ + break_before_make_ttbr_switch x25, x21 + + ic ialluis + dsb ish + isb + + cbz x24, 3f /* Do we need to re-initialise EL2? */ + hvc #0 +3: ret + + .ltorg +ENDPROC(swsusp_arch_suspend_exit) + +/* + * Restore the hyp stub. + * This must be done before the hibernate page is unmapped by _cpu_resume(), + * but happens before any of the hyp-stub's code is cleaned to PoC. + * + * x24: The physical address of __hyp_stub_vectors + */ +el1_sync: + msr vbar_el2, x24 + eret +ENDPROC(el1_sync) + +.macro invalid_vector label +\label: + b \label +ENDPROC(\label) +.endm + + invalid_vector el2_sync_invalid + invalid_vector el2_irq_invalid + invalid_vector el2_fiq_invalid + invalid_vector el2_error_invalid + invalid_vector el1_sync_invalid + invalid_vector el1_irq_invalid + invalid_vector el1_fiq_invalid + invalid_vector el1_error_invalid + +/* el2 vectors - switch el2 here while we restore the memory image. */ + .align 11 +ENTRY(hibernate_el2_vectors) + ventry el2_sync_invalid // Synchronous EL2t + ventry el2_irq_invalid // IRQ EL2t + ventry el2_fiq_invalid // FIQ EL2t + ventry el2_error_invalid // Error EL2t + + ventry el2_sync_invalid // Synchronous EL2h + ventry el2_irq_invalid // IRQ EL2h + ventry el2_fiq_invalid // FIQ EL2h + ventry el2_error_invalid // Error EL2h + + ventry el1_sync // Synchronous 64-bit EL1 + ventry el1_irq_invalid // IRQ 64-bit EL1 + ventry el1_fiq_invalid // FIQ 64-bit EL1 + ventry el1_error_invalid // Error 64-bit EL1 + + ventry el1_sync_invalid // Synchronous 32-bit EL1 + ventry el1_irq_invalid // IRQ 32-bit EL1 + ventry el1_fiq_invalid // FIQ 32-bit EL1 + ventry el1_error_invalid // Error 32-bit EL1 +END(hibernate_el2_vectors) + +.popsection diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c new file mode 100644 index 0000000..f8df75d --- /dev/null +++ b/arch/arm64/kernel/hibernate.c @@ -0,0 +1,487 @@ +/*: + * Hibernate support specific for ARM64 + * + * Derived from work on ARM hibernation support by: + * + * Ubuntu project, hibernation support for mach-dove + * Copyright (C) 2010 Nokia Corporation (Hiroshi Doyu) + * Copyright (C) 2010 Texas Instruments, Inc. (Teerth Reddy et al.) + * https://lkml.org/lkml/2010/6/18/4 + * https://lists.linux-foundation.org/pipermail/linux-pm/2010-June/027422.html + * https://patchwork.kernel.org/patch/96442/ + * + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * + * License terms: GNU General Public License (GPL) version 2 + */ +#define pr_fmt(x) "hibernate: " x +#include <linux/kvm_host.h> +#include <linux/mm.h> +#include <linux/notifier.h> +#include <linux/pm.h> +#include <linux/sched.h> +#include <linux/suspend.h> +#include <linux/utsname.h> +#include <linux/version.h> + +#include <asm/barrier.h> +#include <asm/cacheflush.h> +#include <asm/irqflags.h> +#include <asm/memory.h> +#include <asm/mmu_context.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/pgtable-hwdef.h> +#include <asm/sections.h> +#include <asm/suspend.h> +#include <asm/virt.h> + +/* + * Hibernate core relies on this value being 0 on resume, and marks it + * __nosavedata assuming it will keep the resume kernel's '0' value. This + * doesn't happen with either KASLR. + * + * defined as "__visible int in_suspend __nosavedata" in + * kernel/power/hibernate.c + */ +extern int in_suspend; + +/* Find a symbols alias in the linear map */ +#define LMADDR(x) phys_to_virt(virt_to_phys(x)) + +/* Do we need to reset el2? */ +#define el2_reset_needed() (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) + +/* + * Start/end of the hibernate exit code, this must be copied to a 'safe' + * location in memory, and executed from there. + */ +extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[]; + +/* temporary el2 vectors in the __hibernate_exit_text section. */ +extern char hibernate_el2_vectors[]; + +/* hyp-stub vectors, used to restore el2 during resume from hibernate. */ +extern char __hyp_stub_vectors[]; + +/* + * Values that may not change over hibernate/resume. We put the build number + * and date in here so that we guarantee not to resume with a different + * kernel. + */ +struct arch_hibernate_hdr_invariants { + char uts_version[__NEW_UTS_LEN + 1]; +}; + +/* These values need to be know across a hibernate/restore. */ +static struct arch_hibernate_hdr { + struct arch_hibernate_hdr_invariants invariants; + + /* These are needed to find the relocated kernel if built with kaslr */ + phys_addr_t ttbr1_el1; + void (*reenter_kernel)(void); + + /* + * We need to know where the __hyp_stub_vectors are after restore to + * re-configure el2. + */ + phys_addr_t __hyp_stub_vectors; +} resume_hdr; + +static inline void arch_hdr_invariants(struct arch_hibernate_hdr_invariants *i) +{ + memset(i, 0, sizeof(*i)); + memcpy(i->uts_version, init_utsname()->version, sizeof(i->uts_version)); +} + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = virt_to_pfn(&__nosave_begin); + unsigned long nosave_end_pfn = virt_to_pfn(&__nosave_end - 1); + + return (pfn >= nosave_begin_pfn) && (pfn <= nosave_end_pfn); +} + +void notrace save_processor_state(void) +{ + WARN_ON(num_online_cpus() != 1); +} + +void notrace restore_processor_state(void) +{ +} + +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct arch_hibernate_hdr *hdr = addr; + + if (max_size < sizeof(*hdr)) + return -EOVERFLOW; + + arch_hdr_invariants(&hdr->invariants); + hdr->ttbr1_el1 = virt_to_phys(swapper_pg_dir); + hdr->reenter_kernel = _cpu_resume; + + /* We can't use __hyp_get_vectors() because kvm may still be loaded */ + if (el2_reset_needed()) + hdr->__hyp_stub_vectors = virt_to_phys(__hyp_stub_vectors); + else + hdr->__hyp_stub_vectors = 0; + + return 0; +} +EXPORT_SYMBOL(arch_hibernation_header_save); + +int arch_hibernation_header_restore(void *addr) +{ + struct arch_hibernate_hdr_invariants invariants; + struct arch_hibernate_hdr *hdr = addr; + + arch_hdr_invariants(&invariants); + if (memcmp(&hdr->invariants, &invariants, sizeof(invariants))) { + pr_crit("Hibernate image not generated by this kernel!\n"); + return -EINVAL; + } + + resume_hdr = *hdr; + + return 0; +} +EXPORT_SYMBOL(arch_hibernation_header_restore); + +/* + * Copies length bytes, starting at src_start into an new page, + * perform cache maintentance, then maps it at the specified address low + * address as executable. + * + * This is used by hibernate to copy the code it needs to execute when + * overwriting the kernel text. This function generates a new set of page + * tables, which it loads into ttbr0. + * + * Length is provided as we probably only want 4K of data, even on a 64K + * page system. + */ +static int create_safe_exec_page(void *src_start, size_t length, + unsigned long dst_addr, + phys_addr_t *phys_dst_addr, + void *(*allocator)(gfp_t mask), + gfp_t mask) +{ + int rc = 0; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long dst = (unsigned long)allocator(mask); + + if (!dst) { + rc = -ENOMEM; + goto out; + } + + memcpy((void *)dst, src_start, length); + flush_icache_range(dst, dst + length); + + pgd = pgd_offset_raw(allocator(mask), dst_addr); + if (pgd_none(*pgd)) { + pud = allocator(mask); + if (!pud) { + rc = -ENOMEM; + goto out; + } + pgd_populate(&init_mm, pgd, pud); + } + + pud = pud_offset(pgd, dst_addr); + if (pud_none(*pud)) { + pmd = allocator(mask); + if (!pmd) { + rc = -ENOMEM; + goto out; + } + pud_populate(&init_mm, pud, pmd); + } + + pmd = pmd_offset(pud, dst_addr); + if (pmd_none(*pmd)) { + pte = allocator(mask); + if (!pte) { + rc = -ENOMEM; + goto out; + } + pmd_populate_kernel(&init_mm, pmd, pte); + } + + pte = pte_offset_kernel(pmd, dst_addr); + set_pte(pte, __pte(virt_to_phys((void *)dst) | + pgprot_val(PAGE_KERNEL_EXEC))); + + /* Load our new page tables */ + asm volatile("msr ttbr0_el1, %0;" + "isb;" + "tlbi vmalle1is;" + "dsb ish;" + "isb" : : "r"(virt_to_phys(pgd))); + + *phys_dst_addr = virt_to_phys((void *)dst); + +out: + return rc; +} + + +int swsusp_arch_suspend(void) +{ + int ret = 0; + unsigned long flags; + struct sleep_stack_data state; + + local_dbg_save(flags); + + if (__cpu_suspend_enter(&state)) { + ret = swsusp_save(); + } else { + /* Clean kernel to PoC for secondary core startup */ + __flush_dcache_area(LMADDR(KERNEL_START), KERNEL_END - KERNEL_START); + + /* + * Tell the hibernation core that we've just restored + * the memory + */ + in_suspend = 0; + + __cpu_suspend_exit(); + } + + local_dbg_restore(flags); + + return ret; +} + +static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start, + unsigned long end) +{ + pte_t *src_pte; + pte_t *dst_pte; + unsigned long addr = start; + + dst_pte = (pte_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pte) + return -ENOMEM; + pmd_populate_kernel(&init_mm, dst_pmd, dst_pte); + dst_pte = pte_offset_kernel(dst_pmd, start); + + src_pte = pte_offset_kernel(src_pmd, start); + do { + if (!pte_none(*src_pte)) + /* + * Resume will overwrite areas that may be marked + * read only (code, rodata). Clear the RDONLY bit from + * the temporary mappings we use during restore. + */ + set_pte(dst_pte, __pte(pte_val(*src_pte) & ~PTE_RDONLY)); + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + + return 0; +} + +static int copy_pmd(pud_t *dst_pud, pud_t *src_pud, unsigned long start, + unsigned long end) +{ + pmd_t *src_pmd; + pmd_t *dst_pmd; + unsigned long next; + unsigned long addr = start; + + if (pud_none(*dst_pud)) { + dst_pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pmd) + return -ENOMEM; + pud_populate(&init_mm, dst_pud, dst_pmd); + } + dst_pmd = pmd_offset(dst_pud, start); + + src_pmd = pmd_offset(src_pud, start); + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*src_pmd)) + continue; + if (pmd_table(*src_pmd)) { + if (copy_pte(dst_pmd, src_pmd, addr, next)) + return -ENOMEM; + } else { + set_pmd(dst_pmd, + __pmd(pmd_val(*src_pmd) & ~PMD_SECT_RDONLY)); + } + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + + return 0; +} + +static int copy_pud(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long start, + unsigned long end) +{ + pud_t *dst_pud; + pud_t *src_pud; + unsigned long next; + unsigned long addr = start; + + if (pgd_none(*dst_pgd)) { + dst_pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pud) + return -ENOMEM; + pgd_populate(&init_mm, dst_pgd, dst_pud); + } + dst_pud = pud_offset(dst_pgd, start); + + src_pud = pud_offset(src_pgd, start); + do { + next = pud_addr_end(addr, end); + if (pud_none(*src_pud)) + continue; + if (pud_table(*(src_pud))) { + if (copy_pmd(dst_pud, src_pud, addr, next)) + return -ENOMEM; + } else { + set_pud(dst_pud, + __pud(pud_val(*src_pud) & ~PMD_SECT_RDONLY)); + } + } while (dst_pud++, src_pud++, addr = next, addr != end); + + return 0; +} + +static int copy_page_tables(pgd_t *dst_pgd, unsigned long start, + unsigned long end) +{ + unsigned long next; + unsigned long addr = start; + pgd_t *src_pgd = pgd_offset_k(start); + + dst_pgd = pgd_offset_raw(dst_pgd, start); + do { + next = pgd_addr_end(addr, end); + if (pgd_none(*src_pgd)) + continue; + if (copy_pud(dst_pgd, src_pgd, addr, next)) + return -ENOMEM; + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + + return 0; +} + +/* + * Setup then Resume from the hibernate image using swsusp_arch_suspend_exit(). + * + * Memory allocated by get_safe_page() will be dealt with by the hibernate code, + * we don't need to free it here. + */ +int swsusp_arch_resume(void) +{ + int rc = 0; + void *zero_page; + size_t exit_size; + pgd_t *tmp_pg_dir; + void *lm_restore_pblist; + phys_addr_t phys_hibernate_exit; + void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *, + void *, phys_addr_t, phys_addr_t); + + /* + * Locate the exit code in the bottom-but-one page, so that *NULL + * still has disastrous affects. + */ + hibernate_exit = (void *)PAGE_SIZE; + exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start; + /* + * Copy swsusp_arch_suspend_exit() to a safe page. This will generate + * a new set of ttbr0 page tables and load them. + */ + rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size, + (unsigned long)hibernate_exit, + &phys_hibernate_exit, + (void *)get_safe_page, GFP_ATOMIC); + if (rc) { + pr_err("Failed to create safe executable page for hibernate_exit code."); + goto out; + } + + /* + * The hibernate exit text contains a set of el2 vectors, that will + * be executed at el2 with the mmu off in order to reload hyp-stub. + */ + __flush_dcache_area(hibernate_exit, exit_size); + + /* + * Restoring the memory image will overwrite the ttbr1 page tables. + * Create a second copy of just the linear map, and use this when + * restoring. + */ + tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!tmp_pg_dir) { + pr_err("Failed to allocate memory for temporary page tables."); + rc = -ENOMEM; + goto out; + } + rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0); + if (rc) + goto out; + + /* + * Since we only copied the linear map, we need to find restore_pblist's + * linear map address. + */ + lm_restore_pblist = LMADDR(restore_pblist); + + /* + * KASLR will cause the el2 vectors to be in a different location in + * the resumed kernel. Load hibernate's temporary copy into el2. + * + * We can skip this step if we booted at EL1, or are running with VHE. + */ + if (el2_reset_needed()) { + phys_addr_t el2_vectors = phys_hibernate_exit; /* base */ + el2_vectors += hibernate_el2_vectors - + __hibernate_exit_text_start; /* offset */ + + __hyp_set_vectors(el2_vectors); + } + + /* + * We need a zero page that is zero before & after resume in order to + * to break before make on the ttbr1 page tables. + */ + zero_page = (void *)get_safe_page(GFP_ATOMIC); + + hibernate_exit(virt_to_phys(tmp_pg_dir), resume_hdr.ttbr1_el1, + resume_hdr.reenter_kernel, lm_restore_pblist, + resume_hdr.__hyp_stub_vectors, virt_to_phys(zero_page)); + +out: + return rc; +} + +static int check_boot_cpu_online_pm_callback(struct notifier_block *nb, + unsigned long action, void *ptr) +{ + if (action == PM_HIBERNATION_PREPARE && + cpumask_first(cpu_online_mask) != 0) { + pr_warn("CPU0 is offline.\n"); + return notifier_from_errno(-ENODEV); + } + + return NOTIFY_OK; +} + +static int __init check_boot_cpu_online_init(void) +{ + /* + * Set this pm_notifier callback with a lower priority than + * cpu_hotplug_pm_callback, so that cpu_hotplug_pm_callback will be + * called earlier to disable cpu hotplug before the cpu online check. + */ + pm_notifier(check_boot_cpu_online_pm_callback, -INT_MAX); + + return 0; +} +core_initcall(check_boot_cpu_online_init); diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 4ef5373..ce21aa8 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -886,9 +886,11 @@ static int hw_breakpoint_reset_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - int cpu = (long)hcpu; - if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE) - smp_call_function_single(cpu, hw_breakpoint_reset, NULL, 1); + if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE) { + local_irq_disable(); + hw_breakpoint_reset(NULL); + local_irq_enable(); + } return NOTIFY_OK; } diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index a272f33..8727f44 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -22,6 +22,8 @@ #include <linux/irqchip/arm-gic-v3.h> #include <asm/assembler.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_asm.h> #include <asm/ptrace.h> #include <asm/virt.h> @@ -53,15 +55,26 @@ ENDPROC(__hyp_stub_vectors) .align 11 el1_sync: - mrs x1, esr_el2 - lsr x1, x1, #26 - cmp x1, #0x16 - b.ne 2f // Not an HVC trap - cbz x0, 1f - msr vbar_el2, x0 // Set vbar_el2 - b 2f -1: mrs x0, vbar_el2 // Return vbar_el2 -2: eret + mrs x30, esr_el2 + lsr x30, x30, #ESR_ELx_EC_SHIFT + + cmp x30, #ESR_ELx_EC_HVC64 + b.ne 9f // Not an HVC trap + + cmp x0, #HVC_GET_VECTORS + b.ne 1f + mrs x0, vbar_el2 + b 9f + +1: cmp x0, #HVC_SET_VECTORS + b.ne 2f + msr vbar_el2, x1 + b 9f + + /* Someone called kvm_call_hyp() against the hyp-stub... */ +2: mov x0, #ARM_EXCEPTION_HYP_GONE + +9: eret ENDPROC(el1_sync) .macro invalid_vector label @@ -101,10 +114,18 @@ ENDPROC(\label) */ ENTRY(__hyp_get_vectors) - mov x0, xzr - // fall through -ENTRY(__hyp_set_vectors) + str lr, [sp, #-16]! + mov x0, #HVC_GET_VECTORS hvc #0 + ldr lr, [sp], #16 ret ENDPROC(__hyp_get_vectors) + +ENTRY(__hyp_set_vectors) + str lr, [sp, #-16]! + mov x1, x0 + mov x0, #HVC_SET_VECTORS + hvc #0 + ldr lr, [sp], #16 + ret ENDPROC(__hyp_set_vectors) diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 1428849a..c7fcb23 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -73,6 +73,8 @@ #ifdef CONFIG_EFI +__efistub_stext_offset = stext - _text; + /* * Prevent the symbol aliases below from being emitted into the kallsyms * table, by forcing them to be absolute symbols (which are conveniently diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 7371455..368c082 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -96,7 +96,7 @@ static void __kprobes *patch_map(void *addr, int fixmap) if (module && IS_ENABLED(CONFIG_DEBUG_SET_MODULE_RONX)) page = vmalloc_to_page(addr); else if (!module && IS_ENABLED(CONFIG_DEBUG_RODATA)) - page = virt_to_page(addr); + page = pfn_to_page(PHYS_PFN(__pa(addr))); else return addr; diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 5829839..b054691 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -74,7 +74,7 @@ extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, * containing function pointers) to be reinitialized, and zero-initialized * .bss variables will be reset to 0. */ -u64 __init kaslr_early_init(u64 dt_phys) +u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset) { void *fdt; u64 seed, offset, mask, module_range; @@ -132,8 +132,8 @@ u64 __init kaslr_early_init(u64 dt_phys) * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this * happens, increase the KASLR offset by the size of the kernel image. */ - if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) != - (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) + if ((((u64)_text + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT) != + (((u64)_end + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT)) offset = (offset + (u64)(_end - _text)) & mask; if (IS_ENABLED(CONFIG_KASAN)) diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index c72de66..3c4e308 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -74,6 +74,16 @@ int raw_pci_write(unsigned int domain, unsigned int bus, return -ENXIO; } +#ifdef CONFIG_NUMA + +int pcibus_to_node(struct pci_bus *bus) +{ + return dev_to_node(&bus->dev); +} +EXPORT_SYMBOL(pcibus_to_node); + +#endif + #ifdef CONFIG_ACPI /* Root bridge scanning */ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 8062482..48eea68 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -265,9 +265,6 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, if (stack_start) { if (is_compat_thread(task_thread_info(p))) childregs->compat_sp = stack_start; - /* 16-byte aligned stack mandatory on AArch64 */ - else if (stack_start & 15) - return -EINVAL; else childregs->sp = stack_start; } @@ -382,13 +379,14 @@ unsigned long arch_align_stack(unsigned long sp) return sp & ~0xf; } -static unsigned long randomize_base(unsigned long base) -{ - unsigned long range_end = base + (STACK_RND_MASK << PAGE_SHIFT) + 1; - return randomize_range(base, range_end, 0) ? : base; -} - unsigned long arch_randomize_brk(struct mm_struct *mm) { - return randomize_base(mm->brk); + unsigned long range_end = mm->brk; + + if (is_compat_task()) + range_end += 0x02000000; + else + range_end += 0x40000000; + + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; } diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 9dc6776..3279def 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -53,6 +53,7 @@ #include <asm/cpufeature.h> #include <asm/cpu_ops.h> #include <asm/kasan.h> +#include <asm/numa.h> #include <asm/sections.h> #include <asm/setup.h> #include <asm/smp_plat.h> @@ -175,7 +176,6 @@ static void __init smp_build_mpidr_hash(void) */ if (mpidr_hash_size() > 4 * num_possible_cpus()) pr_warn("Large number of MPIDR hash buckets detected\n"); - __flush_dcache_area(&mpidr_hash, sizeof(struct mpidr_hash)); } static void __init setup_machine_fdt(phys_addr_t dt_phys) @@ -224,69 +224,6 @@ static void __init request_standard_resources(void) } } -#ifdef CONFIG_BLK_DEV_INITRD -/* - * Relocate initrd if it is not completely within the linear mapping. - * This would be the case if mem= cuts out all or part of it. - */ -static void __init relocate_initrd(void) -{ - phys_addr_t orig_start = __virt_to_phys(initrd_start); - phys_addr_t orig_end = __virt_to_phys(initrd_end); - phys_addr_t ram_end = memblock_end_of_DRAM(); - phys_addr_t new_start; - unsigned long size, to_free = 0; - void *dest; - - if (orig_end <= ram_end) - return; - - /* - * Any of the original initrd which overlaps the linear map should - * be freed after relocating. - */ - if (orig_start < ram_end) - to_free = ram_end - orig_start; - - size = orig_end - orig_start; - if (!size) - return; - - /* initrd needs to be relocated completely inside linear mapping */ - new_start = memblock_find_in_range(0, PFN_PHYS(max_pfn), - size, PAGE_SIZE); - if (!new_start) - panic("Cannot relocate initrd of size %ld\n", size); - memblock_reserve(new_start, size); - - initrd_start = __phys_to_virt(new_start); - initrd_end = initrd_start + size; - - pr_info("Moving initrd from [%llx-%llx] to [%llx-%llx]\n", - orig_start, orig_start + size - 1, - new_start, new_start + size - 1); - - dest = (void *)initrd_start; - - if (to_free) { - memcpy(dest, (void *)__phys_to_virt(orig_start), to_free); - dest += to_free; - } - - copy_from_early_mem(dest, orig_start + to_free, size - to_free); - - if (to_free) { - pr_info("Freeing original RAMDISK from [%llx-%llx]\n", - orig_start, orig_start + to_free - 1); - memblock_free(orig_start, to_free); - } -} -#else -static inline void __init relocate_initrd(void) -{ -} -#endif - u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; void __init setup_arch(char **cmdline_p) @@ -327,7 +264,11 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); paging_init(); - relocate_initrd(); + + if (acpi_disabled) + unflatten_device_tree(); + + bootmem_init(); kasan_init(); @@ -335,12 +276,11 @@ void __init setup_arch(char **cmdline_p) early_ioremap_reset(); - if (acpi_disabled) { - unflatten_device_tree(); + if (acpi_disabled) psci_dt_init(); - } else { + else psci_acpi_init(); - } + xen_early_init(); cpu_read_bootcpu_ops(); @@ -379,6 +319,9 @@ static int __init topology_init(void) { int i; + for_each_online_node(i) + register_one_node(i); + for_each_possible_cpu(i) { struct cpu *cpu = &per_cpu(cpu_data.cpu, i); cpu->hotpluggable = 1; diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index fd10eb6..9a3aec9 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -49,39 +49,32 @@ orr \dst, \dst, \mask // dst|=(aff3>>rs3) .endm /* - * Save CPU state for a suspend and execute the suspend finisher. - * On success it will return 0 through cpu_resume - ie through a CPU - * soft/hard reboot from the reset vector. - * On failure it returns the suspend finisher return value or force - * -EOPNOTSUPP if the finisher erroneously returns 0 (the suspend finisher - * is not allowed to return, if it does this must be considered failure). - * It saves callee registers, and allocates space on the kernel stack - * to save the CPU specific registers + some other data for resume. + * Save CPU state in the provided sleep_stack_data area, and publish its + * location for cpu_resume()'s use in sleep_save_stash. * - * x0 = suspend finisher argument - * x1 = suspend finisher function pointer + * cpu_resume() will restore this saved state, and return. Because the + * link-register is saved and restored, it will appear to return from this + * function. So that the caller can tell the suspend/resume paths apart, + * __cpu_suspend_enter() will always return a non-zero value, whereas the + * path through cpu_resume() will return 0. + * + * x0 = struct sleep_stack_data area */ ENTRY(__cpu_suspend_enter) - stp x29, lr, [sp, #-96]! - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - /* - * Stash suspend finisher and its argument in x20 and x19 - */ - mov x19, x0 - mov x20, x1 + stp x29, lr, [x0, #SLEEP_STACK_DATA_CALLEE_REGS] + stp x19, x20, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+16] + stp x21, x22, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+32] + stp x23, x24, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+48] + stp x25, x26, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+64] + stp x27, x28, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+80] + + /* save the sp in cpu_suspend_ctx */ mov x2, sp - sub sp, sp, #CPU_SUSPEND_SZ // allocate cpu_suspend_ctx - mov x0, sp - /* - * x0 now points to struct cpu_suspend_ctx allocated on the stack - */ - str x2, [x0, #CPU_CTX_SP] - ldr x1, =sleep_save_sp - ldr x1, [x1, #SLEEP_SAVE_SP_VIRT] + str x2, [x0, #SLEEP_STACK_DATA_SYSTEM_REGS + CPU_CTX_SP] + + /* find the mpidr_hash */ + ldr x1, =sleep_save_stash + ldr x1, [x1] mrs x7, mpidr_el1 ldr x9, =mpidr_hash ldr x10, [x9, #MPIDR_HASH_MASK] @@ -93,74 +86,28 @@ ENTRY(__cpu_suspend_enter) ldp w5, w6, [x9, #(MPIDR_HASH_SHIFTS + 8)] compute_mpidr_hash x8, x3, x4, x5, x6, x7, x10 add x1, x1, x8, lsl #3 - bl __cpu_suspend_save - /* - * Grab suspend finisher in x20 and its argument in x19 - */ - mov x0, x19 - mov x1, x20 - /* - * We are ready for power down, fire off the suspend finisher - * in x1, with argument in x0 - */ - blr x1 - /* - * Never gets here, unless suspend finisher fails. - * Successful cpu_suspend should return from cpu_resume, returning - * through this code path is considered an error - * If the return value is set to 0 force x0 = -EOPNOTSUPP - * to make sure a proper error condition is propagated - */ - cmp x0, #0 - mov x3, #-EOPNOTSUPP - csel x0, x3, x0, eq - add sp, sp, #CPU_SUSPEND_SZ // rewind stack pointer - ldp x19, x20, [sp, #16] - ldp x21, x22, [sp, #32] - ldp x23, x24, [sp, #48] - ldp x25, x26, [sp, #64] - ldp x27, x28, [sp, #80] - ldp x29, lr, [sp], #96 + + str x0, [x1] + add x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS + stp x29, lr, [sp, #-16]! + bl cpu_do_suspend + ldp x29, lr, [sp], #16 + mov x0, #1 ret ENDPROC(__cpu_suspend_enter) .ltorg -/* - * x0 must contain the sctlr value retrieved from restored context - */ - .pushsection ".idmap.text", "ax" -ENTRY(cpu_resume_mmu) - ldr x3, =cpu_resume_after_mmu - msr sctlr_el1, x0 // restore sctlr_el1 - isb - /* - * Invalidate the local I-cache so that any instructions fetched - * speculatively from the PoC are discarded, since they may have - * been dynamically patched at the PoU. - */ - ic iallu - dsb nsh - isb - br x3 // global jump to virtual address -ENDPROC(cpu_resume_mmu) - .popsection -cpu_resume_after_mmu: -#ifdef CONFIG_KASAN - mov x0, sp - bl kasan_unpoison_remaining_stack -#endif - mov x0, #0 // return zero on success - ldp x19, x20, [sp, #16] - ldp x21, x22, [sp, #32] - ldp x23, x24, [sp, #48] - ldp x25, x26, [sp, #64] - ldp x27, x28, [sp, #80] - ldp x29, lr, [sp], #96 - ret -ENDPROC(cpu_resume_after_mmu) - ENTRY(cpu_resume) bl el2_setup // if in EL2 drop to EL1 cleanly + /* enable the MMU early - so we can access sleep_save_stash by va */ + adr_l lr, __enable_mmu /* __cpu_setup will return here */ + ldr x27, =_cpu_resume /* __enable_mmu will branch here */ + adrp x25, idmap_pg_dir + adrp x26, swapper_pg_dir + b __cpu_setup +ENDPROC(cpu_resume) + +ENTRY(_cpu_resume) mrs x1, mpidr_el1 adrp x8, mpidr_hash add x8, x8, #:lo12:mpidr_hash // x8 = struct mpidr_hash phys address @@ -170,20 +117,32 @@ ENTRY(cpu_resume) ldp w5, w6, [x8, #(MPIDR_HASH_SHIFTS + 8)] compute_mpidr_hash x7, x3, x4, x5, x6, x1, x2 /* x7 contains hash index, let's use it to grab context pointer */ - ldr_l x0, sleep_save_sp + SLEEP_SAVE_SP_PHYS + ldr_l x0, sleep_save_stash ldr x0, [x0, x7, lsl #3] + add x29, x0, #SLEEP_STACK_DATA_CALLEE_REGS + add x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS /* load sp from context */ ldr x2, [x0, #CPU_CTX_SP] - /* load physical address of identity map page table in x1 */ - adrp x1, idmap_pg_dir mov sp, x2 /* save thread_info */ and x2, x2, #~(THREAD_SIZE - 1) msr sp_el0, x2 /* - * cpu_do_resume expects x0 to contain context physical address - * pointer and x1 to contain physical address of 1:1 page tables + * cpu_do_resume expects x0 to contain context address pointer */ - bl cpu_do_resume // PC relative jump, MMU off - b cpu_resume_mmu // Resume MMU, never returns -ENDPROC(cpu_resume) + bl cpu_do_resume + +#ifdef CONFIG_KASAN + mov x0, sp + bl kasan_unpoison_remaining_stack +#endif + + ldp x19, x20, [x29, #16] + ldp x21, x22, [x29, #32] + ldp x23, x24, [x29, #48] + ldp x25, x26, [x29, #64] + ldp x27, x28, [x29, #80] + ldp x29, lr, [x29] + mov x0, #0 + ret +ENDPROC(_cpu_resume) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b2d5f4e..678e084 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -45,6 +45,7 @@ #include <asm/cputype.h> #include <asm/cpu_ops.h> #include <asm/mmu_context.h> +#include <asm/numa.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/processor.h> @@ -75,6 +76,43 @@ enum ipi_msg_type { IPI_WAKEUP }; +#ifdef CONFIG_ARM64_VHE + +/* Whether the boot CPU is running in HYP mode or not*/ +static bool boot_cpu_hyp_mode; + +static inline void save_boot_cpu_run_el(void) +{ + boot_cpu_hyp_mode = is_kernel_in_hyp_mode(); +} + +static inline bool is_boot_cpu_in_hyp_mode(void) +{ + return boot_cpu_hyp_mode; +} + +/* + * Verify that a secondary CPU is running the kernel at the same + * EL as that of the boot CPU. + */ +void verify_cpu_run_el(void) +{ + bool in_el2 = is_kernel_in_hyp_mode(); + bool boot_cpu_el2 = is_boot_cpu_in_hyp_mode(); + + if (in_el2 ^ boot_cpu_el2) { + pr_crit("CPU%d: mismatched Exception Level(EL%d) with boot CPU(EL%d)\n", + smp_processor_id(), + in_el2 ? 2 : 1, + boot_cpu_el2 ? 2 : 1); + cpu_panic_kernel(); + } +} + +#else +static inline void save_boot_cpu_run_el(void) {} +#endif + #ifdef CONFIG_HOTPLUG_CPU static int op_cpu_kill(unsigned int cpu); #else @@ -166,6 +204,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) static void smp_store_cpu_info(unsigned int cpuid) { store_cpu_topology(cpuid); + numa_store_cpu_info(cpuid); } /* @@ -225,8 +264,6 @@ asmlinkage void secondary_start_kernel(void) pr_info("CPU%u: Booted secondary processor [%08x]\n", cpu, read_cpuid_id()); update_cpu_boot_status(CPU_BOOT_SUCCESS); - /* Make sure the status update is visible before we complete */ - smp_wmb(); set_cpu_online(cpu, true); complete(&cpu_running); @@ -401,6 +438,7 @@ void __init smp_cpus_done(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { cpuinfo_store_boot_cpu(); + save_boot_cpu_run_el(); set_my_cpu_offset(per_cpu_offset(smp_processor_id())); } @@ -595,6 +633,8 @@ static void __init of_parse_and_init_cpus(void) pr_debug("cpu logical map 0x%llx\n", hwid); cpu_logical_map(cpu_count) = hwid; + + early_map_cpu_to_node(cpu_count, of_node_to_nid(dn)); next: cpu_count++; } @@ -647,33 +687,18 @@ void __init smp_init_cpus(void) void __init smp_prepare_cpus(unsigned int max_cpus) { int err; - unsigned int cpu, ncores = num_possible_cpus(); + unsigned int cpu; init_cpu_topology(); smp_store_cpu_info(smp_processor_id()); /* - * are we trying to boot more cores than exist? - */ - if (max_cpus > ncores) - max_cpus = ncores; - - /* Don't bother if we're effectively UP */ - if (max_cpus <= 1) - return; - - /* * Initialise the present map (which describes the set of CPUs * actually populated at the present time) and release the * secondaries from the bootloader. - * - * Make sure we online at most (max_cpus - 1) additional CPUs. */ - max_cpus--; for_each_possible_cpu(cpu) { - if (max_cpus == 0) - break; if (cpu == smp_processor_id()) continue; @@ -686,7 +711,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) continue; set_cpu_present(cpu, true); - max_cpus--; } } @@ -763,21 +787,11 @@ void arch_irq_work_raise(void) } #endif -static DEFINE_RAW_SPINLOCK(stop_lock); - /* * ipi_cpu_stop - handle IPI from smp_send_stop() */ static void ipi_cpu_stop(unsigned int cpu) { - if (system_state == SYSTEM_BOOTING || - system_state == SYSTEM_RUNNING) { - raw_spin_lock(&stop_lock); - pr_crit("CPU%u: stopping\n", cpu); - dump_stack(); - raw_spin_unlock(&stop_lock); - } - set_cpu_online(cpu, false); local_irq_disable(); @@ -872,6 +886,9 @@ void smp_send_stop(void) cpumask_copy(&mask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &mask); + if (system_state == SYSTEM_BOOTING || + system_state == SYSTEM_RUNNING) + pr_crit("SMP: stopping secondary CPUs\n"); smp_cross_call(&mask, IPI_CPU_STOP); } @@ -881,7 +898,8 @@ void smp_send_stop(void) udelay(1); if (num_online_cpus() > 1) - pr_warning("SMP: failed to stop secondary CPUs\n"); + pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", + cpumask_pr_args(cpu_online_mask)); } /* diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 6605539..b616e36 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -10,30 +10,11 @@ #include <asm/suspend.h> #include <asm/tlbflush.h> -extern int __cpu_suspend_enter(unsigned long arg, int (*fn)(unsigned long)); /* - * This is called by __cpu_suspend_enter() to save the state, and do whatever - * flushing is required to ensure that when the CPU goes to sleep we have - * the necessary data available when the caches are not searched. - * - * ptr: CPU context virtual address - * save_ptr: address of the location where the context physical address - * must be saved + * This is allocated by cpu_suspend_init(), and used to store a pointer to + * the 'struct sleep_stack_data' the contains a particular CPUs state. */ -void notrace __cpu_suspend_save(struct cpu_suspend_ctx *ptr, - phys_addr_t *save_ptr) -{ - *save_ptr = virt_to_phys(ptr); - - cpu_do_suspend(ptr); - /* - * Only flush the context that must be retrieved with the MMU - * off. VA primitives ensure the flush is applied to all - * cache levels so context is pushed to DRAM. - */ - __flush_dcache_area(ptr, sizeof(*ptr)); - __flush_dcache_area(save_ptr, sizeof(*save_ptr)); -} +unsigned long *sleep_save_stash; /* * This hook is provided so that cpu_suspend code can restore HW @@ -51,6 +32,30 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *)) hw_breakpoint_restore = hw_bp_restore; } +void notrace __cpu_suspend_exit(void) +{ + /* + * We are resuming from reset with the idmap active in TTBR0_EL1. + * We must uninstall the idmap and restore the expected MMU + * state before we can possibly return to userspace. + */ + cpu_uninstall_idmap(); + + /* + * Restore per-cpu offset before any kernel + * subsystem relying on it has a chance to run. + */ + set_my_cpu_offset(per_cpu_offset(smp_processor_id())); + + /* + * Restore HW breakpoint registers to sane values + * before debug exceptions are possibly reenabled + * through local_dbg_restore. + */ + if (hw_breakpoint_restore) + hw_breakpoint_restore(NULL); +} + /* * cpu_suspend * @@ -60,8 +65,9 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *)) */ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) { - int ret; + int ret = 0; unsigned long flags; + struct sleep_stack_data state; /* * From this point debug exceptions are disabled to prevent @@ -77,34 +83,21 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) */ pause_graph_tracing(); - /* - * mm context saved on the stack, it will be restored when - * the cpu comes out of reset through the identity mapped - * page tables, so that the thread address space is properly - * set-up on function return. - */ - ret = __cpu_suspend_enter(arg, fn); - if (ret == 0) { - /* - * We are resuming from reset with the idmap active in TTBR0_EL1. - * We must uninstall the idmap and restore the expected MMU - * state before we can possibly return to userspace. - */ - cpu_uninstall_idmap(); + if (__cpu_suspend_enter(&state)) { + /* Call the suspend finisher */ + ret = fn(arg); /* - * Restore per-cpu offset before any kernel - * subsystem relying on it has a chance to run. + * Never gets here, unless the suspend finisher fails. + * Successful cpu_suspend() should return from cpu_resume(), + * returning through this code path is considered an error + * If the return value is set to 0 force ret = -EOPNOTSUPP + * to make sure a proper error condition is propagated */ - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - - /* - * Restore HW breakpoint registers to sane values - * before debug exceptions are possibly reenabled - * through local_dbg_restore. - */ - if (hw_breakpoint_restore) - hw_breakpoint_restore(NULL); + if (!ret) + ret = -EOPNOTSUPP; + } else { + __cpu_suspend_exit(); } unpause_graph_tracing(); @@ -119,22 +112,15 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) return ret; } -struct sleep_save_sp sleep_save_sp; - static int __init cpu_suspend_init(void) { - void *ctx_ptr; - /* ctx_ptr is an array of physical addresses */ - ctx_ptr = kcalloc(mpidr_hash_size(), sizeof(phys_addr_t), GFP_KERNEL); + sleep_save_stash = kcalloc(mpidr_hash_size(), sizeof(*sleep_save_stash), + GFP_KERNEL); - if (WARN_ON(!ctx_ptr)) + if (WARN_ON(!sleep_save_stash)) return -ENOMEM; - sleep_save_sp.save_ptr_stash = ctx_ptr; - sleep_save_sp.save_ptr_stash_phys = virt_to_phys(ctx_ptr); - __flush_dcache_area(&sleep_save_sp, sizeof(struct sleep_save_sp)); - return 0; } early_initcall(cpu_suspend_init); diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c index 75151aa..26fe8ea 100644 --- a/arch/arm64/kernel/sys.c +++ b/arch/arm64/kernel/sys.c @@ -25,6 +25,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/syscalls.h> +#include <asm/cpufeature.h> asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, @@ -36,11 +37,20 @@ asmlinkage long sys_mmap(unsigned long addr, unsigned long len, return sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } +SYSCALL_DEFINE1(arm64_personality, unsigned int, personality) +{ + if (personality(personality) == PER_LINUX32 && + !system_supports_32bit_el0()) + return -EINVAL; + return sys_personality(personality); +} + /* * Wrappers to pass the pt_regs argument. */ asmlinkage long sys_rt_sigreturn_wrapper(void); #define sys_rt_sigreturn sys_rt_sigreturn_wrapper +#define sys_personality sys_arm64_personality #undef __SYSCALL #define __SYSCALL(nr, sym) [nr] = sym, diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 97bc68f..64fc030 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -131,11 +131,11 @@ static int __init vdso_init(void) return -ENOMEM; /* Grab the vDSO data page. */ - vdso_pagelist[0] = virt_to_page(vdso_data); + vdso_pagelist[0] = pfn_to_page(PHYS_PFN(__pa(vdso_data))); /* Grab the vDSO code pages. */ for (i = 0; i < vdso_pages; i++) - vdso_pagelist[i + 1] = virt_to_page(&vdso_start + i * PAGE_SIZE); + vdso_pagelist[i + 1] = pfn_to_page(PHYS_PFN(__pa(&vdso_start)) + i); /* Populate the special mapping structures */ vdso_spec[0] = (struct vm_special_mapping) { diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 5a1939a..435e820 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -46,6 +46,16 @@ jiffies = jiffies_64; *(.idmap.text) \ VMLINUX_SYMBOL(__idmap_text_end) = .; +#ifdef CONFIG_HIBERNATION +#define HIBERNATE_TEXT \ + . = ALIGN(SZ_4K); \ + VMLINUX_SYMBOL(__hibernate_exit_text_start) = .;\ + *(.hibernate_exit.text) \ + VMLINUX_SYMBOL(__hibernate_exit_text_end) = .; +#else +#define HIBERNATE_TEXT +#endif + /* * The size of the PE/COFF section that covers the kernel image, which * runs from stext to _edata, must be a round multiple of the PE/COFF @@ -63,14 +73,19 @@ PECOFF_FILE_ALIGNMENT = 0x200; #endif #if defined(CONFIG_DEBUG_ALIGN_RODATA) -#define ALIGN_DEBUG_RO . = ALIGN(1<<SECTION_SHIFT); -#define ALIGN_DEBUG_RO_MIN(min) ALIGN_DEBUG_RO -#elif defined(CONFIG_DEBUG_RODATA) -#define ALIGN_DEBUG_RO . = ALIGN(1<<PAGE_SHIFT); -#define ALIGN_DEBUG_RO_MIN(min) ALIGN_DEBUG_RO +/* + * 4 KB granule: 1 level 2 entry + * 16 KB granule: 128 level 3 entries, with contiguous bit + * 64 KB granule: 32 level 3 entries, with contiguous bit + */ +#define SEGMENT_ALIGN SZ_2M #else -#define ALIGN_DEBUG_RO -#define ALIGN_DEBUG_RO_MIN(min) . = ALIGN(min); +/* + * 4 KB granule: 16 level 3 entries, with contiguous bit + * 16 KB granule: 4 level 3 entries, without contiguous bit + * 64 KB granule: 1 level 3 entry + */ +#define SEGMENT_ALIGN SZ_64K #endif SECTIONS @@ -96,7 +111,6 @@ SECTIONS _text = .; HEAD_TEXT } - ALIGN_DEBUG_RO_MIN(PAGE_SIZE) .text : { /* Real text segment */ _stext = .; /* Text and read-only data */ __exception_text_start = .; @@ -109,18 +123,19 @@ SECTIONS LOCK_TEXT HYPERVISOR_TEXT IDMAP_TEXT + HIBERNATE_TEXT *(.fixup) *(.gnu.warning) . = ALIGN(16); *(.got) /* Global offset table */ } - ALIGN_DEBUG_RO_MIN(PAGE_SIZE) + . = ALIGN(SEGMENT_ALIGN); RO_DATA(PAGE_SIZE) /* everything from this point to */ EXCEPTION_TABLE(8) /* _etext will be marked RO NX */ NOTES - ALIGN_DEBUG_RO_MIN(PAGE_SIZE) + . = ALIGN(SEGMENT_ALIGN); _etext = .; /* End of text and rodata section */ __init_begin = .; @@ -154,12 +169,9 @@ SECTIONS *(.altinstr_replacement) } .rela : ALIGN(8) { - __reloc_start = .; *(.rela .rela*) - __reloc_end = .; } .dynsym : ALIGN(8) { - __dynsym_start = .; *(.dynsym) } .dynstr : { @@ -169,7 +181,11 @@ SECTIONS *(.hash) } - . = ALIGN(PAGE_SIZE); + __rela_offset = ADDR(.rela) - KIMAGE_VADDR; + __rela_size = SIZEOF(.rela); + __dynsym_offset = ADDR(.dynsym) - KIMAGE_VADDR; + + . = ALIGN(SEGMENT_ALIGN); __init_end = .; _data = .; @@ -201,6 +217,10 @@ ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, "HYP init code too big or misaligned") ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, "ID map text too big or misaligned") +#ifdef CONFIG_HIBERNATION +ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) + <= SZ_4K, "Hibernate exit text too big or misaligned") +#endif /* * If padding is applied before .head.text, virt<->phys conversions will fail. diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index eba89e4..3246c4a 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -186,6 +186,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, exit_handler = kvm_get_exit_handler(vcpu); return exit_handler(vcpu, run); + case ARM_EXCEPTION_HYP_GONE: + /* + * EL2 has been reset to the hyp-stub. This happens when a guest + * is pre-empted by kvm_reboot()'s shutdown call. + */ + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return 0; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 7d8747c..a873a6d 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -21,6 +21,7 @@ #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> #include <asm/pgtable-hwdef.h> +#include <asm/sysreg.h> .text .pushsection .hyp.idmap.text, "ax" @@ -103,8 +104,8 @@ __do_hyp_init: dsb sy mrs x4, sctlr_el2 - and x4, x4, #SCTLR_EL2_EE // preserve endianness of EL2 - ldr x5, =SCTLR_EL2_FLAGS + and x4, x4, #SCTLR_ELx_EE // preserve endianness of EL2 + ldr x5, =SCTLR_ELx_FLAGS orr x4, x4, x5 msr sctlr_el2, x4 isb @@ -138,6 +139,49 @@ merged: eret ENDPROC(__kvm_hyp_init) + /* + * Reset kvm back to the hyp stub. This is the trampoline dance in + * reverse. If kvm used an extended idmap, __extended_idmap_trampoline + * calls this code directly in the idmap. In this case switching to the + * boot tables is a no-op. + * + * x0: HYP boot pgd + * x1: HYP phys_idmap_start + */ +ENTRY(__kvm_hyp_reset) + /* We're in trampoline code in VA, switch back to boot page tables */ + msr ttbr0_el2, x0 + isb + + /* Ensure the PA branch doesn't find a stale tlb entry or stale code. */ + ic iallu + tlbi alle2 + dsb sy + isb + + /* Branch into PA space */ + adr x0, 1f + bfi x1, x0, #0, #PAGE_SHIFT + br x1 + + /* We're now in idmap, disable MMU */ +1: mrs x0, sctlr_el2 + ldr x1, =SCTLR_ELx_FLAGS + bic x0, x0, x1 // Clear SCTL_M and etc + msr sctlr_el2, x0 + isb + + /* Invalidate the old TLBs */ + tlbi alle2 + dsb sy + + /* Install stub vectors */ + adr_l x0, __hyp_stub_vectors + msr vbar_el2, x0 + + eret +ENDPROC(__kvm_hyp_reset) + .ltorg .popsection diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 48f19a3..7ce9315 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -35,16 +35,21 @@ * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are * passed in x0. * - * A function pointer with a value of 0 has a special meaning, and is - * used to implement __hyp_get_vectors in the same way as in + * A function pointer with a value less than 0xfff has a special meaning, + * and is used to implement __hyp_get_vectors in the same way as in * arch/arm64/kernel/hyp_stub.S. + * HVC behaves as a 'bl' call and will clobber lr. */ ENTRY(__kvm_call_hyp) -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN + str lr, [sp, #-16]! hvc #0 + ldr lr, [sp], #16 ret alternative_else b __vhe_hyp_call nop + nop + nop alternative_endif ENDPROC(__kvm_call_hyp) diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index ce9e5e5..70254a6 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -164,3 +164,22 @@ alternative_endif eret ENDPROC(__fpsimd_guest_restore) + +/* + * When using the extended idmap, we don't have a trampoline page we can use + * while we switch pages tables during __kvm_hyp_reset. Accessing the idmap + * directly would be ideal, but if we're using the extended idmap then the + * idmap is located above HYP_PAGE_OFFSET, and the address will be masked by + * kvm_call_hyp using kern_hyp_va. + * + * x0: HYP boot pgd + * x1: HYP phys_idmap_start + */ +ENTRY(__extended_idmap_trampoline) + mov x4, x1 + adr_l x3, __kvm_hyp_reset + + /* insert __kvm_hyp_reset()s offset into phys_idmap_start */ + bfi x4, x3, #0, #PAGE_SHIFT + br x4 +ENDPROC(__extended_idmap_trampoline) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 3488894..2d87f36 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -42,19 +42,17 @@ * Shuffle the parameters before calling the function * pointed to in x0. Assumes parameters in x[1,2,3]. */ - sub sp, sp, #16 - str lr, [sp] mov lr, x0 mov x0, x1 mov x1, x2 mov x2, x3 blr lr - ldr lr, [sp] - add sp, sp, #16 .endm ENTRY(__vhe_hyp_call) + str lr, [sp, #-16]! do_el2_call + ldr lr, [sp], #16 /* * We used to rely on having an exception return to get * an implicit isb. In the E2H case, we don't have it anymore. @@ -84,8 +82,8 @@ alternative_endif /* Here, we're pretty sure the host called HVC. */ restore_x0_to_x3 - /* Check for __hyp_get_vectors */ - cbnz x0, 1f + cmp x0, #HVC_GET_VECTORS + b.ne 1f mrs x0, vbar_el2 b 2f diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 9677bf0..b1ad730 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -29,7 +29,9 @@ #include <asm/cputype.h> #include <asm/ptrace.h> #include <asm/kvm_arm.h> +#include <asm/kvm_asm.h> #include <asm/kvm_coproc.h> +#include <asm/kvm_mmu.h> /* * ARMv8 Reset Values @@ -130,3 +132,31 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) /* Reset timer */ return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq); } + +extern char __hyp_idmap_text_start[]; + +unsigned long kvm_hyp_reset_entry(void) +{ + if (!__kvm_cpu_uses_extended_idmap()) { + unsigned long offset; + + /* + * Find the address of __kvm_hyp_reset() in the trampoline page. + * This is present in the running page tables, and the boot page + * tables, so we call the code here to start the trampoline + * dance in reverse. + */ + offset = (unsigned long)__kvm_hyp_reset + - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK); + + return TRAMPOLINE_VA + offset; + } else { + /* + * KVM is running with merged page tables, which don't have the + * trampoline page mapped. We know the idmap is still mapped, + * but can't be called into directly. Use + * __extended_idmap_trampoline to do the call. + */ + return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline); + } +} diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 57f57fd..54bb209 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -4,6 +4,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ context.o proc.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_ARM64_PTDUMP) += dump.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_KASAN) += kasan_init.o KASAN_SANITIZE_kasan_init.o := n diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 6df0706..50ff9ba 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -24,8 +24,6 @@ #include <asm/cpufeature.h> #include <asm/alternative.h> -#include "proc-macros.S" - /* * flush_icache_range(start,end) * diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index c90c3c5..b7b3978 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -75,8 +75,7 @@ void verify_cpu_asid_bits(void) */ pr_crit("CPU%d: smaller ASID size(%u) than boot CPU (%u)\n", smp_processor_id(), asid, asid_bits); - update_cpu_boot_status(CPU_PANIC_KERNEL); - cpu_park_loop(); + cpu_panic_kernel(); } } diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index a6e757c..fd8b942 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -804,57 +804,24 @@ struct iommu_dma_notifier_data { static LIST_HEAD(iommu_dma_masters); static DEFINE_MUTEX(iommu_dma_notifier_lock); -/* - * Temporarily "borrow" a domain feature flag to to tell if we had to resort - * to creating our own domain here, in case we need to clean it up again. - */ -#define __IOMMU_DOMAIN_FAKE_DEFAULT (1U << 31) - static bool do_iommu_attach(struct device *dev, const struct iommu_ops *ops, u64 dma_base, u64 size) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); /* - * Best case: The device is either part of a group which was - * already attached to a domain in a previous call, or it's - * been put in a default DMA domain by the IOMMU core. + * If the IOMMU driver has the DMA domain support that we require, + * then the IOMMU core will have already configured a group for this + * device, and allocated the default domain for that group. */ - if (!domain) { - /* - * Urgh. The IOMMU core isn't going to do default domains - * for non-PCI devices anyway, until it has some means of - * abstracting the entirely implementation-specific - * sideband data/SoC topology/unicorn dust that may or - * may not differentiate upstream masters. - * So until then, HORRIBLE HACKS! - */ - domain = ops->domain_alloc(IOMMU_DOMAIN_DMA); - if (!domain) - goto out_no_domain; - - domain->ops = ops; - domain->type = IOMMU_DOMAIN_DMA | __IOMMU_DOMAIN_FAKE_DEFAULT; - - if (iommu_attach_device(domain, dev)) - goto out_put_domain; + if (!domain || iommu_dma_init_domain(domain, dma_base, size)) { + pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n", + dev_name(dev)); + return false; } - if (iommu_dma_init_domain(domain, dma_base, size)) - goto out_detach; - dev->archdata.dma_ops = &iommu_dma_ops; return true; - -out_detach: - iommu_detach_device(domain, dev); -out_put_domain: - if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT) - iommu_domain_free(domain); -out_no_domain: - pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n", - dev_name(dev)); - return false; } static void queue_iommu_attach(struct device *dev, const struct iommu_ops *ops, @@ -933,6 +900,10 @@ static int __init __iommu_dma_init(void) ret = register_iommu_dma_ops_notifier(&platform_bus_type); if (!ret) ret = register_iommu_dma_ops_notifier(&amba_bustype); +#ifdef CONFIG_PCI + if (!ret) + ret = register_iommu_dma_ops_notifier(&pci_bus_type); +#endif /* handle devices queued before this arch_initcall */ if (!ret) @@ -967,11 +938,8 @@ void arch_teardown_dma_ops(struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - if (domain) { + if (WARN_ON(domain)) iommu_detach_device(domain, dev); - if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT) - iommu_domain_free(domain); - } dev->archdata.dma_ops = NULL; } diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index f9271cb..8404190 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -23,6 +23,7 @@ #include <linux/seq_file.h> #include <asm/fixmap.h> +#include <asm/kasan.h> #include <asm/memory.h> #include <asm/pgtable.h> #include <asm/pgtable-hwdef.h> @@ -32,37 +33,25 @@ struct addr_marker { const char *name; }; -enum address_markers_idx { - MODULES_START_NR = 0, - MODULES_END_NR, - VMALLOC_START_NR, - VMALLOC_END_NR, -#ifdef CONFIG_SPARSEMEM_VMEMMAP - VMEMMAP_START_NR, - VMEMMAP_END_NR, +static const struct addr_marker address_markers[] = { +#ifdef CONFIG_KASAN + { KASAN_SHADOW_START, "Kasan shadow start" }, + { KASAN_SHADOW_END, "Kasan shadow end" }, #endif - FIXADDR_START_NR, - FIXADDR_END_NR, - PCI_START_NR, - PCI_END_NR, - KERNEL_SPACE_NR, -}; - -static struct addr_marker address_markers[] = { - { MODULES_VADDR, "Modules start" }, - { MODULES_END, "Modules end" }, - { VMALLOC_START, "vmalloc() Area" }, - { VMALLOC_END, "vmalloc() End" }, + { MODULES_VADDR, "Modules start" }, + { MODULES_END, "Modules end" }, + { VMALLOC_START, "vmalloc() Area" }, + { VMALLOC_END, "vmalloc() End" }, + { FIXADDR_START, "Fixmap start" }, + { FIXADDR_TOP, "Fixmap end" }, + { PCI_IO_START, "PCI I/O start" }, + { PCI_IO_END, "PCI I/O end" }, #ifdef CONFIG_SPARSEMEM_VMEMMAP - { 0, "vmemmap start" }, - { 0, "vmemmap end" }, + { VMEMMAP_START, "vmemmap start" }, + { VMEMMAP_START + VMEMMAP_SIZE, "vmemmap end" }, #endif - { FIXADDR_START, "Fixmap start" }, - { FIXADDR_TOP, "Fixmap end" }, - { PCI_IO_START, "PCI I/O start" }, - { PCI_IO_END, "PCI I/O end" }, - { PAGE_OFFSET, "Linear Mapping" }, - { -1, NULL }, + { PAGE_OFFSET, "Linear Mapping" }, + { -1, NULL }, }; /* @@ -347,13 +336,6 @@ static int ptdump_init(void) for (j = 0; j < pg_level[i].num; j++) pg_level[i].mask |= pg_level[i].bits[j].mask; -#ifdef CONFIG_SPARSEMEM_VMEMMAP - address_markers[VMEMMAP_START_NR].start_address = - (unsigned long)virt_to_page(PAGE_OFFSET); - address_markers[VMEMMAP_END_NR].start_address = - (unsigned long)virt_to_page(high_memory); -#endif - pe = debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); return pe ? 0 : -ENOMEM; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 95df28b..5954881 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -81,6 +81,56 @@ void show_pte(struct mm_struct *mm, unsigned long addr) printk("\n"); } +#ifdef CONFIG_ARM64_HW_AFDBM +/* + * This function sets the access flags (dirty, accessed), as well as write + * permission, and only to a more permissive setting. + * + * It needs to cope with hardware update of the accessed/dirty state by other + * agents in the system and can safely skip the __sync_icache_dcache() call as, + * like set_pte_at(), the PTE is never changed from no-exec to exec here. + * + * Returns whether or not the PTE actually changed. + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + pteval_t old_pteval; + unsigned int tmp; + + if (pte_same(*ptep, entry)) + return 0; + + /* only preserve the access flags and write permission */ + pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY; + + /* + * PTE_RDONLY is cleared by default in the asm below, so set it in + * back if necessary (read-only or clean PTE). + */ + if (!pte_write(entry) || !dirty) + pte_val(entry) |= PTE_RDONLY; + + /* + * Setting the flags must be done atomically to avoid racing with the + * hardware update of the access/dirty state. + */ + asm volatile("// ptep_set_access_flags\n" + " prfm pstl1strm, %2\n" + "1: ldxr %0, %2\n" + " and %0, %0, %3 // clear PTE_RDONLY\n" + " orr %0, %0, %4 // set flags\n" + " stxr %w1, %0, %2\n" + " cbnz %w1, 1b\n" + : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) + : "L" (~PTE_RDONLY), "r" (pte_val(entry))); + + flush_tlb_fix_spurious_fault(vma, address); + return 1; +} +#endif + /* * The kernel tried to access some page that wasn't present. */ @@ -212,10 +262,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, tsk = current; mm = tsk->mm; - /* Enable interrupts if they were enabled in the parent context. */ - if (interrupts_enabled(regs)) - local_irq_enable(); - /* * If we're in an interrupt or have no user context, we must not take * the fault. @@ -555,20 +601,33 @@ asmlinkage int __exception do_debug_exception(unsigned long addr, { const struct fault_info *inf = debug_fault_info + DBG_ESR_EVT(esr); struct siginfo info; + int rv; - if (!inf->fn(addr, esr, regs)) - return 1; + /* + * Tell lockdep we disabled irqs in entry.S. Do nothing if they were + * already disabled to preserve the last enabled/disabled addresses. + */ + if (interrupts_enabled(regs)) + trace_hardirqs_off(); - pr_alert("Unhandled debug exception: %s (0x%08x) at 0x%016lx\n", - inf->name, esr, addr); + if (!inf->fn(addr, esr, regs)) { + rv = 1; + } else { + pr_alert("Unhandled debug exception: %s (0x%08x) at 0x%016lx\n", + inf->name, esr, addr); + + info.si_signo = inf->sig; + info.si_errno = 0; + info.si_code = inf->code; + info.si_addr = (void __user *)addr; + arm64_notify_die("", regs, &info, 0); + rv = 0; + } - info.si_signo = inf->sig; - info.si_errno = 0; - info.si_code = inf->code; - info.si_addr = (void __user *)addr; - arm64_notify_die("", regs, &info, 0); + if (interrupts_enabled(regs)) + trace_hardirqs_on(); - return 0; + return rv; } #ifdef CONFIG_ARM64_PAN diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index ea989d8..d45f862 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -40,6 +40,7 @@ #include <asm/kasan.h> #include <asm/kernel-pgtable.h> #include <asm/memory.h> +#include <asm/numa.h> #include <asm/sections.h> #include <asm/setup.h> #include <asm/sizes.h> @@ -86,6 +87,21 @@ static phys_addr_t __init max_zone_dma_phys(void) return min(offset + (1ULL << 32), memblock_end_of_DRAM()); } +#ifdef CONFIG_NUMA + +static void __init zone_sizes_init(unsigned long min, unsigned long max) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; + + if (IS_ENABLED(CONFIG_ZONE_DMA)) + max_zone_pfns[ZONE_DMA] = PFN_DOWN(max_zone_dma_phys()); + max_zone_pfns[ZONE_NORMAL] = max; + + free_area_init_nodes(max_zone_pfns); +} + +#else + static void __init zone_sizes_init(unsigned long min, unsigned long max) { struct memblock_region *reg; @@ -126,6 +142,8 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) free_area_init_node(0, zone_size, min, zhole_size); } +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_HAVE_ARCH_PFN_VALID int pfn_valid(unsigned long pfn) { @@ -142,10 +160,15 @@ static void __init arm64_memory_present(void) static void __init arm64_memory_present(void) { struct memblock_region *reg; + int nid = 0; - for_each_memblock(memory, reg) - memory_present(0, memblock_region_memory_base_pfn(reg), - memblock_region_memory_end_pfn(reg)); + for_each_memblock(memory, reg) { +#ifdef CONFIG_NUMA + nid = reg->nid; +#endif + memory_present(nid, memblock_region_memory_base_pfn(reg), + memblock_region_memory_end_pfn(reg)); + } } #endif @@ -190,8 +213,12 @@ void __init arm64_memblock_init(void) */ memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)), ULLONG_MAX); - if (memblock_end_of_DRAM() > linear_region_size) - memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); + if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) { + /* ensure that memstart_addr remains sufficiently aligned */ + memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size, + ARM64_MEMSTART_ALIGN); + memblock_remove(0, memstart_addr); + } /* * Apply the memory limit if it was set. Since the kernel may be loaded @@ -203,6 +230,35 @@ void __init arm64_memblock_init(void) memblock_add(__pa(_text), (u64)(_end - _text)); } + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_start) { + /* + * Add back the memory we just removed if it results in the + * initrd to become inaccessible via the linear mapping. + * Otherwise, this is a no-op + */ + u64 base = initrd_start & PAGE_MASK; + u64 size = PAGE_ALIGN(initrd_end) - base; + + /* + * We can only add back the initrd memory if we don't end up + * with more memory than we can address via the linear mapping. + * It is up to the bootloader to position the kernel and the + * initrd reasonably close to each other (i.e., within 32 GB of + * each other) so that all granule/#levels combinations can + * always access both. + */ + if (WARN(base < memblock_start_of_DRAM() || + base + size > memblock_start_of_DRAM() + + linear_region_size, + "initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) { + initrd_start = 0; + } else { + memblock_remove(base, size); /* clear MEMBLOCK_ flags */ + memblock_add(base, size); + memblock_reserve(base, size); + } + } + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { extern u16 memstart_offset_seed; u64 range = linear_region_size - @@ -245,7 +301,6 @@ void __init arm64_memblock_init(void) dma_contiguous_reserve(arm64_dma_phys_limit); memblock_allow_resize(); - memblock_dump_all(); } void __init bootmem_init(void) @@ -257,6 +312,9 @@ void __init bootmem_init(void) early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT); + max_pfn = max_low_pfn = max; + + arm64_numa_init(); /* * Sparsemem tries to allocate bootmem in memory_present(), so must be * done after the fixed reservations. @@ -267,7 +325,7 @@ void __init bootmem_init(void) zone_sizes_init(min, max); high_memory = __va((max << PAGE_SHIFT) - 1) + 1; - max_pfn = max_low_pfn = max; + memblock_dump_all(); } #ifndef CONFIG_SPARSEMEM_VMEMMAP @@ -371,26 +429,27 @@ void __init mem_init(void) MLM(MODULES_VADDR, MODULES_END)); pr_cont(" vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n", MLG(VMALLOC_START, VMALLOC_END)); - pr_cont(" .text : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .rodata : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .data : 0x%p" " - 0x%p" " (%6ld KB)\n", - MLK_ROUNDUP(_text, __start_rodata), - MLK_ROUNDUP(__start_rodata, _etext), - MLK_ROUNDUP(__init_begin, __init_end), + pr_cont(" .text : 0x%p" " - 0x%p" " (%6ld KB)\n", + MLK_ROUNDUP(_text, __start_rodata)); + pr_cont(" .rodata : 0x%p" " - 0x%p" " (%6ld KB)\n", + MLK_ROUNDUP(__start_rodata, _etext)); + pr_cont(" .init : 0x%p" " - 0x%p" " (%6ld KB)\n", + MLK_ROUNDUP(__init_begin, __init_end)); + pr_cont(" .data : 0x%p" " - 0x%p" " (%6ld KB)\n", MLK_ROUNDUP(_sdata, _edata)); -#ifdef CONFIG_SPARSEMEM_VMEMMAP - pr_cont(" vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" - " 0x%16lx - 0x%16lx (%6ld MB actual)\n", - MLG(VMEMMAP_START, - VMEMMAP_START + VMEMMAP_SIZE), - MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()), - (unsigned long)virt_to_page(high_memory))); -#endif + pr_cont(" .bss : 0x%p" " - 0x%p" " (%6ld KB)\n", + MLK_ROUNDUP(__bss_start, __bss_stop)); pr_cont(" fixed : 0x%16lx - 0x%16lx (%6ld KB)\n", MLK(FIXADDR_START, FIXADDR_TOP)); pr_cont(" PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n", MLM(PCI_IO_START, PCI_IO_END)); +#ifdef CONFIG_SPARSEMEM_VMEMMAP + pr_cont(" vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n", + MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE)); + pr_cont(" 0x%16lx - 0x%16lx (%6ld MB actual)\n", + MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()), + (unsigned long)virt_to_page(high_memory))); +#endif pr_cont(" memory : 0x%16lx - 0x%16lx (%6ld MB)\n", MLM(__phys_to_virt(memblock_start_of_DRAM()), (unsigned long)high_memory)); @@ -407,6 +466,12 @@ void __init mem_init(void) BUILD_BUG_ON(TASK_SIZE_32 > TASK_SIZE_64); #endif + /* + * Make sure we chose the upper bound of sizeof(struct page) + * correctly. + */ + BUILD_BUG_ON(sizeof(struct page) > (1 << STRUCT_PAGE_MAX_SHIFT)); + if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) { extern int sysctl_overcommit_memory; /* @@ -419,7 +484,8 @@ void __init mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_reserved_area(__va(__pa(__init_begin)), __va(__pa(__init_end)), + 0, "unused kernel"); fixup_init(); } diff --git a/arch/arm64/mm/mm.h b/arch/arm64/mm/mm.h index ef47d99..71fe9898 100644 --- a/arch/arm64/mm/mm.h +++ b/arch/arm64/mm/mm.h @@ -1,3 +1,2 @@ -extern void __init bootmem_init(void); void fixup_init(void); diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 232f787..01c1717 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -95,8 +95,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } -EXPORT_SYMBOL_GPL(arch_pick_mmap_layout); - /* * You really shouldn't be using read() or write() on /dev/mem. This might go diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f3e5c74..0f85a46 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -385,7 +385,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { - unsigned long kernel_start = __pa(_stext); + unsigned long kernel_start = __pa(_text); unsigned long kernel_end = __pa(_etext); /* @@ -417,7 +417,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end early_pgtable_alloc); /* - * Map the linear alias of the [_stext, _etext) interval as + * Map the linear alias of the [_text, _etext) interval as * read-only/non-executable. This makes the contents of the * region accessible to subsystems such as hibernate, but * protects it from inadvertent modification or execution. @@ -449,8 +449,8 @@ void mark_rodata_ro(void) { unsigned long section_size; - section_size = (unsigned long)__start_rodata - (unsigned long)_stext; - create_mapping_late(__pa(_stext), (unsigned long)_stext, + section_size = (unsigned long)__start_rodata - (unsigned long)_text; + create_mapping_late(__pa(_text), (unsigned long)_text, section_size, PAGE_KERNEL_ROX); /* * mark .rodata as read only. Use _etext rather than __end_rodata to @@ -471,8 +471,8 @@ void fixup_init(void) unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin)); } -static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, - pgprot_t prot, struct vm_struct *vma) +static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, + pgprot_t prot, struct vm_struct *vma) { phys_addr_t pa_start = __pa(va_start); unsigned long size = va_end - va_start; @@ -499,11 +499,11 @@ static void __init map_kernel(pgd_t *pgd) { static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data; - map_kernel_chunk(pgd, _stext, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text); - map_kernel_chunk(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata); - map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, - &vmlinux_init); - map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); + map_kernel_segment(pgd, _text, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_segment(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata); + map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, + &vmlinux_init); + map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) { /* @@ -564,8 +564,6 @@ void __init paging_init(void) */ memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE, SWAPPER_DIR_SIZE - PAGE_SIZE); - - bootmem_init(); } /* diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c new file mode 100644 index 0000000..98dc104 --- /dev/null +++ b/arch/arm64/mm/numa.c @@ -0,0 +1,396 @@ +/* + * NUMA support, based on the x86 implementation. + * + * Copyright (C) 2015 Cavium Inc. + * Author: Ganapatrao Kulkarni <gkulkarni@cavium.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/module.h> +#include <linux/of.h> + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); +nodemask_t numa_nodes_parsed __initdata; +static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; + +static int numa_distance_cnt; +static u8 *numa_distance; +static int numa_off; + +static __init int numa_parse_early_param(char *opt) +{ + if (!opt) + return -EINVAL; + if (!strncmp(opt, "off", 3)) { + pr_info("%s\n", "NUMA turned off"); + numa_off = 1; + } + return 0; +} +early_param("numa", numa_parse_early_param); + +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +EXPORT_SYMBOL(node_to_cpumask_map); + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +const struct cpumask *cpumask_of_node(int node) +{ + if (WARN_ON(node >= nr_node_ids)) + return cpu_none_mask; + + if (WARN_ON(node_to_cpumask_map[node] == NULL)) + return cpu_online_mask; + + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(cpumask_of_node); + +#endif + +static void map_cpu_to_node(unsigned int cpu, int nid) +{ + set_cpu_numa_node(cpu, nid); + if (nid >= 0) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void numa_clear_node(unsigned int cpu) +{ + int nid = cpu_to_node(cpu); + + if (nid >= 0) + cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]); + set_cpu_numa_node(cpu, NUMA_NO_NODE); +} + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: cpumask_of_node() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) + */ +static void __init setup_node_to_cpumask_map(void) +{ + unsigned int cpu; + int node; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) + setup_nr_node_ids(); + + /* allocate and clear the mapping */ + for (node = 0; node < nr_node_ids; node++) { + alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); + cpumask_clear(node_to_cpumask_map[node]); + } + + for_each_possible_cpu(cpu) + set_cpu_numa_node(cpu, NUMA_NO_NODE); + + /* cpumask_of_node() will now work */ + pr_debug("NUMA: Node to cpumask map for %d nodes\n", nr_node_ids); +} + +/* + * Set the cpu to node and mem mapping + */ +void numa_store_cpu_info(unsigned int cpu) +{ + map_cpu_to_node(cpu, numa_off ? 0 : cpu_to_node_map[cpu]); +} + +void __init early_map_cpu_to_node(unsigned int cpu, int nid) +{ + /* fallback to node 0 */ + if (nid < 0 || nid >= MAX_NUMNODES) + nid = 0; + + cpu_to_node_map[cpu] = nid; +} + +/** + * numa_add_memblk - Set node id to memblk + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @size: Size of the new memblk + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 size) +{ + int ret; + + ret = memblock_set_node(start, size, &memblock.memory, nid); + if (ret < 0) { + pr_err("NUMA: memblock [0x%llx - 0x%llx] failed to add on node %d\n", + start, (start + size - 1), nid); + return ret; + } + + node_set(nid, numa_nodes_parsed); + pr_info("NUMA: Adding memblock [0x%llx - 0x%llx] on node %d\n", + start, (start + size - 1), nid); + return ret; +} + +/** + * Initialize NODE_DATA for a node on the local memory + */ +static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); + u64 nd_pa; + void *nd; + int tnid; + + pr_info("NUMA: Initmem setup node %d [mem %#010Lx-%#010Lx]\n", + nid, start_pfn << PAGE_SHIFT, + (end_pfn << PAGE_SHIFT) - 1); + + nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); + nd = __va(nd_pa); + + /* report and initialize */ + pr_info("NUMA: NODE_DATA [mem %#010Lx-%#010Lx]\n", + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + pr_info("NUMA: NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = nd; + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start_pfn; + NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; +} + +/** + * numa_free_distance + * + * The current table is freed. + */ +void __init numa_free_distance(void) +{ + size_t size; + + if (!numa_distance) + return; + + size = numa_distance_cnt * numa_distance_cnt * + sizeof(numa_distance[0]); + + memblock_free(__pa(numa_distance), size); + numa_distance_cnt = 0; + numa_distance = NULL; +} + +/** + * + * Create a new NUMA distance table. + * + */ +static int __init numa_alloc_distance(void) +{ + size_t size; + u64 phys; + int i, j; + + size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]); + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn), + size, PAGE_SIZE); + if (WARN_ON(!phys)) + return -ENOMEM; + + memblock_reserve(phys, size); + + numa_distance = __va(phys); + numa_distance_cnt = nr_node_ids; + + /* fill with the default distances */ + for (i = 0; i < numa_distance_cnt; i++) + for (j = 0; j < numa_distance_cnt; j++) + numa_distance[i * numa_distance_cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + + pr_debug("NUMA: Initialized distance table, cnt=%d\n", + numa_distance_cnt); + + return 0; +} + +/** + * numa_set_distance - Set inter node NUMA distance from node to node. + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. + * If distance table doesn't exist, a warning is printed. + * + * If @from or @to is higher than the highest known node or lower than zero + * or @distance doesn't make sense, the call is ignored. + * + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance) { + pr_warn_once("NUMA: Warning: distance table not allocated yet\n"); + return; + } + + if (from >= numa_distance_cnt || to >= numa_distance_cnt || + from < 0 || to < 0) { + pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +/** + * Return NUMA distance @from to @to + */ +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +static int __init numa_register_nodes(void) +{ + int nid; + struct memblock_region *mblk; + + /* Check that valid nid is set to memblks */ + for_each_memblock(memory, mblk) + if (mblk->nid == NUMA_NO_NODE || mblk->nid >= MAX_NUMNODES) { + pr_warn("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + mblk->nid, mblk->base, + mblk->base + mblk->size - 1); + return -EINVAL; + } + + /* Finally register nodes. */ + for_each_node_mask(nid, numa_nodes_parsed) { + unsigned long start_pfn, end_pfn; + + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + setup_node_data(nid, start_pfn, end_pfn); + node_set_online(nid); + } + + /* Setup online nodes to actual nodes*/ + node_possible_map = numa_nodes_parsed; + + return 0; +} + +static int __init numa_init(int (*init_func)(void)) +{ + int ret; + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + numa_free_distance(); + + ret = numa_alloc_distance(); + if (ret < 0) + return ret; + + ret = init_func(); + if (ret < 0) + return ret; + + if (nodes_empty(numa_nodes_parsed)) + return -EINVAL; + + ret = numa_register_nodes(); + if (ret < 0) + return ret; + + setup_node_to_cpumask_map(); + + /* init boot processor */ + cpu_to_node_map[0] = 0; + map_cpu_to_node(0, 0); + + return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node (node 0) and add memory blocks that cover all + * allowed memory. It is unlikely that this function fails. + */ +static int __init dummy_numa_init(void) +{ + int ret; + struct memblock_region *mblk; + + pr_info("%s\n", "No NUMA configuration found"); + pr_info("NUMA: Faking a node at [mem %#018Lx-%#018Lx]\n", + 0LLU, PFN_PHYS(max_pfn) - 1); + + for_each_memblock(memory, mblk) { + ret = numa_add_memblk(0, mblk->base, mblk->size); + if (!ret) + continue; + + pr_err("NUMA init failed\n"); + return ret; + } + + numa_off = 1; + return 0; +} + +/** + * arm64_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds. The + * last fallback is dummy single node config encomapssing whole memory. + */ +void __init arm64_numa_init(void) +{ + if (!numa_off) { + if (!numa_init(of_numa_init)) + return; + } + + numa_init(dummy_numa_init); +} diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S deleted file mode 100644 index e6a30e1..0000000 --- a/arch/arm64/mm/proc-macros.S +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Based on arch/arm/mm/proc-macros.S - * - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> - -/* - * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) - */ - .macro vma_vm_mm, rd, rn - ldr \rd, [\rn, #VMA_VM_MM] - .endm - -/* - * mmid - get context id from mm pointer (mm->context.id) - */ - .macro mmid, rd, rn - ldr \rd, [\rn, #MM_CONTEXT_ID] - .endm - -/* - * dcache_line_size - get the minimum D-cache line size from the CTR register. - */ - .macro dcache_line_size, reg, tmp - mrs \tmp, ctr_el0 // read CTR - ubfm \tmp, \tmp, #16, #19 // cache line size encoding - mov \reg, #4 // bytes per word - lsl \reg, \reg, \tmp // actual cache line size - .endm - -/* - * icache_line_size - get the minimum I-cache line size from the CTR register. - */ - .macro icache_line_size, reg, tmp - mrs \tmp, ctr_el0 // read CTR - and \tmp, \tmp, #0xf // cache line size encoding - mov \reg, #4 // bytes per word - lsl \reg, \reg, \tmp // actual cache line size - .endm - -/* - * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map - */ - .macro tcr_set_idmap_t0sz, valreg, tmpreg -#ifndef CONFIG_ARM64_VA_BITS_48 - ldr_l \tmpreg, idmap_t0sz - bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH -#endif - .endm - -/* - * Macro to perform a data cache maintenance for the interval - * [kaddr, kaddr + size) - * - * op: operation passed to dc instruction - * domain: domain used in dsb instruciton - * kaddr: starting virtual address of the region - * size: size of the region - * Corrupts: kaddr, size, tmp1, tmp2 - */ - .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 - dcache_line_size \tmp1, \tmp2 - add \size, \kaddr, \size - sub \tmp2, \tmp1, #1 - bic \kaddr, \kaddr, \tmp2 -9998: dc \op, \kaddr - add \kaddr, \kaddr, \tmp1 - cmp \kaddr, \size - b.lo 9998b - dsb \domain - .endm - -/* - * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present - */ - .macro reset_pmuserenr_el0, tmpreg - mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer - sbfx \tmpreg, \tmpreg, #8, #4 - cmp \tmpreg, #1 // Skip if no PMU present - b.lt 9000f - msr pmuserenr_el0, xzr // Disable PMU access from EL0 -9000: - .endm diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 543f519..c431787 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -23,13 +23,11 @@ #include <asm/assembler.h> #include <asm/asm-offsets.h> #include <asm/hwcap.h> -#include <asm/pgtable-hwdef.h> #include <asm/pgtable.h> +#include <asm/pgtable-hwdef.h> #include <asm/cpufeature.h> #include <asm/alternative.h> -#include "proc-macros.S" - #ifdef CONFIG_ARM64_64K_PAGES #define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K #elif defined(CONFIG_ARM64_16K_PAGES) @@ -66,62 +64,50 @@ ENTRY(cpu_do_suspend) mrs x2, tpidr_el0 mrs x3, tpidrro_el0 mrs x4, contextidr_el1 - mrs x5, mair_el1 - mrs x6, cpacr_el1 - mrs x7, ttbr1_el1 - mrs x8, tcr_el1 - mrs x9, vbar_el1 - mrs x10, mdscr_el1 - mrs x11, oslsr_el1 - mrs x12, sctlr_el1 + mrs x5, cpacr_el1 + mrs x6, tcr_el1 + mrs x7, vbar_el1 + mrs x8, mdscr_el1 + mrs x9, oslsr_el1 + mrs x10, sctlr_el1 stp x2, x3, [x0] - stp x4, x5, [x0, #16] - stp x6, x7, [x0, #32] - stp x8, x9, [x0, #48] - stp x10, x11, [x0, #64] - str x12, [x0, #80] + stp x4, xzr, [x0, #16] + stp x5, x6, [x0, #32] + stp x7, x8, [x0, #48] + stp x9, x10, [x0, #64] ret ENDPROC(cpu_do_suspend) /** * cpu_do_resume - restore CPU register context * - * x0: Physical address of context pointer - * x1: ttbr0_el1 to be restored - * - * Returns: - * sctlr_el1 value in x0 + * x0: Address of context pointer */ ENTRY(cpu_do_resume) - /* - * Invalidate local tlb entries before turning on MMU - */ - tlbi vmalle1 ldp x2, x3, [x0] ldp x4, x5, [x0, #16] - ldp x6, x7, [x0, #32] - ldp x8, x9, [x0, #48] - ldp x10, x11, [x0, #64] - ldr x12, [x0, #80] + ldp x6, x8, [x0, #32] + ldp x9, x10, [x0, #48] + ldp x11, x12, [x0, #64] msr tpidr_el0, x2 msr tpidrro_el0, x3 msr contextidr_el1, x4 - msr mair_el1, x5 msr cpacr_el1, x6 - msr ttbr0_el1, x1 - msr ttbr1_el1, x7 - tcr_set_idmap_t0sz x8, x7 + + /* Don't change t0sz here, mask those bits when restoring */ + mrs x5, tcr_el1 + bfi x8, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH + msr tcr_el1, x8 msr vbar_el1, x9 msr mdscr_el1, x10 + msr sctlr_el1, x12 /* * Restore oslsr_el1 by writing oslar_el1 */ ubfx x11, x11, #1, #1 msr oslar_el1, x11 reset_pmuserenr_el0 x0 // Disable PMU access from EL0 - mov x0, x12 - dsb nsh // Make sure local tlb invalidation completed isb ret ENDPROC(cpu_do_resume) diff --git a/arch/nios2/include/asm/io.h b/arch/nios2/include/asm/io.h index c5a62da..ce072ba 100644 --- a/arch/nios2/include/asm/io.h +++ b/arch/nios2/include/asm/io.h @@ -50,7 +50,6 @@ static inline void iounmap(void __iomem *addr) /* Pages to physical address... */ #define page_to_phys(page) virt_to_phys(page_to_virt(page)) -#define page_to_bus(page) page_to_virt(page) /* Macros used for converting between virtual and physical mappings. */ #define phys_to_virt(vaddr) \ diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 4b32d6f..c1683f5 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -84,7 +84,7 @@ extern struct page *mem_map; ((void *)((unsigned long)(x) + PAGE_OFFSET - PHYS_OFFSET)) #define page_to_virt(page) \ - ((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET) + ((void *)(((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET) # define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) # define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && \ diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index a213e8c..298393c 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -209,7 +209,7 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - unsigned long paddr = page_to_virt(pte_page(pteval)); + unsigned long paddr = (unsigned long)page_to_virt(pte_page(pteval)); flush_dcache_range(paddr, paddr + PAGE_SIZE); set_pte(ptep, pteval); diff --git a/arch/openrisc/include/asm/page.h b/arch/openrisc/include/asm/page.h index e613d36..35bcb7c 100644 --- a/arch/openrisc/include/asm/page.h +++ b/arch/openrisc/include/asm/page.h @@ -81,8 +81,6 @@ typedef struct page *pgtable_t; #define virt_to_page(addr) \ (mem_map + (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT)) -#define page_to_virt(page) \ - ((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET) #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) |