diff options
Diffstat (limited to 'arch/powerpc/kernel')
31 files changed, 1396 insertions, 484 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index e132902..4aa7c14 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -# timers used by tracing -CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) endif obj-y := cputable.o ptrace.o syscalls.o \ @@ -40,6 +38,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o obj-$(CONFIG_VDSO32) += vdso32/ +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 709e234..6e95c2c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -100,12 +100,12 @@ int main(void) OFFSET(THREAD_NORMSAVES, thread_struct, normsave[0]); #endif OFFSET(THREAD_FPEXC_MODE, thread_struct, fpexc_mode); - OFFSET(THREAD_FPSTATE, thread_struct, fp_state); + OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr); OFFSET(THREAD_FPSAVEAREA, thread_struct, fp_save_area); OFFSET(FPSTATE_FPSCR, thread_fp_state, fpscr); OFFSET(THREAD_LOAD_FP, thread_struct, load_fp); #ifdef CONFIG_ALTIVEC - OFFSET(THREAD_VRSTATE, thread_struct, vr_state); + OFFSET(THREAD_VRSTATE, thread_struct, vr_state.vr); OFFSET(THREAD_VRSAVEAREA, thread_struct, vr_save_area); OFFSET(THREAD_VRSAVE, thread_struct, vrsave); OFFSET(THREAD_USED_VR, thread_struct, used_vr); @@ -145,9 +145,9 @@ int main(void) OFFSET(THREAD_TM_PPR, thread_struct, tm_ppr); OFFSET(THREAD_TM_DSCR, thread_struct, tm_dscr); OFFSET(PT_CKPT_REGS, thread_struct, ckpt_regs); - OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state); + OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state.vr); OFFSET(THREAD_CKVRSAVE, thread_struct, ckvrsave); - OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state); + OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state.fpr); /* Local pt_regs on stack for Transactional Memory funcs. */ DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); @@ -485,6 +485,7 @@ int main(void) OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls); OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v); OFFSET(KVM_RADIX, kvm, arch.radix); + OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled); OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr); OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar); OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); @@ -513,6 +514,7 @@ int main(void) OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); + OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request); OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc); OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc); @@ -542,6 +544,7 @@ int main(void) OFFSET(VCPU_WORT, kvm_vcpu, arch.wort); OFFSET(VCPU_TID, kvm_vcpu, arch.tid); OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr); + OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr); OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map); OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest); OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads); @@ -742,9 +745,11 @@ int main(void) OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); + OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); + DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE); #ifdef CONFIG_PPC_8xx DEFINE(VIRT_IMMR_BASE, (u64)__fix_to_virt(FIX_IMMR_BASE)); diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 10cb289..610955f 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -218,13 +218,20 @@ __init_tlb_power8: ptesync 1: blr +/* + * Flush the TLB in hash mode. Hash must flush with RIC=2 once for process + * and one for partition scope to clear process and partition table entries. + */ __init_tlb_power9: - li r6,POWER9_TLB_SETS_HASH + li r6,POWER9_TLB_SETS_HASH - 1 mtctr r6 li r7,0xc00 /* IS field = 0b11 */ + li r8,0 ptesync -2: tlbiel r7 - addi r7,r7,0x1000 + PPC_TLBIEL(7, 8, 2, 1, 0) + PPC_TLBIEL(7, 8, 2, 0, 0) +2: addi r7,r7,0x1000 + PPC_TLBIEL(7, 8, 0, 0, 0) bdnz 2b ptesync 1: blr diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index fb7cbaa..8f7abf9 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -105,6 +105,11 @@ static u64 dma_iommu_get_required_mask(struct device *dev) return mask; } +int dma_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr == IOMMU_MAPPING_ERROR; +} + struct dma_map_ops dma_iommu_ops = { .alloc = dma_iommu_alloc_coherent, .free = dma_iommu_free_coherent, @@ -115,5 +120,6 @@ struct dma_map_ops dma_iommu_ops = { .map_page = dma_iommu_map_page, .unmap_page = dma_iommu_unmap_page, .get_required_mask = dma_iommu_get_required_mask, + .mapping_error = dma_iommu_mapping_error, }; EXPORT_SYMBOL(dma_iommu_ops); diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 41c7495..4194bbb 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -314,18 +314,6 @@ EXPORT_SYMBOL(dma_set_coherent_mask); #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) -int __dma_set_mask(struct device *dev, u64 dma_mask) -{ - const struct dma_map_ops *dma_ops = get_dma_ops(dev); - - if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL)) - return dma_ops->set_dma_mask(dev, dma_mask); - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - *dev->dma_mask = dma_mask; - return 0; -} - int dma_set_mask(struct device *dev, u64 dma_mask) { if (ppc_md.dma_set_mask) @@ -338,7 +326,10 @@ int dma_set_mask(struct device *dev, u64 dma_mask) return phb->controller_ops.dma_set_mask(pdev, dma_mask); } - return __dma_set_mask(dev, dma_mask); + if (!dev->dma_mask || !dma_supported(dev, dma_mask)) + return -EIO; + *dev->dma_mask = dma_mask; + return 0; } EXPORT_SYMBOL(dma_set_mask); diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 4c7656d..1df770e 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -94,9 +94,6 @@ static void (*init_pmu_registers)(void); static void cpufeatures_flush_tlb(void) { - unsigned long rb; - unsigned int i, num_sets; - /* * This is a temporary measure to keep equivalent TLB flush as the * cputable based setup code. @@ -105,24 +102,15 @@ static void cpufeatures_flush_tlb(void) case PVR_POWER8: case PVR_POWER8E: case PVR_POWER8NVL: - num_sets = POWER8_TLB_SETS; + __flush_tlb_power8(POWER8_TLB_SETS); break; case PVR_POWER9: - num_sets = POWER9_TLB_SETS_HASH; + __flush_tlb_power9(POWER9_TLB_SETS_HASH); break; default: - num_sets = 1; pr_err("unknown CPU version for boot TLB flush\n"); break; } - - asm volatile("ptesync" : : : "memory"); - rb = TLBIEL_INVAL_SET; - for (i = 0; i < num_sets; i++) { - asm volatile("tlbiel %0" : : "r" (rb)); - rb += 1 << TLBIEL_INVAL_SET_SHIFT; - } - asm volatile("ptesync" : : : "memory"); } static void __restore_cpu_cpufeatures(void) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index bfbad08..49d8422 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -57,7 +57,7 @@ system_call_common: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */ - bne tabort_syscall + bne .Ltabort_syscall END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif andi. r10,r12,MSR_PR @@ -143,6 +143,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) mtmsrd r11,1 #endif /* CONFIG_PPC_BOOK3E */ +system_call: /* label this so stack traces look sane */ /* We do need to set SOFTE in the stack frame or the return * from interrupt will be painful */ @@ -152,11 +153,11 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) CURRENT_THREAD_INFO(r11, r1) ld r10,TI_FLAGS(r11) andi. r11,r10,_TIF_SYSCALL_DOTRACE - bne syscall_dotrace /* does not return */ + bne .Lsyscall_dotrace /* does not return */ cmpldi 0,r0,NR_syscalls - bge- syscall_enosys + bge- .Lsyscall_enosys -system_call: /* label this so stack traces look sane */ +.Lsyscall: /* * Need to vector to 32 Bit or default sys_call_table here, * based on caller's run-mode / personality. @@ -185,8 +186,20 @@ system_call: /* label this so stack traces look sane */ #ifdef CONFIG_PPC_BOOK3S /* No MSR:RI on BookE */ andi. r10,r8,MSR_RI - beq- unrecov_restore + beq- .Lunrecov_restore #endif + +/* + * This is a few instructions into the actual syscall exit path (which actually + * starts at .Lsyscall_exit) to cater to kprobe blacklisting and to reduce the + * number of visible symbols for profiling purposes. + * + * We can probe from system_call until this point as MSR_RI is set. But once it + * is cleared below, we won't be able to take a trap. + * + * This is blacklisted from kprobes further below with _ASM_NOKPROBE_SYMBOL(). + */ +system_call_exit: /* * Disable interrupts so current_thread_info()->flags can't change, * and so that we don't get interrupted after loading SRR0/1. @@ -208,31 +221,21 @@ system_call: /* label this so stack traces look sane */ ld r9,TI_FLAGS(r12) li r11,-MAX_ERRNO andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) - bne- syscall_exit_work + bne- .Lsyscall_exit_work - andi. r0,r8,MSR_FP - beq 2f + /* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */ + li r7,MSR_FP #ifdef CONFIG_ALTIVEC - andis. r0,r8,MSR_VEC@h - bne 3f -#endif -2: addi r3,r1,STACK_FRAME_OVERHEAD -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif - bl restore_math -#ifdef CONFIG_PPC_BOOK3S - li r11,0 - mtmsrd r11,1 + oris r7,r7,MSR_VEC@h #endif - ld r8,_MSR(r1) - ld r3,RESULT(r1) - li r11,-MAX_ERRNO + and r0,r8,r7 + cmpd r0,r7 + bne .Lsyscall_restore_math +.Lsyscall_restore_math_cont: -3: cmpld r3,r11 + cmpld r3,r11 ld r5,_CCR(r1) - bge- syscall_error + bge- .Lsyscall_error .Lsyscall_error_cont: ld r7,_NIP(r1) BEGIN_FTR_SECTION @@ -258,14 +261,48 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) RFI b . /* prevent speculative execution */ -syscall_error: +.Lsyscall_error: oris r5,r5,0x1000 /* Set SO bit in CR */ neg r3,r3 std r5,_CCR(r1) b .Lsyscall_error_cont - + +.Lsyscall_restore_math: + /* + * Some initial tests from restore_math to avoid the heavyweight + * C code entry and MSR manipulations. + */ + LOAD_REG_IMMEDIATE(r0, MSR_TS_MASK) + and. r0,r0,r8 + bne 1f + + ld r7,PACACURRENT(r13) + lbz r0,THREAD+THREAD_LOAD_FP(r7) +#ifdef CONFIG_ALTIVEC + lbz r6,THREAD+THREAD_LOAD_VEC(r7) + add r0,r0,r6 +#endif + cmpdi r0,0 + beq .Lsyscall_restore_math_cont + +1: addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_PPC_BOOK3S + li r10,MSR_RI + mtmsrd r10,1 /* Restore RI */ +#endif + bl restore_math +#ifdef CONFIG_PPC_BOOK3S + li r11,0 + mtmsrd r11,1 +#endif + /* Restore volatiles, reload MSR from updated one */ + ld r8,_MSR(r1) + ld r3,RESULT(r1) + li r11,-MAX_ERRNO + b .Lsyscall_restore_math_cont + /* Traced system call support */ -syscall_dotrace: +.Lsyscall_dotrace: bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl do_syscall_trace_enter @@ -286,23 +323,23 @@ syscall_dotrace: ld r7,GPR7(r1) ld r8,GPR8(r1) - /* Repopulate r9 and r10 for the system_call path */ + /* Repopulate r9 and r10 for the syscall path */ addi r9,r1,STACK_FRAME_OVERHEAD CURRENT_THREAD_INFO(r10, r1) ld r10,TI_FLAGS(r10) cmpldi r0,NR_syscalls - blt+ system_call + blt+ .Lsyscall /* Return code is already in r3 thanks to do_syscall_trace_enter() */ b .Lsyscall_exit -syscall_enosys: +.Lsyscall_enosys: li r3,-ENOSYS b .Lsyscall_exit -syscall_exit_work: +.Lsyscall_exit_work: #ifdef CONFIG_PPC_BOOK3S li r10,MSR_RI mtmsrd r10,1 /* Restore RI */ @@ -362,7 +399,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b ret_from_except #ifdef CONFIG_PPC_TRANSACTIONAL_MEM -tabort_syscall: +.Ltabort_syscall: /* Firstly we need to enable TM in the kernel */ mfmsr r10 li r9, 1 @@ -388,6 +425,8 @@ tabort_syscall: rfid b . /* prevent speculative execution */ #endif +_ASM_NOKPROBE_SYMBOL(system_call_common); +_ASM_NOKPROBE_SYMBOL(system_call_exit); /* Save non-volatile GPRs, if not already saved. */ _GLOBAL(save_nvgprs) @@ -398,6 +437,7 @@ _GLOBAL(save_nvgprs) clrrdi r0,r11,1 std r0,_TRAP(r1) blr +_ASM_NOKPROBE_SYMBOL(save_nvgprs); /* @@ -488,33 +528,30 @@ _GLOBAL(_switch) std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ -#ifdef CONFIG_SMP - /* We need a sync somewhere here to make sure that if the - * previous task gets rescheduled on another CPU, it sees all - * stores it has performed on this one. + /* + * On SMP kernels, care must be taken because a task may be + * scheduled off CPUx and on to CPUy. Memory ordering must be + * considered. + * + * Cacheable stores on CPUx will be visible when the task is + * scheduled on CPUy by virtue of the core scheduler barriers + * (see "Notes on Program-Order guarantees on SMP systems." in + * kernel/sched/core.c). + * + * Uncacheable stores in the case of involuntary preemption must + * be taken care of. The smp_mb__before_spin_lock() in __schedule() + * is implemented as hwsync on powerpc, which orders MMIO too. So + * long as there is an hwsync in the context switch path, it will + * be executed on the source CPU after the task has performed + * all MMIO ops on that CPU, and on the destination CPU before the + * task performs any MMIO ops there. */ - sync -#endif /* CONFIG_SMP */ /* - * If we optimise away the clear of the reservation in system - * calls because we know the CPU tracks the address of the - * reservation, then we need to clear it here to cover the - * case that the kernel context switch path has no larx - * instructions. + * The kernel context switch path must contain a spin_lock, + * which contains larx/stcx, which will clear any reservation + * of the task being switched. */ -BEGIN_FTR_SECTION - ldarx r6,0,r1 -END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) - -BEGIN_FTR_SECTION -/* - * A cp_abort (copy paste abort) here ensures that when context switching, a - * copy from one process can't leak into the paste of another. - */ - PPC_CP_ABORT -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - #ifdef CONFIG_PPC_BOOK3S /* Cancel all explict user streams as they will have no use after context * switch and will stop the HW from creating streams itself @@ -583,6 +620,14 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) top of the kernel stack. */ addi r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE + /* + * PMU interrupts in radix may come in here. They will use r1, not + * PACAKSAVE, so this stack switch will not cause a problem. They + * will store to the process stack, which may then be migrated to + * another CPU. However the rq lock release on this CPU paired with + * the rq lock acquire on the new CPU before the stack becomes + * active on the new CPU, will order those stores. + */ mr r1,r8 /* start using new stack pointer */ std r7,PACAKSAVE(r13) @@ -763,11 +808,11 @@ restore: ld r5,SOFTE(r1) lbz r6,PACASOFTIRQEN(r13) cmpwi cr0,r5,0 - beq restore_irq_off + beq .Lrestore_irq_off /* We are enabling, were we already enabled ? Yes, just return */ cmpwi cr0,r6,1 - beq cr0,do_restore + beq cr0,.Ldo_restore /* * We are about to soft-enable interrupts (we are hard disabled @@ -776,14 +821,14 @@ restore: */ lbz r0,PACAIRQHAPPENED(r13) cmpwi cr0,r0,0 - bne- restore_check_irq_replay + bne- .Lrestore_check_irq_replay /* * Get here when nothing happened while soft-disabled, just * soft-enable and move-on. We will hard-enable as a side * effect of rfi */ -restore_no_replay: +.Lrestore_no_replay: TRACE_ENABLE_INTS li r0,1 stb r0,PACASOFTIRQEN(r13); @@ -791,7 +836,7 @@ restore_no_replay: /* * Final return path. BookE is handled in a different file */ -do_restore: +.Ldo_restore: #ifdef CONFIG_PPC_BOOK3E b exception_return_book3e #else @@ -825,7 +870,7 @@ fast_exception_return: REST_8GPRS(5, r1) andi. r0,r3,MSR_RI - beq- unrecov_restore + beq- .Lunrecov_restore /* Load PPR from thread struct before we clear MSR:RI */ BEGIN_FTR_SECTION @@ -883,7 +928,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * make sure that in this case, we also clear PACA_IRQ_HARD_DIS * or that bit can get out of sync and bad things will happen */ -restore_irq_off: +.Lrestore_irq_off: ld r3,_MSR(r1) lbz r7,PACAIRQHAPPENED(r13) andi. r0,r3,MSR_EE @@ -893,13 +938,13 @@ restore_irq_off: 1: li r0,0 stb r0,PACASOFTIRQEN(r13); TRACE_DISABLE_INTS - b do_restore + b .Ldo_restore /* * Something did happen, check if a re-emit is needed * (this also clears paca->irq_happened) */ -restore_check_irq_replay: +.Lrestore_check_irq_replay: /* XXX: We could implement a fast path here where we check * for irq_happened being just 0x01, in which case we can * clear it and return. That means that we would potentially @@ -909,7 +954,7 @@ restore_check_irq_replay: */ bl __check_irq_replay cmpwi cr0,r3,0 - beq restore_no_replay + beq .Lrestore_no_replay /* * We need to re-emit an interrupt. We do so by re-using our @@ -958,10 +1003,18 @@ restore_check_irq_replay: #endif /* CONFIG_PPC_DOORBELL */ 1: b ret_from_except /* What else to do here ? */ -unrecov_restore: +.Lunrecov_restore: addi r3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception - b unrecov_restore + b .Lunrecov_restore + +_ASM_NOKPROBE_SYMBOL(ret_from_except); +_ASM_NOKPROBE_SYMBOL(ret_from_except_lite); +_ASM_NOKPROBE_SYMBOL(resume_kernel); +_ASM_NOKPROBE_SYMBOL(fast_exc_return_irq); +_ASM_NOKPROBE_SYMBOL(restore); +_ASM_NOKPROBE_SYMBOL(fast_exception_return); + #ifdef CONFIG_PPC_RTAS /* @@ -1038,6 +1091,8 @@ _GLOBAL(enter_rtas) rldicr r9,r9,MSR_SF_LG,(63-MSR_SF_LG) ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE andc r6,r0,r9 + +__enter_rtas: sync /* disable interrupts so SRR0/1 */ mtmsrd r0 /* don't get trashed */ @@ -1074,6 +1129,8 @@ rtas_return_loc: mtspr SPRN_SRR1,r4 rfid b . /* prevent speculative execution */ +_ASM_NOKPROBE_SYMBOL(__enter_rtas) +_ASM_NOKPROBE_SYMBOL(rtas_return_loc) .align 3 1: .llong rtas_restore_regs diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b886795..9029afd 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -99,7 +99,11 @@ EXC_VIRT_NONE(0x4000, 0x100) #ifdef CONFIG_PPC_P7_NAP /* * If running native on arch 2.06 or later, check if we are waking up - * from nap/sleep/winkle, and branch to idle handler. + * from nap/sleep/winkle, and branch to idle handler. This tests SRR1 + * bits 46:47. A non-0 value indicates that we are coming from a power + * saving state. The idle wakeup handler initially runs in real mode, + * but we branch to the 0xc000... address so we can turn on relocation + * with mtmsr. */ #define IDLETEST(n) \ BEGIN_FTR_SECTION ; \ @@ -107,7 +111,7 @@ EXC_VIRT_NONE(0x4000, 0x100) rlwinm. r10,r10,47-31,30,31 ; \ beq- 1f ; \ cmpwi cr3,r10,2 ; \ - BRANCH_TO_COMMON(r10, system_reset_idle_common) ; \ + BRANCH_TO_C000(r10, system_reset_idle_common) ; \ 1: \ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #else @@ -128,6 +132,7 @@ EXC_VIRT_NONE(0x4100, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) + mfspr r12,SPRN_SRR1 b pnv_powersave_wakeup #endif @@ -507,46 +512,22 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_DAR - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crset 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - /* - * We can't just use a direct branch to slb_miss_realmode - * because the distance from here to there depends on where - * the kernel ends up being put. - */ - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_DAR - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crset 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - /* - * We can't just use a direct branch to slb_miss_realmode - * because the distance from here to there depends on where - * the kernel ends up being put. - */ - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) @@ -575,88 +556,82 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crclr 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crclr 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) TRAMP_KVM(PACA_EXSLB, 0x480) -/* This handler is used by both 0x380 and 0x480 slb miss interrupts */ -EXC_COMMON_BEGIN(slb_miss_realmode) +/* + * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as + * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled. + */ +EXC_COMMON_BEGIN(slb_miss_common) /* * r13 points to the PACA, r9 contains the saved CR, - * r12 contain the saved SRR1, SRR0 is still ready for return + * r12 contains the saved r3, + * r11 contain the saved SRR1, SRR0 is still ready for return * r3 has the faulting address * r9 - r13 are saved in paca->exslb. - * r3 is saved in paca->slb_r3 * cr6.eq is set for a D-SLB miss, clear for a I-SLB miss * We assume we aren't going to take any exceptions during this * procedure. */ mflr r10 -#ifdef CONFIG_RELOCATABLE - mtctr r11 -#endif - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - std r3,PACA_EXSLB+EX_DAR(r13) + + /* + * Test MSR_RI before calling slb_allocate_realmode, because the + * MSR in r11 gets clobbered. However we still want to allocate + * SLB in case MSR_RI=0, to minimise the risk of getting stuck in + * recursive SLB faults. So use cr5 for this, which is preserved. + */ + andi. r11,r11,MSR_RI /* check for unrecoverable exception */ + cmpdi cr5,r11,MSR_RI crset 4*cr0+eq #ifdef CONFIG_PPC_STD_MMU_64 BEGIN_MMU_FTR_SECTION - bl slb_allocate_realmode + bl slb_allocate END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) #endif ld r10,PACA_EXSLB+EX_LR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ mtlr r10 - beq 8f /* if bad address, make full stack frame */ + beq- 8f /* if bad address, make full stack frame */ - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ - beq- 2f + bne- cr5,2f /* if unrecoverable exception, oops */ /* All done -- return from exception. */ .machine push .machine "power4" mtcrf 0x80,r9 + mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ mtcrf 0x02,r9 /* I/D indication is in cr6 */ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ .machine pop + RESTORE_CTR(r9, PACA_EXSLB) RESTORE_PPR_PACA(PACA_EXSLB, r9) + mr r3,r12 ld r9,PACA_EXSLB+EX_R9(r13) ld r10,PACA_EXSLB+EX_R10(r13) ld r11,PACA_EXSLB+EX_R11(r13) @@ -665,7 +640,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) rfid b . /* prevent speculative execution */ -2: mfspr r11,SPRN_SRR0 +2: std r3,PACA_EXSLB+EX_DAR(r13) + mr r3,r12 + mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 LOAD_HANDLER(r10,unrecov_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) @@ -673,7 +651,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) rfid b . -8: mfspr r11,SPRN_SRR0 +8: std r3,PACA_EXSLB+EX_DAR(r13) + mr r3,r12 + mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 LOAD_HANDLER(r10,bad_addr_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) @@ -821,46 +802,81 @@ EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00) TRAMP_KVM(PACA_EXGEN, 0xb00) EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +/* + * system call / hypercall (0xc00, 0x4c00) + * + * The system call exception is invoked with "sc 0" and does not alter HV bit. + * There is support for kernel code to invoke system calls but there are no + * in-tree users. + * + * The hypercall is invoked with "sc 1" and sets HV=1. + * + * In HPT, sc 1 always goes to 0xc00 real mode. In RADIX, sc 1 can go to + * 0x4c00 virtual mode. + * + * Call convention: + * + * syscall register convention is in Documentation/powerpc/syscall64-abi.txt + * + * For hypercalls, the register convention is as follows: + * r0 volatile + * r1-2 nonvolatile + * r3 volatile parameter and return value for status + * r4-r10 volatile input and output value + * r11 volatile hypercall number and output value + * r12 volatile input and output value + * r13-r31 nonvolatile + * LR nonvolatile + * CTR volatile + * XER volatile + * CR0-1 CR5-7 volatile + * CR2-4 nonvolatile + * Other registers nonvolatile + * + * The intersection of volatile registers that don't contain possible + * inputs is: cr0, xer, ctr. We may use these as scratch regs upon entry + * without saving, though xer is not a good idea to use, as hardware may + * interpret some bits so it may be costly to change them. + */ #ifdef CONFIG_KVM_BOOK3S_64_HANDLER - /* - * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems - * that support it) before changing to HMT_MEDIUM. That allows the KVM - * code to save that value into the guest state (it is the guest's PPR - * value). Otherwise just change to HMT_MEDIUM as userspace has - * already saved the PPR. - */ + /* + * There is a little bit of juggling to get syscall and hcall + * working well. Save r13 in ctr to avoid using SPRG scratch + * register. + * + * Userspace syscalls have already saved the PPR, hcalls must save + * it before setting HMT_MEDIUM. + */ #define SYSCALL_KVMTEST \ - SET_SCRATCH0(r13); \ + mtctr r13; \ GET_PACA(r13); \ - std r9,PACA_EXGEN+EX_R9(r13); \ - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \ - HMT_MEDIUM; \ std r10,PACA_EXGEN+EX_R10(r13); \ - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); \ - mfcr r9; \ - KVMTEST_PR(0xc00); \ - GET_SCRATCH0(r13) + KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \ + HMT_MEDIUM; \ + mfctr r9; #else #define SYSCALL_KVMTEST \ - HMT_MEDIUM + HMT_MEDIUM; \ + mr r9,r13; \ + GET_PACA(r13); #endif #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) -/* Syscall routine is used twice, in reloc-off and reloc-on paths */ -#define SYSCALL_PSERIES_1 \ +#define SYSCALL_FASTENDIAN_TEST \ BEGIN_FTR_SECTION \ cmpdi r0,0x1ebe ; \ beq- 1f ; \ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ - mr r9,r13 ; \ - GET_PACA(r13) ; \ - mfspr r11,SPRN_SRR0 ; \ -0: -#define SYSCALL_PSERIES_2_RFID \ +/* + * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9, + * and HMT_MEDIUM. + */ +#define SYSCALL_REAL \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ LOAD_SYSCALL_HANDLER(r10) ; \ mtspr SPRN_SRR0,r10 ; \ @@ -869,11 +885,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ rfid ; \ b . ; /* prevent speculative execution */ -#define SYSCALL_PSERIES_3 \ +#define SYSCALL_FASTENDIAN \ /* Fast LE/BE switch system call */ \ 1: mfspr r12,SPRN_SRR1 ; \ xori r12,r12,MSR_LE ; \ mtspr SPRN_SRR1,r12 ; \ + mr r13,r9 ; \ rfid ; /* return to userspace */ \ b . ; /* prevent speculative execution */ @@ -882,16 +899,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ * We can't branch directly so we do it via the CTR which * is volatile across system calls. */ -#define SYSCALL_PSERIES_2_DIRECT \ - LOAD_SYSCALL_HANDLER(r12) ; \ - mtctr r12 ; \ +#define SYSCALL_VIRT \ + LOAD_SYSCALL_HANDLER(r10) ; \ + mtctr r10 ; \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ li r10,MSR_RI ; \ mtmsrd r10,1 ; \ bctr ; #else /* We can branch directly */ -#define SYSCALL_PSERIES_2_DIRECT \ +#define SYSCALL_VIRT \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ li r10,MSR_RI ; \ mtmsrd r10,1 ; /* Set RI (EE=0) */ \ @@ -899,20 +918,42 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ #endif EXC_REAL_BEGIN(system_call, 0xc00, 0x100) - SYSCALL_KVMTEST - SYSCALL_PSERIES_1 - SYSCALL_PSERIES_2_RFID - SYSCALL_PSERIES_3 + SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */ + SYSCALL_FASTENDIAN_TEST + SYSCALL_REAL + SYSCALL_FASTENDIAN EXC_REAL_END(system_call, 0xc00, 0x100) EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100) - SYSCALL_KVMTEST - SYSCALL_PSERIES_1 - SYSCALL_PSERIES_2_DIRECT - SYSCALL_PSERIES_3 + SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */ + SYSCALL_FASTENDIAN_TEST + SYSCALL_VIRT + SYSCALL_FASTENDIAN EXC_VIRT_END(system_call, 0x4c00, 0x100) -TRAMP_KVM(PACA_EXGEN, 0xc00) +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + /* + * This is a hcall, so register convention is as above, with these + * differences: + * r13 = PACA + * ctr = orig r13 + * orig r10 saved in PACA + */ +TRAMP_KVM_BEGIN(do_kvm_0xc00) + /* + * Save the PPR (on systems that support it) before changing to + * HMT_MEDIUM. That allows the KVM code to save that value into the + * guest state (it is the guest's PPR value). + */ + OPT_GET_SPR(r10, SPRN_PPR, CPU_FTR_HAS_PPR) + HMT_MEDIUM + OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r10, CPU_FTR_HAS_PPR) + mfctr r10 + SET_SCRATCH0(r10) + std r9,PACA_EXGEN+EX_R9(r13) + mfcr r9 + KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00) +#endif EXC_REAL(single_step, 0xd00, 0x100) @@ -1273,6 +1314,31 @@ EXC_REAL_NONE(0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) #endif +#if defined(CONFIG_HARDLOCKUP_DETECTOR) && defined(CONFIG_HAVE_HARDLOCKUP_DETECTOR_ARCH) + +#define MASKED_DEC_HANDLER_LABEL 3f + +#define MASKED_DEC_HANDLER(_H) \ +3: /* soft-nmi */ \ + std r12,PACA_EXGEN+EX_R12(r13); \ + GET_SCRATCH0(r10); \ + std r10,PACA_EXGEN+EX_R13(r13); \ + EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H) + +EXC_COMMON_BEGIN(soft_nmi_common) + mr r10,r1 + ld r1,PACAEMERGSP(r13) + ld r1,PACA_NMI_EMERG_SP(r13) + subi r1,r1,INT_FRAME_SIZE + EXCEPTION_COMMON_NORET_STACK(PACA_EXGEN, 0x900, + system_reset, soft_nmi_interrupt, + ADD_NVGPRS;ADD_RECONCILE) + b ret_from_except + +#else +#define MASKED_DEC_HANDLER_LABEL 2f /* normal return */ +#define MASKED_DEC_HANDLER(_H) +#endif /* * An interrupt came in while soft-disabled. We set paca->irq_happened, then: @@ -1295,7 +1361,7 @@ masked_##_H##interrupt: \ lis r10,0x7fff; \ ori r10,r10,0xffff; \ mtspr SPRN_DEC,r10; \ - b 2f; \ + b MASKED_DEC_HANDLER_LABEL; \ 1: cmpwi r10,PACA_IRQ_DBELL; \ beq 2f; \ cmpwi r10,PACA_IRQ_HMI; \ @@ -1310,7 +1376,8 @@ masked_##_H##interrupt: \ ld r11,PACA_EXGEN+EX_R11(r13); \ GET_SCRATCH0(r13); \ ##_H##rfid; \ - b . + b .; \ + MASKED_DEC_HANDLER(_H) /* * Real mode exceptions actually use this too, but alternate @@ -1553,6 +1620,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) 1: addi r3,r1,STACK_FRAME_OVERHEAD bl kernel_bad_stack b 1b +_ASM_NOKPROBE_SYMBOL(bad_stack); + +/* + * When doorbell is triggered from system reset wakeup, the message is + * not cleared, so it would fire again when EE is enabled. + * + * When coming from local_irq_enable, there may be the same problem if + * we were hard disabled. + * + * Execute msgclr to clear pending exceptions before handling it. + */ +h_doorbell_common_msgclr: + LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) + PPC_MSGCLR(3) + b h_doorbell_common + +doorbell_super_common_msgclr: + LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) + PPC_MSGCLRP(3) + b doorbell_super_common /* * Called from arch_local_irq_enable when an interrupt needs @@ -1563,6 +1650,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) * Note: While MSR:EE is off, we need to make sure that _MSR * in the generated frame has EE set to 1 or the exception * handler will not properly re-enable them. + * + * Note that we don't specify LR as the NIP (return address) for + * the interrupt because that would unbalance the return branch + * predictor. */ _GLOBAL(__replay_interrupt) /* We are going to jump to the exception common code which @@ -1570,7 +1661,7 @@ _GLOBAL(__replay_interrupt) * we don't give a damn about, so we don't bother storing them. */ mfmsr r12 - mflr r11 + LOAD_REG_ADDR(r11, 1f) mfcr r9 ori r12,r12,MSR_EE cmpwi r3,0x900 @@ -1579,13 +1670,16 @@ _GLOBAL(__replay_interrupt) beq hardware_interrupt_common BEGIN_FTR_SECTION cmpwi r3,0xe80 - beq h_doorbell_common + beq h_doorbell_common_msgclr cmpwi r3,0xea0 beq h_virt_irq_common cmpwi r3,0xe60 beq hmi_exception_common FTR_SECTION_ELSE cmpwi r3,0xa00 - beq doorbell_super_common + beq doorbell_super_common_msgclr ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) +1: blr + +_ASM_NOKPROBE_SYMBOL(__replay_interrupt) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 466569e..dc0c49c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -113,11 +113,55 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, return 1; } +/* + * If fadump is registered, check if the memory provided + * falls within boot memory area. + */ +int is_fadump_boot_memory_area(u64 addr, ulong size) +{ + if (!fw_dump.dump_registered) + return 0; + + return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; +} + int is_fadump_active(void) { return fw_dump.dump_active; } +/* + * Returns 1, if there are no holes in boot memory area, + * 0 otherwise. + */ +static int is_boot_memory_area_contiguous(void) +{ + struct memblock_region *reg; + unsigned long tstart, tend; + unsigned long start_pfn = PHYS_PFN(RMA_START); + unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size); + unsigned int ret = 0; + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart < tend) { + /* Memory hole from start_pfn to tstart */ + if (tstart > start_pfn) + break; + + if (tend == end_pfn) { + ret = 1; + break; + } + + start_pfn = tend + 1; + } + } + + return ret; +} + /* Print firmware assisted dump configurations for debugging purpose. */ static void fadump_show_config(void) { @@ -212,20 +256,46 @@ static inline unsigned long fadump_calculate_reserve_size(void) int ret; unsigned long long base, size; + if (fw_dump.reserve_bootvar) + pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n"); + /* * Check if the size is specified through crashkernel= cmdline - * option. If yes, then use that but ignore base as fadump - * reserves memory at end of RAM. + * option. If yes, then use that but ignore base as fadump reserves + * memory at a predefined offset. */ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &size, &base); if (ret == 0 && size > 0) { + unsigned long max_size; + + if (fw_dump.reserve_bootvar) + pr_info("Using 'crashkernel=' parameter for memory reservation.\n"); + fw_dump.reserve_bootvar = (unsigned long)size; + + /* + * Adjust if the boot memory size specified is above + * the upper limit. + */ + max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO; + if (fw_dump.reserve_bootvar > max_size) { + fw_dump.reserve_bootvar = max_size; + pr_info("Adjusted boot memory size to %luMB\n", + (fw_dump.reserve_bootvar >> 20)); + } + + return fw_dump.reserve_bootvar; + } else if (fw_dump.reserve_bootvar) { + /* + * 'fadump_reserve_mem=' is being used to reserve memory + * for firmware-assisted dump. + */ return fw_dump.reserve_bootvar; } /* divide by 20 to get 5% of value */ - size = memblock_end_of_DRAM() / 20; + size = memblock_phys_mem_size() / 20; /* round it down in multiples of 256 */ size = size & ~0x0FFFFFFFUL; @@ -377,9 +447,22 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -static void register_fw_dump(struct fadump_mem_struct *fdm) +/* + * Look for fadump_reserve_mem= cmdline option + * TODO: Remove references to 'fadump_reserve_mem=' parameter, + * the sooner 'crashkernel=' parameter is accustomed to. + */ +static int __init early_fadump_reserve_mem(char *p) +{ + if (p) + fw_dump.reserve_bootvar = memparse(p, &p); + return 0; +} +early_param("fadump_reserve_mem", early_fadump_reserve_mem); + +static int register_fw_dump(struct fadump_mem_struct *fdm) { - int rc; + int rc, err; unsigned int wait_time; pr_debug("Registering for firmware-assisted kernel dump...\n"); @@ -396,26 +479,38 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) } while (wait_time); + err = -EIO; switch (rc) { + default: + pr_err("Failed to register. Unknown Error(%d).\n", rc); + break; case -1: printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Hardware Error(%d).\n", rc); break; case -3: + if (!is_boot_memory_area_contiguous()) + pr_err("Can't have holes in boot memory area while " + "registering fadump\n"); + printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Parameter Error(%d).\n", rc); + err = -EINVAL; break; case -9: printk(KERN_ERR "firmware-assisted kernel dump is already " " registered."); fw_dump.dump_registered = 1; + err = -EEXIST; break; case 0: printk(KERN_INFO "firmware-assisted kernel dump registration" " is successful\n"); fw_dump.dump_registered = 1; + err = 0; break; } + return err; } void crash_fadump(struct pt_regs *regs, const char *str) @@ -831,8 +926,19 @@ static void fadump_setup_crash_memory_ranges(void) for_each_memblock(memory, reg) { start = (unsigned long long)reg->base; end = start + (unsigned long long)reg->size; - if (start == RMA_START && end >= fw_dump.boot_memory_size) - start = fw_dump.boot_memory_size; + + /* + * skip the first memory chunk that is already added (RMA_START + * through boot_memory_size). This logic needs a relook if and + * when RMA_START changes to a non-zero value. + */ + BUILD_BUG_ON(RMA_START != 0); + if (start < fw_dump.boot_memory_size) { + if (end > fw_dump.boot_memory_size) + start = fw_dump.boot_memory_size; + else + continue; + } /* add this range excluding the reserved dump area. */ fadump_exclude_reserved_area(start, end); @@ -893,8 +999,7 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); phdr->p_offset = phdr->p_paddr; - phdr->p_memsz = vmcoreinfo_max_size; - phdr->p_filesz = vmcoreinfo_max_size; + phdr->p_memsz = phdr->p_filesz = VMCOREINFO_NOTE_SIZE; /* Increment number of program headers. */ (elf->e_phnum)++; @@ -956,7 +1061,7 @@ static unsigned long init_fadump_header(unsigned long addr) return addr; } -static void register_fadump(void) +static int register_fadump(void) { unsigned long addr; void *vaddr; @@ -966,7 +1071,7 @@ static void register_fadump(void) * assisted dump. */ if (!fw_dump.reserve_dump_area_size) - return; + return -ENODEV; fadump_setup_crash_memory_ranges(); @@ -979,7 +1084,7 @@ static void register_fadump(void) fadump_create_elfcore_headers(vaddr); /* register the future kernel dump with firmware. */ - register_fw_dump(&fdm); + return register_fw_dump(&fdm); } static int fadump_unregister_dump(struct fadump_mem_struct *fdm) @@ -1046,28 +1151,71 @@ void fadump_cleanup(void) } } +static void fadump_free_reserved_memory(unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + unsigned long time_limit = jiffies + HZ; + + pr_info("freeing reserved memory (0x%llx - 0x%llx)\n", + PFN_PHYS(start_pfn), PFN_PHYS(end_pfn)); + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + free_reserved_page(pfn_to_page(pfn)); + + if (time_after(jiffies, time_limit)) { + cond_resched(); + time_limit = jiffies + HZ; + } + } +} + +/* + * Skip memory holes and free memory that was actually reserved. + */ +static void fadump_release_reserved_area(unsigned long start, unsigned long end) +{ + struct memblock_region *reg; + unsigned long tstart, tend; + unsigned long start_pfn = PHYS_PFN(start); + unsigned long end_pfn = PHYS_PFN(end); + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart < tend) { + fadump_free_reserved_memory(tstart, tend); + + if (tend == end_pfn) + break; + + start_pfn = tend + 1; + } + } +} + /* * Release the memory that was reserved in early boot to preserve the memory * contents. The released memory will be available for general use. */ static void fadump_release_memory(unsigned long begin, unsigned long end) { - unsigned long addr; unsigned long ra_start, ra_end; ra_start = fw_dump.reserve_dump_area_start; ra_end = ra_start + fw_dump.reserve_dump_area_size; - for (addr = begin; addr < end; addr += PAGE_SIZE) { - /* - * exclude the dump reserve area. Will reuse it for next - * fadump registration. - */ - if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start)) - continue; - - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); - } + /* + * exclude the dump reserve area. Will reuse it for next + * fadump registration. + */ + if (begin < ra_end && end > ra_start) { + if (begin < ra_start) + fadump_release_reserved_area(begin, ra_start); + if (end > ra_end) + fadump_release_reserved_area(ra_end, end); + } else + fadump_release_reserved_area(begin, end); } static void fadump_invalidate_release_mem(void) @@ -1161,7 +1309,6 @@ static ssize_t fadump_register_store(struct kobject *kobj, switch (buf[0]) { case '0': if (fw_dump.dump_registered == 0) { - ret = -EINVAL; goto unlock_out; } /* Un-register Firmware-assisted dump */ @@ -1169,11 +1316,11 @@ static ssize_t fadump_register_store(struct kobject *kobj, break; case '1': if (fw_dump.dump_registered == 1) { - ret = -EINVAL; + ret = -EEXIST; goto unlock_out; } /* Register Firmware-assisted dump */ - register_fadump(); + ret = register_fadump(); break; default: ret = -EINVAL; diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 4898d67..516ebef 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -30,7 +30,9 @@ * Use unused space in the interrupt stack to save and restore * registers for winkle support. */ +#define _MMCR0 GPR0 #define _SDR1 GPR3 +#define _PTCR GPR3 #define _RPR GPR4 #define _SPURR GPR5 #define _PURR GPR6 @@ -39,7 +41,7 @@ #define _AMOR GPR9 #define _WORT GPR10 #define _WORC GPR11 -#define _PTCR GPR12 +#define _LPCR GPR12 #define PSSCR_EC_ESL_MASK_SHIFTED (PSSCR_EC | PSSCR_ESL) >> 16 @@ -55,12 +57,14 @@ save_sprs_to_stack: * here since any thread in the core might wake up first */ BEGIN_FTR_SECTION - mfspr r3,SPRN_PTCR - std r3,_PTCR(r1) /* * Note - SDR1 is dropped in Power ISA v3. Hence not restoring * SDR1 here */ + mfspr r3,SPRN_PTCR + std r3,_PTCR(r1) + mfspr r3,SPRN_LPCR + std r3,_LPCR(r1) FTR_SECTION_ELSE mfspr r3,SPRN_SDR1 std r3,_SDR1(r1) @@ -106,13 +110,9 @@ core_idle_lock_held: /* * Pass requested state in r3: * r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8 - * - Requested STOP state in POWER9 + * - Requested PSSCR value in POWER9 * - * To check IRQ_HAPPENED in r4 - * 0 - don't check - * 1 - check - * - * Address to 'rfid' to in r5 + * Address of idle handler to branch to in realmode in r4 */ pnv_powersave_common: /* Use r3 to pass state nap/sleep/winkle */ @@ -122,37 +122,14 @@ pnv_powersave_common: * need to save PC, some CR bits and the NV GPRs, * but for now an interrupt frame will do. */ + mtctr r4 + mflr r0 std r0,16(r1) stdu r1,-INT_FRAME_SIZE(r1) std r0,_LINK(r1) std r0,_NIP(r1) - /* Hard disable interrupts */ - mfmsr r9 - rldicl r9,r9,48,1 - rotldi r9,r9,16 - mtmsrd r9,1 /* hard-disable interrupts */ - - /* Check if something happened while soft-disabled */ - lbz r0,PACAIRQHAPPENED(r13) - andi. r0,r0,~PACA_IRQ_HARD_DIS@l - beq 1f - cmpwi cr0,r4,0 - beq 1f - addi r1,r1,INT_FRAME_SIZE - ld r0,16(r1) - li r3,0 /* Return 0 (no nap) */ - mtlr r0 - blr - -1: /* We mark irqs hard disabled as this is the state we'll - * be in when returning and we need to tell arch_local_irq_restore() - * about it - */ - li r0,PACA_IRQ_HARD_DIS - stb r0,PACAIRQHAPPENED(r13) - /* We haven't lost state ... yet */ li r0,0 stb r0,PACA_NAPSTATELOST(r13) @@ -160,9 +137,8 @@ pnv_powersave_common: /* Continue saving state */ SAVE_GPR(2, r1) SAVE_NVGPRS(r1) - mfcr r4 - std r4,_CCR(r1) - std r9,_MSR(r1) + mfcr r5 + std r5,_CCR(r1) std r1,PACAR1(r13) /* @@ -172,12 +148,8 @@ pnv_powersave_common: * the MMU context to the guest. */ LOAD_REG_IMMEDIATE(r7, MSR_IDLE) - li r6, MSR_RI - andc r6, r9, r6 - mtmsrd r6, 1 /* clear RI before setting SRR0/1 */ - mtspr SPRN_SRR0, r5 - mtspr SPRN_SRR1, r7 - rfid + mtmsrd r7,0 + bctr .globl pnv_enter_arch207_idle_mode pnv_enter_arch207_idle_mode: @@ -285,9 +257,30 @@ power_enter_stop: bne .Lhandle_esl_ec_set IDLE_STATE_ENTER_SEQ(PPC_STOP) li r3,0 /* Since we didn't lose state, return 0 */ + + /* + * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so + * it can determine if the wakeup reason is an HMI in + * CHECK_HMI_INTERRUPT. + * + * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup + * reason, so there is no point setting r12 to SRR1. + * + * Further, we clear r12 here, so that we don't accidentally enter the + * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI. + */ + li r12, 0 b pnv_wakeup_noloss .Lhandle_esl_ec_set: + /* + * POWER9 DD2 can incorrectly set PMAO when waking up after a + * state-loss idle. Saving and restoring MMCR0 over idle is a + * workaround. + */ + mfspr r4,SPRN_MMCR0 + std r4,_MMCR0(r1) + /* * Check if the requested state is a deep idle state. */ @@ -319,45 +312,23 @@ lwarx_loop_stop: IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) -_GLOBAL(power7_idle) +/* + * Entered with MSR[EE]=0 and no soft-masked interrupts pending. + * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE). + */ +_GLOBAL(power7_idle_insn) /* Now check if user or arch enabled NAP mode */ - LOAD_REG_ADDRBASE(r3,powersave_nap) - lwz r4,ADDROFF(powersave_nap)(r3) - cmpwi 0,r4,0 - beqlr - li r3, 1 - /* fall through */ - -_GLOBAL(power7_nap) - mr r4,r3 - li r3,PNV_THREAD_NAP - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) + LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode) b pnv_powersave_common - /* No return */ - -_GLOBAL(power7_sleep) - li r3,PNV_THREAD_SLEEP - li r4,1 - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - /* No return */ - -_GLOBAL(power7_winkle) - li r3,PNV_THREAD_WINKLE - li r4,1 - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - /* No return */ #define CHECK_HMI_INTERRUPT \ - mfspr r0,SPRN_SRR1; \ BEGIN_FTR_SECTION_NESTED(66); \ - rlwinm r0,r0,45-31,0xf; /* extract wake reason field (P8) */ \ + rlwinm r0,r12,45-31,0xf; /* extract wake reason field (P8) */ \ FTR_SECTION_ELSE_NESTED(66); \ - rlwinm r0,r0,45-31,0xe; /* P7 wake reason field is 3 bits */ \ + rlwinm r0,r12,45-31,0xe; /* P7 wake reason field is 3 bits */ \ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ cmpwi r0,0xa; /* Hypervisor maintenance ? */ \ - bne 20f; \ + bne+ 20f; \ /* Invoke opal call to handle hmi */ \ ld r2,PACATOC(r13); \ ld r1,PACAR1(r13); \ @@ -369,16 +340,13 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ 20: nop; /* - * r3 - The PSSCR value corresponding to the stop state. - * r4 - The PSSCR mask corrresonding to the stop state. + * Entered with MSR[EE]=0 and no soft-masked interrupts pending. + * r3 contains desired PSSCR register value. */ _GLOBAL(power9_idle_stop) - mfspr r5,SPRN_PSSCR - andc r5,r5,r4 - or r3,r3,r5 + std r3, PACA_REQ_PSSCR(r13) mtspr SPRN_PSSCR,r3 - LOAD_REG_ADDR(r5,power_enter_stop) - li r4,1 + LOAD_REG_ADDR(r4,power_enter_stop) b pnv_powersave_common /* No return */ @@ -436,17 +404,17 @@ pnv_powersave_wakeup_mce: /* * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake - * reason into SRR1, which allows reuse of the system reset wakeup + * reason into r12, which allows reuse of the system reset wakeup * code without being mistaken for another type of wakeup. */ - oris r3,r3,SRR1_WAKEMCE_RESVD@h - mtspr SPRN_SRR1,r3 + oris r12,r3,SRR1_WAKEMCE_RESVD@h b pnv_powersave_wakeup /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss + * r12 - SRR1 */ .global pnv_powersave_wakeup pnv_powersave_wakeup: @@ -464,6 +432,8 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) li r0,PNV_THREAD_RUNNING stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */ + mr r3,r12 + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE li r0,KVM_HWTHREAD_IN_KERNEL stb r0,HSTATE_HWTHREAD_STATE(r13) @@ -477,7 +447,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) #endif /* Return SRR1 from power7_nap() */ - mfspr r3,SPRN_SRR1 blt cr3,pnv_wakeup_noloss b pnv_wakeup_loss @@ -489,18 +458,39 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) */ pnv_restore_hyp_resource_arch300: /* + * Workaround for POWER9, if we lost resources, the ERAT + * might have been mixed up and needs flushing. We also need + * to reload MMCR0 (see comment above). + */ + blt cr3,1f + PPC_INVALIDATE_ERAT + ld r1,PACAR1(r13) + ld r4,_MMCR0(r1) + mtspr SPRN_MMCR0,r4 +1: + /* * POWER ISA 3. Use PSSCR to determine if we * are waking up from deep idle state */ LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) - mfspr r5,SPRN_PSSCR +BEGIN_FTR_SECTION_NESTED(71) + /* + * Assume that we are waking up from the state + * same as the Requested Level (RL) in the PSSCR + * which are Bits 60-63 + */ + ld r5,PACA_REQ_PSSCR(r13) + rldicl r5,r5,0,60 +FTR_SECTION_ELSE_NESTED(71) /* * 0-3 bits correspond to Power-Saving Level Status * which indicates the idle state we are waking up from */ + mfspr r5, SPRN_PSSCR rldicl r5,r5,4,60 +ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 71) cmpd cr4,r5,r4 bge cr4,pnv_wakeup_tb_loss /* returns to caller */ @@ -567,9 +557,9 @@ pnv_wakeup_tb_loss: * is required to return back to reset vector after hypervisor state * restore is complete. */ + mr r19,r12 mr r18,r4 mflr r17 - mfspr r16,SPRN_SRR1 BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) @@ -731,13 +721,14 @@ timebase_resync: * Use cr3 which indicates that we are waking up with atleast partial * hypervisor state loss to determine if TIMEBASE RESYNC is needed. */ - ble cr3,clear_lock + ble cr3,.Ltb_resynced /* Time base re-sync */ bl opal_resync_timebase; /* - * If waking up from sleep, per core state is not lost, skip to - * clear_lock. + * If waking up from sleep (POWER8), per core state + * is not lost, skip to clear_lock. */ +.Ltb_resynced: blt cr4,clear_lock /* @@ -812,9 +803,13 @@ no_segments: mtctr r12 bctrl +BEGIN_FTR_SECTION + ld r4,_LPCR(r1) + mtspr SPRN_LPCR,r4 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) hypervisor_state_restored: - mtspr SPRN_SRR1,r16 + mr r12,r19 mtlr r17 blr /* return to pnv_powersave_wakeup */ @@ -827,6 +822,7 @@ fastsleep_workaround_at_exit: /* * R3 here contains the value that will be returned to the caller * of power7_nap. + * R12 contains SRR1 for CHECK_HMI_INTERRUPT. */ .global pnv_wakeup_loss pnv_wakeup_loss: @@ -836,32 +832,33 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) REST_NVGPRS(r1) REST_GPR(2, r1) + ld r4,PACAKMSR(r13) + ld r5,_LINK(r1) ld r6,_CCR(r1) - ld r4,_MSR(r1) - ld r5,_NIP(r1) addi r1,r1,INT_FRAME_SIZE + mtlr r5 mtcr r6 - mtspr SPRN_SRR1,r4 - mtspr SPRN_SRR0,r5 - rfid + mtmsrd r4 + blr /* * R3 here contains the value that will be returned to the caller * of power7_nap. + * R12 contains SRR1 for CHECK_HMI_INTERRUPT. */ pnv_wakeup_noloss: lbz r0,PACA_NAPSTATELOST(r13) cmpwi r0,0 bne pnv_wakeup_loss + ld r1,PACAR1(r13) BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - ld r1,PACAR1(r13) - ld r6,_CCR(r1) - ld r4,_MSR(r1) + ld r4,PACAKMSR(r13) ld r5,_NIP(r1) + ld r6,_CCR(r1) addi r1,r1,INT_FRAME_SIZE + mtlr r5 mtcr r6 - mtspr SPRN_SRR1,r4 - mtspr SPRN_SRR0,r5 - rfid + mtmsrd r4 + blr diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index f2b724c..233ca3f 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -198,11 +198,11 @@ static unsigned long iommu_range_alloc(struct device *dev, if (unlikely(npages == 0)) { if (printk_ratelimit()) WARN_ON(1); - return DMA_ERROR_CODE; + return IOMMU_MAPPING_ERROR; } if (should_fail_iommu(dev)) - return DMA_ERROR_CODE; + return IOMMU_MAPPING_ERROR; /* * We don't need to disable preemption here because any CPU can @@ -278,7 +278,7 @@ again: } else { /* Give up */ spin_unlock_irqrestore(&(pool->lock), flags); - return DMA_ERROR_CODE; + return IOMMU_MAPPING_ERROR; } } @@ -310,13 +310,13 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, unsigned long attrs) { unsigned long entry; - dma_addr_t ret = DMA_ERROR_CODE; + dma_addr_t ret = IOMMU_MAPPING_ERROR; int build_fail; entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order); - if (unlikely(entry == DMA_ERROR_CODE)) - return DMA_ERROR_CODE; + if (unlikely(entry == IOMMU_MAPPING_ERROR)) + return IOMMU_MAPPING_ERROR; entry += tbl->it_offset; /* Offset into real TCE table */ ret = entry << tbl->it_page_shift; /* Set the return dma address */ @@ -328,12 +328,12 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, /* tbl->it_ops->set() only returns non-zero for transient errors. * Clean up the table bitmap in this case and return - * DMA_ERROR_CODE. For all other errors the functionality is + * IOMMU_MAPPING_ERROR. For all other errors the functionality is * not altered. */ if (unlikely(build_fail)) { __iommu_free(tbl, ret, npages); - return DMA_ERROR_CODE; + return IOMMU_MAPPING_ERROR; } /* Flush/invalidate TLB caches if necessary */ @@ -478,7 +478,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); /* Handle failure */ - if (unlikely(entry == DMA_ERROR_CODE)) { + if (unlikely(entry == IOMMU_MAPPING_ERROR)) { if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) dev_info(dev, "iommu_alloc failed, tbl %p " @@ -545,7 +545,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, */ if (outcount < incount) { outs = sg_next(outs); - outs->dma_address = DMA_ERROR_CODE; + outs->dma_address = IOMMU_MAPPING_ERROR; outs->dma_length = 0; } @@ -563,7 +563,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, npages = iommu_num_pages(s->dma_address, s->dma_length, IOMMU_PAGE_SIZE(tbl)); __iommu_free(tbl, vaddr, npages); - s->dma_address = DMA_ERROR_CODE; + s->dma_address = IOMMU_MAPPING_ERROR; s->dma_length = 0; } if (s == outs) @@ -777,7 +777,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, unsigned long mask, enum dma_data_direction direction, unsigned long attrs) { - dma_addr_t dma_handle = DMA_ERROR_CODE; + dma_addr_t dma_handle = IOMMU_MAPPING_ERROR; void *vaddr; unsigned long uaddr; unsigned int npages, align; @@ -797,7 +797,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction, mask >> tbl->it_page_shift, align, attrs); - if (dma_handle == DMA_ERROR_CODE) { + if (dma_handle == IOMMU_MAPPING_ERROR) { if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) { dev_info(dev, "iommu_alloc failed, tbl %p " @@ -869,7 +869,7 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, io_order = get_iommu_order(size, tbl); mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, mask >> tbl->it_page_shift, io_order, 0); - if (mapping == DMA_ERROR_CODE) { + if (mapping == IOMMU_MAPPING_ERROR) { free_pages((unsigned long)ret, order); return NULL; } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5c291df..0bcec74 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -322,7 +322,8 @@ bool prep_irq_for_idle(void) * First we need to hard disable to ensure no interrupt * occurs before we effectively enter the low power state */ - hard_irq_disable(); + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; /* * If anything happened while we were soft-disabled, @@ -347,6 +348,65 @@ bool prep_irq_for_idle(void) return true; } +#ifdef CONFIG_PPC_BOOK3S +/* + * This is for idle sequences that return with IRQs off, but the + * idle state itself wakes on interrupt. Tell the irq tracer that + * IRQs are enabled for the duration of idle so it does not get long + * off times. Must be paired with fini_irq_for_idle_irqsoff. + */ +bool prep_irq_for_idle_irqsoff(void) +{ + WARN_ON(!irqs_disabled()); + + /* + * First we need to hard disable to ensure no interrupt + * occurs before we effectively enter the low power state + */ + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + + /* + * If anything happened while we were soft-disabled, + * we return now and do not enter the low power state. + */ + if (lazy_irq_pending()) + return false; + + /* Tell lockdep we are about to re-enable */ + trace_hardirqs_on(); + + return true; +} + +/* + * Take the SRR1 wakeup reason, index into this table to find the + * appropriate irq_happened bit. + */ +static const u8 srr1_to_lazyirq[0x10] = { + 0, 0, 0, + PACA_IRQ_DBELL, + 0, + PACA_IRQ_DBELL, + PACA_IRQ_DEC, + 0, + PACA_IRQ_EE, + PACA_IRQ_EE, + PACA_IRQ_HMI, + 0, 0, 0, 0, 0 }; + +void irq_set_pending_from_srr1(unsigned long srr1) +{ + unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; + + /* + * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, + * so this can be called unconditionally with srr1 wake reason. + */ + local_paca->irq_happened |= srr1_to_lazyirq[idx]; +} +#endif /* CONFIG_PPC_BOOK3S */ + /* * Force a replay of the external interrupt handler on this CPU. */ diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 01addfb..367494d 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -164,17 +164,13 @@ NOKPROBE_SYMBOL(arch_prepare_kprobe); void arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + patch_instruction(p->addr, BREAKPOINT_INSTRUCTION); } NOKPROBE_SYMBOL(arch_arm_kprobe); void arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + patch_instruction(p->addr, p->opcode); } NOKPROBE_SYMBOL(arch_disarm_kprobe); @@ -221,7 +217,7 @@ static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs kcb->kprobe_saved_msr = regs->msr; } -bool arch_function_offset_within_entry(unsigned long offset) +bool arch_kprobe_on_func_entry(unsigned long offset) { #ifdef PPC64_ELF_ABI_v2 #ifdef CONFIG_KPROBES_ON_FTRACE diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 9ad37f8..1086ea3 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -25,6 +25,7 @@ #include <linux/kvm_para.h> #include <linux/slab.h> #include <linux/of.h> +#include <linux/nmi.h> /* hardlockup_detector_disable() */ #include <asm/reg.h> #include <asm/sections.h> @@ -718,6 +719,12 @@ static __init void kvm_free_tmp(void) static int __init kvm_guest_init(void) { + /* + * The hardlockup detector is likely to get false positives in + * KVM guests, so disable it by default. + */ + hardlockup_detector_disable(); + if (!kvm_para_available()) goto free_tmp; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 5f9eada..e0e131e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -268,6 +268,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", + "Instruction fetch (foreign)", "Page table walk ifetch (bad)", "Page table walk ifetch (foreign)", "Load (bad)", @@ -405,6 +406,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, break; } } +EXPORT_SYMBOL_GPL(machine_check_print_event_info); uint64_t get_mce_fault_addr(struct machine_check_event *evt) { diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index f913139..b76ca19 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -53,6 +53,60 @@ static void flush_tlb_206(unsigned int num_sets, unsigned int action) asm volatile("ptesync" : : : "memory"); } +static void flush_tlb_300(unsigned int num_sets, unsigned int action) +{ + unsigned long rb; + unsigned int i; + unsigned int r; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + rb = TLBIEL_INVAL_SET; + break; + case TLB_INVAL_SCOPE_LPID: + rb = TLBIEL_INVAL_SET_LPID; + break; + default: + BUG(); + break; + } + + asm volatile("ptesync" : : : "memory"); + + if (early_radix_enabled()) + r = 1; + else + r = 0; + + /* + * First flush table/PWC caches with set 0, then flush the + * rest of the sets, partition scope. Radix must then do it + * all again with process scope. Hash just has to flush + * process table. + */ + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : : + "r"(rb), "r"(0), "i"(2), "i"(0), "r"(r)); + for (i = 1; i < num_sets; i++) { + unsigned long set = i * (1<<TLBIEL_INVAL_SET_SHIFT); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : : + "r"(rb+set), "r"(0), "i"(2), "i"(0), "r"(r)); + } + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : : + "r"(rb), "r"(0), "i"(2), "i"(1), "r"(r)); + if (early_radix_enabled()) { + for (i = 1; i < num_sets; i++) { + unsigned long set = i * (1<<TLBIEL_INVAL_SET_SHIFT); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : : + "r"(rb+set), "r"(0), "i"(2), "i"(1), "r"(r)); + } + } + + asm volatile("ptesync" : : : "memory"); +} + /* * Generic routines to flush TLB on POWER processors. These routines * are used as flush_tlb hook in the cpu_spec. @@ -79,7 +133,7 @@ void __flush_tlb_power9(unsigned int action) else num_sets = POWER9_TLB_SETS_HASH; - flush_tlb_206(num_sets, action); + flush_tlb_300(num_sets, action); } @@ -236,6 +290,9 @@ static const struct mce_ierror_table mce_p9_ierror_table[] = { { 0x00000000081c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000001c0000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000000081c0000, 0x0000000008000000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 84db14e..3f7a9a2 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -244,8 +244,7 @@ _GLOBAL(_nmask_and_or_msr) */ _GLOBAL(real_readb) mfmsr r7 - ori r0,r7,MSR_DR - xori r0,r0,MSR_DR + rlwinm r0,r7,0,~MSR_DR sync mtmsr r0 sync @@ -262,8 +261,7 @@ _GLOBAL(real_readb) */ _GLOBAL(real_writeb) mfmsr r7 - ori r0,r7,MSR_DR - xori r0,r0,MSR_DR + rlwinm r0,r7,0,~MSR_DR sync mtmsr r0 sync diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index c119044..8ac0bd2 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -614,6 +614,18 @@ _GLOBAL(kexec_sequence) li r0,0 std r0,16(r1) +BEGIN_FTR_SECTION + /* + * This is the best time to turn AMR/IAMR off. + * key 0 is used in radix for supervisor<->user + * protection, but on hash key 0 is reserved + * ideally we want to enter with a clean state. + * NOTE, we rely on r0 being 0 from above. + */ + mtspr SPRN_IAMR,r0 + mtspr SPRN_AMOR,r0 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + /* save regs for local vars on new stack. * yes, we won't go back, but ... */ diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index eae61b0..496d639 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -792,21 +792,17 @@ static ssize_t dev_nvram_write(struct file *file, const char __user *buf, count = min_t(size_t, count, size - *ppos); count = min(count, PAGE_SIZE); - ret = -ENOMEM; - tmp = kmalloc(count, GFP_KERNEL); - if (!tmp) - goto out; - - ret = -EFAULT; - if (copy_from_user(tmp, buf, count)) + tmp = memdup_user(buf, count); + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); goto out; + } ret = ppc_md.nvram_write(tmp, count, ppos); -out: kfree(tmp); +out: return ret; - } static long dev_nvram_ioctl(struct file *file, unsigned int cmd, diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index ec60ed0..6f8273f 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -158,12 +158,13 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) { /* addis r4,0,(insn)@h */ - *addr++ = PPC_INST_ADDIS | ___PPC_RT(4) | - ((val >> 16) & 0xffff); + patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(4) | + ((val >> 16) & 0xffff)); + addr++; /* ori r4,r4,(insn)@l */ - *addr = PPC_INST_ORI | ___PPC_RA(4) | ___PPC_RS(4) | - (val & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(4) | + ___PPC_RS(4) | (val & 0xffff)); } /* @@ -173,24 +174,28 @@ void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) void patch_imm64_load_insns(unsigned long val, kprobe_opcode_t *addr) { /* lis r3,(op)@highest */ - *addr++ = PPC_INST_ADDIS | ___PPC_RT(3) | - ((val >> 48) & 0xffff); + patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(3) | + ((val >> 48) & 0xffff)); + addr++; /* ori r3,r3,(op)@higher */ - *addr++ = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | - ((val >> 32) & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | + ___PPC_RS(3) | ((val >> 32) & 0xffff)); + addr++; /* rldicr r3,r3,32,31 */ - *addr++ = PPC_INST_RLDICR | ___PPC_RA(3) | ___PPC_RS(3) | - __PPC_SH64(32) | __PPC_ME64(31); + patch_instruction(addr, PPC_INST_RLDICR | ___PPC_RA(3) | + ___PPC_RS(3) | __PPC_SH64(32) | __PPC_ME64(31)); + addr++; /* oris r3,r3,(op)@h */ - *addr++ = PPC_INST_ORIS | ___PPC_RA(3) | ___PPC_RS(3) | - ((val >> 16) & 0xffff); + patch_instruction(addr, PPC_INST_ORIS | ___PPC_RA(3) | + ___PPC_RS(3) | ((val >> 16) & 0xffff)); + addr++; /* ori r3,r3,(op)@l */ - *addr = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | - (val & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | + ___PPC_RS(3) | (val & 0xffff)); } int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) @@ -198,7 +203,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) kprobe_opcode_t *buff, branch_op_callback, branch_emulate_step; kprobe_opcode_t *op_callback_addr, *emulate_step_addr; long b_offset; - unsigned long nip; + unsigned long nip, size; + int rc, i; kprobe_ppc_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; @@ -231,8 +237,14 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) goto error; /* Setup template */ - memcpy(buff, optprobe_template_entry, - TMPL_END_IDX * sizeof(kprobe_opcode_t)); + /* We can optimize this via patch_instruction_window later */ + size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int); + pr_devel("Copying template to %p, size %lu\n", buff, size); + for (i = 0; i < size; i++) { + rc = patch_instruction(buff + i, *(optprobe_template_entry + i)); + if (rc < 0) + goto error; + } /* * Fixup the template with instructions to: @@ -261,8 +273,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) if (!branch_op_callback || !branch_emulate_step) goto error; - buff[TMPL_CALL_HDLR_IDX] = branch_op_callback; - buff[TMPL_EMULATE_IDX] = branch_emulate_step; + patch_instruction(buff + TMPL_CALL_HDLR_IDX, branch_op_callback); + patch_instruction(buff + TMPL_EMULATE_IDX, branch_emulate_step); /* * 3. load instruction to be emulated into relevant register, and @@ -272,8 +284,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) /* * 4. branch back from trampoline */ - buff[TMPL_RET_IDX] = create_branch((unsigned int *)buff + TMPL_RET_IDX, - (unsigned long)nip, 0); + patch_branch(buff + TMPL_RET_IDX, (unsigned long)nip, 0); flush_icache_range((unsigned long)buff, (unsigned long)(&buff[TMPL_END_IDX])); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 2ad725e..9f3e2c9 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -511,6 +511,10 @@ void restore_math(struct pt_regs *regs) { unsigned long msr; + /* + * Syscall exit makes a similar initial check before branching + * to restore_math. Keep them in synch. + */ if (!msr_tm_active(regs->msr) && !current->thread.load_fp && !loadvec(current->thread)) return; @@ -1133,6 +1137,11 @@ static inline void restore_sprs(struct thread_struct *old_thread, #endif } +#ifdef CONFIG_PPC_BOOK3S_64 +#define CP_SIZE 128 +static const u8 dummy_copy_buffer[CP_SIZE] __attribute__((aligned(CP_SIZE))); +#endif + struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *new) { @@ -1195,12 +1204,14 @@ struct task_struct *__switch_to(struct task_struct *prev, __switch_to_tm(prev, new); - /* - * We can't take a PMU exception inside _switch() since there is a - * window where the kernel stack SLB and the kernel stack are out - * of sync. Hard disable here. - */ - hard_irq_disable(); + if (!radix_enabled()) { + /* + * We can't take a PMU exception inside _switch() since there + * is a window where the kernel stack SLB and the kernel stack + * are out of sync. Hard disable here. + */ + hard_irq_disable(); + } /* * Call restore_sprs() before calling _switch(). If we move it after @@ -1220,8 +1231,28 @@ struct task_struct *__switch_to(struct task_struct *prev, batch->active = 1; } - if (current_thread_info()->task->thread.regs) + if (current_thread_info()->task->thread.regs) { restore_math(current_thread_info()->task->thread.regs); + + /* + * The copy-paste buffer can only store into foreign real + * addresses, so unprivileged processes can not see the + * data or use it in any way unless they have foreign real + * mappings. We don't have a VAS driver that allocates those + * yet, so no cpabort is required. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + /* + * DD1 allows paste into normal system memory, so we + * do an unpaired copy here to clear the buffer and + * prevent a covert channel being set up. + * + * cpabort is not used because it is quite expensive. + */ + asm volatile(PPC_COPY(%0, %1) + : : "r"(dummy_copy_buffer), "r"(0)); + } + } #endif /* CONFIG_PPC_STD_MMU_64 */ return last; diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index dd8a04f..613f79f 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -15,6 +15,9 @@ #undef DEBUG_PROM +/* we cannot use FORTIFY as it brings in new symbols */ +#define __NO_FORTIFY + #include <stdarg.h> #include <linux/kernel.h> #include <linux/string.h> diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 3650732..0f0b1b2 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -283,7 +283,7 @@ static void prrn_work_fn(struct work_struct *work) * the RTAS event. */ pseries_devicetree_update(-prrn_update_scope); - arch_update_cpu_topology(); + numa_update_cpu_topology(false); } static DECLARE_WORK(prrn_work, prrn_work_fn); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 857129a..94a9482 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -335,6 +335,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) maj = ((pvr >> 8) & 0xFF) - 1; min = pvr & 0xFF; break; + case 0x004e: /* POWER9 bits 12-15 give chip type */ + maj = (pvr >> 8) & 0x0F; + min = pvr & 0xFF; + break; default: maj = (pvr >> 8) & 0xFF; min = pvr & 0xFF; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 4640f6d..af23d4b 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -751,22 +751,3 @@ unsigned long memory_block_size_bytes(void) struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif - -#ifdef CONFIG_HARDLOCKUP_DETECTOR -u64 hw_nmi_get_sample_period(int watchdog_thresh) -{ - return ppc_proc_freq * watchdog_thresh; -} - -/* - * The hardlockup detector breaks PMU event based branches and is likely - * to get false positives in KVM guests, so disable it by default. - */ -static int __init disable_hardlockup_detector(void) -{ - hardlockup_detector_disable(); - - return 0; -} -early_initcall(disable_hardlockup_detector); -#endif diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index df2a416..997c88d 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -33,6 +33,7 @@ #include <linux/notifier.h> #include <linux/topology.h> #include <linux/profile.h> +#include <linux/processor.h> #include <asm/ptrace.h> #include <linux/atomic.h> @@ -97,7 +98,7 @@ int smp_generic_cpu_bootable(unsigned int nr) /* Special case - we inhibit secondary thread startup * during boot if the user requests it. */ - if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) { + if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) return 0; if (smt_enabled_at_boot @@ -112,7 +113,8 @@ int smp_generic_cpu_bootable(unsigned int nr) #ifdef CONFIG_PPC64 int smp_generic_kick_cpu(int nr) { - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= nr_cpu_ids) + return -EINVAL; /* * The processor is currently spinning, waiting for the @@ -433,13 +435,31 @@ static void do_smp_send_nmi_ipi(int cpu) } } +void smp_flush_nmi_ipi(u64 delay_us) +{ + unsigned long flags; + + nmi_ipi_lock_start(&flags); + while (nmi_ipi_busy_count) { + nmi_ipi_unlock_end(&flags); + udelay(1); + if (delay_us) { + delay_us--; + if (!delay_us) + return; + } + nmi_ipi_lock_start(&flags); + } + nmi_ipi_unlock_end(&flags); +} + /* * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS. * - fn is the target callback function. * - delay_us > 0 is the delay before giving up waiting for targets to * enter the handler, == 0 specifies indefinite delay. */ -static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us) +int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us) { unsigned long flags; int me = raw_smp_processor_id(); @@ -766,8 +786,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) smp_ops->give_timebase(); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu)) - cpu_relax(); + spin_until_cond(cpu_online(cpu)); return 0; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2b33cfa..fe6f3a2 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -59,10 +59,10 @@ #include <linux/suspend.h> #include <linux/rtc.h> #include <linux/sched/cputime.h> +#include <linux/processor.h> #include <asm/trace.h> #include <asm/io.h> -#include <asm/processor.h> #include <asm/nvram.h> #include <asm/cache.h> #include <asm/machdep.h> @@ -442,6 +442,7 @@ void __delay(unsigned long loops) unsigned long start; int diff; + spin_begin(); if (__USE_RTC()) { start = get_rtcl(); do { @@ -449,13 +450,14 @@ void __delay(unsigned long loops) diff = get_rtcl() - start; if (diff < 0) diff += 1000000000; + spin_cpu_relax(); } while (diff < loops); } else { start = get_tbl(); while (get_tbl() - start < loops) - HMT_low(); - HMT_medium(); + spin_cpu_relax(); } + spin_end(); } EXPORT_SYMBOL(__delay); @@ -675,7 +677,7 @@ EXPORT_SYMBOL_GPL(tb_to_ns); * the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b * are 64-bit unsigned numbers. */ -unsigned long long sched_clock(void) +notrace unsigned long long sched_clock(void) { if (__USE_RTC()) return get_rtc(); @@ -739,12 +741,20 @@ static int __init get_freq(char *name, int cells, unsigned long *val) static void start_cpu_decrementer(void) { #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) + unsigned int tcr; + /* Clear any pending timer interrupts */ mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS); - /* Enable decrementer interrupt */ - mtspr(SPRN_TCR, TCR_DIE); -#endif /* defined(CONFIG_BOOKE) || defined(CONFIG_40x) */ + tcr = mfspr(SPRN_TCR); + /* + * The watchdog may have already been enabled by u-boot. So leave + * TRC[WP] (Watchdog Period) alone. + */ + tcr &= TCR_WP_MASK; /* Clear all bits except for TCR[WP] */ + tcr |= TCR_DIE; /* Enable decrementer */ + mtspr(SPRN_TCR, tcr); +#endif } void __init generic_calibrate_decr(void) @@ -823,38 +833,76 @@ void read_persistent_clock(struct timespec *ts) } /* clocksource code */ -static u64 rtc_read(struct clocksource *cs) +static notrace u64 rtc_read(struct clocksource *cs) { return (u64)get_rtc(); } -static u64 timebase_read(struct clocksource *cs) +static notrace u64 timebase_read(struct clocksource *cs) { return (u64)get_tb(); } -void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm, - struct clocksource *clock, u32 mult, u64 cycle_last) + +void update_vsyscall(struct timekeeper *tk) { + struct timespec xt; + struct clocksource *clock = tk->tkr_mono.clock; + u32 mult = tk->tkr_mono.mult; + u32 shift = tk->tkr_mono.shift; + u64 cycle_last = tk->tkr_mono.cycle_last; u64 new_tb_to_xs, new_stamp_xsec; - u32 frac_sec; + u64 frac_sec; if (clock != &clocksource_timebase) return; + xt.tv_sec = tk->xtime_sec; + xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + /* Make userspace gettimeofday spin until we're done. */ ++vdso_data->tb_update_count; smp_mb(); - /* 19342813113834067 ~= 2^(20+64) / 1e9 */ - new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); - new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC; - do_div(new_stamp_xsec, 1000000000); - new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC; + /* + * This computes ((2^20 / 1e9) * mult) >> shift as a + * 0.64 fixed-point fraction. + * The computation in the else clause below won't overflow + * (as long as the timebase frequency is >= 1.049 MHz) + * but loses precision because we lose the low bits of the constant + * in the shift. Note that 19342813113834067 ~= 2^(20+64) / 1e9. + * For a shift of 24 the error is about 0.5e-9, or about 0.5ns + * over a second. (Shift values are usually 22, 23 or 24.) + * For high frequency clocks such as the 512MHz timebase clock + * on POWER[6789], the mult value is small (e.g. 32768000) + * and so we can shift the constant by 16 initially + * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the + * remaining shifts after the multiplication, which gives a + * more accurate result (e.g. with mult = 32768000, shift = 24, + * the error is only about 1.2e-12, or 0.7ns over 10 minutes). + */ + if (mult <= 62500000 && clock->shift >= 16) + new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16); + else + new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); + + /* + * Compute the fractional second in units of 2^-32 seconds. + * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift + * in nanoseconds, so multiplying that by 2^32 / 1e9 gives + * it in units of 2^-32 seconds. + * We assume shift <= 32 because clocks_calc_mult_shift() + * generates shift values in the range 0 - 32. + */ + frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift); + do_div(frac_sec, NSEC_PER_SEC); - BUG_ON(wall_time->tv_nsec >= NSEC_PER_SEC); - /* this is tv_nsec / 1e9 as a 0.32 fraction */ - frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32; + /* + * Work out new stamp_xsec value for any legacy users of systemcfg. + * stamp_xsec is in units of 2^-20 seconds. + */ + new_stamp_xsec = frac_sec >> 12; + new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC; /* * tb_update_count is used to allow the userspace gettimeofday code @@ -864,15 +912,13 @@ void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm, * the two values of tb_update_count match and are even then the * tb_to_xs and stamp_xsec values are consistent. If not, then it * loops back and reads them again until this criteria is met. - * We expect the caller to have done the first increment of - * vdso_data->tb_update_count already. */ vdso_data->tb_orig_stamp = cycle_last; vdso_data->stamp_xsec = new_stamp_xsec; vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wtm->tv_sec; - vdso_data->wtom_clock_nsec = wtm->tv_nsec; - vdso_data->stamp_xtime = *wall_time; + vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec; + vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec; + vdso_data->stamp_xtime = xt; vdso_data->stamp_sec_fraction = frac_sec; smp_wmb(); ++(vdso_data->tb_update_count); diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 3a2d041..c4ba378 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -313,8 +313,8 @@ dont_backup_fp: blr - /* void tm_recheckpoint(struct thread_struct *thread, - * unsigned long orig_msr) + /* void __tm_recheckpoint(struct thread_struct *thread, + * unsigned long orig_msr) * - Restore the checkpointed register state saved by tm_reclaim * when we switch_to a process. * diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index d4e545d..bfcfd9e 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -237,6 +237,7 @@ void die(const char *str, struct pt_regs *regs, long err) err = 0; oops_end(flags, regs, err); } +NOKPROBE_SYMBOL(die); void user_single_step_siginfo(struct task_struct *tsk, struct pt_regs *regs, siginfo_t *info) @@ -1968,6 +1969,7 @@ void unrecoverable_exception(struct pt_regs *regs) regs->trap, regs->nip); die("Unrecoverable exception", regs, SIGABRT); } +NOKPROBE_SYMBOL(unrecoverable_exception); #if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x) /* @@ -1998,6 +2000,7 @@ void kernel_bad_stack(struct pt_regs *regs) regs->gpr[1], regs->nip); die("Bad kernel stack pointer", regs, SIGABRT); } +NOKPROBE_SYMBOL(kernel_bad_stack); void __init trap_init(void) { diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 2f793be..b1a2505 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -8,6 +8,12 @@ #include <asm/cache.h> #include <asm/thread_info.h> +#ifdef CONFIG_STRICT_KERNEL_RWX +#define STRICT_ALIGN_SIZE (1 << 24) +#else +#define STRICT_ALIGN_SIZE PAGE_SIZE +#endif + ENTRY(_stext) PHDRS { @@ -58,7 +64,6 @@ SECTIONS #ifdef CONFIG_PPC64 KEEP(*(.head.text.first_256B)); #ifdef CONFIG_PPC_BOOK3E -# define END_FIXED 0x100 #else KEEP(*(.head.text.real_vectors)); *(.head.text.real_trampolines); @@ -66,12 +71,8 @@ SECTIONS *(.head.text.virt_trampolines); # if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) KEEP(*(.head.data.fwnmi_page)); -# define END_FIXED 0x8000 -# else -# define END_FIXED 0x7000 # endif #endif - ASSERT((. == END_FIXED), "vmlinux.lds.S: fixed section overflow error"); #else /* !CONFIG_PPC64 */ HEAD_TEXT #endif @@ -79,23 +80,6 @@ SECTIONS __head_end = .; - /* - * If the build dies here, it's likely code in head_64.S is referencing - * labels it can't reach, and the linker inserting stubs without the - * assembler's knowledge. To debug, remove the above assert and - * rebuild. Look for branch stubs in the fixed section region. - * - * Linker stub generation could be allowed in "trampoline" - * sections if absolutely necessary, but this would require - * some rework of the fixed sections. Before resorting to this, - * consider references that have sufficient addressing range, - * (e.g., hand coded trampolines) so the linker does not have - * to add stubs. - * - * Linker stubs at the top of the main text section are currently not - * detected, and will result in a crash at boot due to offsets being - * wrong. - */ #ifdef CONFIG_PPC64 /* * BLOCK(0) overrides the default output section alignment because @@ -103,18 +87,31 @@ SECTIONS * section placement to work. */ .text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_LD_HEAD_STUB_CATCH + *(.linker_stub_catch); + . = . ; +#endif + #else .text : AT(ADDR(.text) - LOAD_OFFSET) { ALIGN_FUNCTION(); #endif /* careful! __ftr_alt_* sections need to be close to .text */ - *(.text .fixup __ftr_alt_* .ref.text) + *(.text.hot .text .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text); SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT + /* + * -Os builds call FP save/restore functions. The powerpc64 + * linker generates those on demand in the .sfpr section. + * .sfpr gets placed at the beginning of a group of input + * sections, which can break start-of-text offset if it is + * included with the main text sections, so put it by itself. + */ + *(.sfpr); MEM_KEEP(init.text) MEM_KEEP(exit.text) @@ -132,7 +129,7 @@ SECTIONS PROVIDE32 (etext = .); /* Read-only data */ - RODATA + RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(0) @@ -149,7 +146,7 @@ SECTIONS /* * Init sections discarded at runtime */ - . = ALIGN(PAGE_SIZE); + . = ALIGN(STRICT_ALIGN_SIZE); __init_begin = .; INIT_TEXT_SECTION(PAGE_SIZE) :kernel @@ -267,7 +264,9 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { DATA_DATA *(.sdata) + *(.sdata2) *(.got.plt) *(.got) + *(.plt) } #else .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -330,6 +329,16 @@ SECTIONS _end = . ; PROVIDE32 (end = .); - /* Sections to be discarded. */ + STABS_DEBUG + + DWARF_DEBUG + DISCARDS + /DISCARD/ : { + *(*.EMB.apuinfo) + *(.glink .iplt .plt .rela* .comment) + *(.gnu.version*) + *(.gnu.attributes) + *(.eh_frame) + } } diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c new file mode 100644 index 0000000..b67f8b0 --- /dev/null +++ b/arch/powerpc/kernel/watchdog.c @@ -0,0 +1,386 @@ +/* + * Watchdog support on powerpc systems. + * + * Copyright 2017, IBM Corporation. + * + * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c + */ +#include <linux/kernel.h> +#include <linux/param.h> +#include <linux/init.h> +#include <linux/percpu.h> +#include <linux/cpu.h> +#include <linux/nmi.h> +#include <linux/module.h> +#include <linux/export.h> +#include <linux/kprobes.h> +#include <linux/hardirq.h> +#include <linux/reboot.h> +#include <linux/slab.h> +#include <linux/kdebug.h> +#include <linux/sched/debug.h> +#include <linux/delay.h> +#include <linux/smp.h> + +#include <asm/paca.h> + +/* + * The watchdog has a simple timer that runs on each CPU, once per timer + * period. This is the heartbeat. + * + * Then there are checks to see if the heartbeat has not triggered on a CPU + * for the panic timeout period. Currently the watchdog only supports an + * SMP check, so the heartbeat only turns on when we have 2 or more CPUs. + * + * This is not an NMI watchdog, but Linux uses that name for a generic + * watchdog in some cases, so NMI gets used in some places. + */ + +static cpumask_t wd_cpus_enabled __read_mostly; + +static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */ +static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */ + +static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeat */ + +static DEFINE_PER_CPU(struct timer_list, wd_timer); +static DEFINE_PER_CPU(u64, wd_timer_tb); + +/* + * These are for the SMP checker. CPUs clear their pending bit in their + * heartbeat. If the bitmask becomes empty, the time is noted and the + * bitmask is refilled. + * + * All CPUs clear their bit in the pending mask every timer period. + * Once all have cleared, the time is noted and the bits are reset. + * If the time since all clear was greater than the panic timeout, + * we can panic with the list of stuck CPUs. + * + * This will work best with NMI IPIs for crash code so the stuck CPUs + * can be pulled out to get their backtraces. + */ +static unsigned long __wd_smp_lock; +static cpumask_t wd_smp_cpus_pending; +static cpumask_t wd_smp_cpus_stuck; +static u64 wd_smp_last_reset_tb; + +static inline void wd_smp_lock(unsigned long *flags) +{ + /* + * Avoid locking layers if possible. + * This may be called from low level interrupt handlers at some + * point in future. + */ + local_irq_save(*flags); + while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) + cpu_relax(); +} + +static inline void wd_smp_unlock(unsigned long *flags) +{ + clear_bit_unlock(0, &__wd_smp_lock); + local_irq_restore(*flags); +} + +static void wd_lockup_ipi(struct pt_regs *regs) +{ + pr_emerg("Watchdog CPU:%d Hard LOCKUP\n", raw_smp_processor_id()); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); +} + +static void set_cpu_stuck(int cpu, u64 tb) +{ + cpumask_set_cpu(cpu, &wd_smp_cpus_stuck); + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + if (cpumask_empty(&wd_smp_cpus_pending)) { + wd_smp_last_reset_tb = tb; + cpumask_andnot(&wd_smp_cpus_pending, + &wd_cpus_enabled, + &wd_smp_cpus_stuck); + } +} + +static void watchdog_smp_panic(int cpu, u64 tb) +{ + unsigned long flags; + int c; + + wd_smp_lock(&flags); + /* Double check some things under lock */ + if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb) + goto out; + if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) + goto out; + if (cpumask_weight(&wd_smp_cpus_pending) == 0) + goto out; + + pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n", + cpu, cpumask_pr_args(&wd_smp_cpus_pending)); + + /* + * Try to trigger the stuck CPUs. + */ + for_each_cpu(c, &wd_smp_cpus_pending) { + if (c == cpu) + continue; + smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + } + smp_flush_nmi_ipi(1000000); + + /* Take the stuck CPU out of the watch group */ + for_each_cpu(c, &wd_smp_cpus_pending) + set_cpu_stuck(c, tb); + +out: + wd_smp_unlock(&flags); + + printk_safe_flush(); + /* + * printk_safe_flush() seems to require another print + * before anything actually goes out to console. + */ + if (sysctl_hardlockup_all_cpu_backtrace) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(NULL, "Hard LOCKUP"); +} + +static void wd_smp_clear_cpu_pending(int cpu, u64 tb) +{ + if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) { + if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) { + unsigned long flags; + + pr_emerg("Watchdog CPU:%d became unstuck\n", cpu); + wd_smp_lock(&flags); + cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck); + wd_smp_unlock(&flags); + } + return; + } + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + if (cpumask_empty(&wd_smp_cpus_pending)) { + unsigned long flags; + + wd_smp_lock(&flags); + if (cpumask_empty(&wd_smp_cpus_pending)) { + wd_smp_last_reset_tb = tb; + cpumask_andnot(&wd_smp_cpus_pending, + &wd_cpus_enabled, + &wd_smp_cpus_stuck); + } + wd_smp_unlock(&flags); + } +} + +static void watchdog_timer_interrupt(int cpu) +{ + u64 tb = get_tb(); + + per_cpu(wd_timer_tb, cpu) = tb; + + wd_smp_clear_cpu_pending(cpu, tb); + + if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb) + watchdog_smp_panic(cpu, tb); +} + +void soft_nmi_interrupt(struct pt_regs *regs) +{ + unsigned long flags; + int cpu = raw_smp_processor_id(); + u64 tb; + + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) + return; + + nmi_enter(); + tb = get_tb(); + if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) { + per_cpu(wd_timer_tb, cpu) = tb; + + wd_smp_lock(&flags); + if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) { + wd_smp_unlock(&flags); + goto out; + } + set_cpu_stuck(cpu, tb); + + pr_emerg("Watchdog CPU:%d Hard LOCKUP\n", cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + wd_smp_unlock(&flags); + + if (sysctl_hardlockup_all_cpu_backtrace) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + } + if (wd_panic_timeout_tb < 0x7fffffff) + mtspr(SPRN_DEC, wd_panic_timeout_tb); + +out: + nmi_exit(); +} + +static void wd_timer_reset(unsigned int cpu, struct timer_list *t) +{ + t->expires = jiffies + msecs_to_jiffies(wd_timer_period_ms); + if (wd_timer_period_ms > 1000) + t->expires = __round_jiffies_up(t->expires, cpu); + add_timer_on(t, cpu); +} + +static void wd_timer_fn(unsigned long data) +{ + struct timer_list *t = this_cpu_ptr(&wd_timer); + int cpu = smp_processor_id(); + + watchdog_timer_interrupt(cpu); + + wd_timer_reset(cpu, t); +} + +void arch_touch_nmi_watchdog(void) +{ + int cpu = smp_processor_id(); + + watchdog_timer_interrupt(cpu); +} +EXPORT_SYMBOL(arch_touch_nmi_watchdog); + +static void start_watchdog_timer_on(unsigned int cpu) +{ + struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); + + per_cpu(wd_timer_tb, cpu) = get_tb(); + + setup_pinned_timer(t, wd_timer_fn, 0); + wd_timer_reset(cpu, t); +} + +static void stop_watchdog_timer_on(unsigned int cpu) +{ + struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); + + del_timer_sync(t); +} + +static int start_wd_on_cpu(unsigned int cpu) +{ + if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) { + WARN_ON(1); + return 0; + } + + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return 0; + + if (watchdog_suspended) + return 0; + + if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) + return 0; + + cpumask_set_cpu(cpu, &wd_cpus_enabled); + if (cpumask_weight(&wd_cpus_enabled) == 1) { + cpumask_set_cpu(cpu, &wd_smp_cpus_pending); + wd_smp_last_reset_tb = get_tb(); + } + smp_wmb(); + start_watchdog_timer_on(cpu); + + return 0; +} + +static int stop_wd_on_cpu(unsigned int cpu) +{ + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) + return 0; /* Can happen in CPU unplug case */ + + stop_watchdog_timer_on(cpu); + + cpumask_clear_cpu(cpu, &wd_cpus_enabled); + wd_smp_clear_cpu_pending(cpu, get_tb()); + + return 0; +} + +static void watchdog_calc_timeouts(void) +{ + wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq; + + /* Have the SMP detector trigger a bit later */ + wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2; + + /* 2/5 is the factor that the perf based detector uses */ + wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5; +} + +void watchdog_nmi_reconfigure(void) +{ + int cpu; + + watchdog_calc_timeouts(); + + for_each_cpu(cpu, &wd_cpus_enabled) + stop_wd_on_cpu(cpu); + + for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) + start_wd_on_cpu(cpu); +} + +/* + * This runs after lockup_detector_init() which sets up watchdog_cpumask. + */ +static int __init powerpc_watchdog_init(void) +{ + int err; + + watchdog_calc_timeouts(); + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online", + start_wd_on_cpu, stop_wd_on_cpu); + if (err < 0) + pr_warn("Watchdog could not be initialized"); + + return 0; +} +arch_initcall(powerpc_watchdog_init); + +static void handle_backtrace_ipi(struct pt_regs *regs) +{ + nmi_cpu_backtrace(regs); +} + +static void raise_backtrace_ipi(cpumask_t *mask) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu == smp_processor_id()) + handle_backtrace_ipi(NULL); + else + smp_send_nmi_ipi(cpu, handle_backtrace_ipi, 1000000); + } +} + +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) +{ + nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi); +} |