diff options
45 files changed, 5901 insertions, 193 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 81e9c98..312cd77 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6491,6 +6491,7 @@ F: arch/*/include/asm/kvm* F: include/linux/kvm* F: include/uapi/linux/kvm* F: virt/kvm/ +F: tools/kvm/ KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V M: Joerg Roedel <joro@8bytes.org> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 0df6b1f..96387d4 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -41,6 +41,8 @@ #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS +#define KVM_REQ_VCPU_EXIT 8 + u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); @@ -226,6 +228,10 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); +void kvm_arm_halt_guest(struct kvm *kvm); +void kvm_arm_resume_guest(struct kvm *kvm); +void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu); +void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu); int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu); diff --git a/arch/arm/include/asm/kvm_mmio.h b/arch/arm/include/asm/kvm_mmio.h index d8e90c8..f3a7de7 100644 --- a/arch/arm/include/asm/kvm_mmio.h +++ b/arch/arm/include/asm/kvm_mmio.h @@ -28,6 +28,9 @@ struct kvm_decode { bool sign_extend; }; +void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); +unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); + int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, phys_addr_t fault_ipa); diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 95a0005..02abfff 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -46,6 +46,13 @@ config KVM_ARM_HOST ---help--- Provides host support for ARM processors. +config KVM_NEW_VGIC + bool "New VGIC implementation" + depends on KVM + default y + ---help--- + uses the new VGIC implementation + source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index eb1bf43..a596b58 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -21,7 +21,18 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/ obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o + +ifeq ($(CONFIG_KVM_NEW_VGIC),y) +obj-y += $(KVM)/arm/vgic/vgic.o +obj-y += $(KVM)/arm/vgic/vgic-init.o +obj-y += $(KVM)/arm/vgic/vgic-irqfd.o +obj-y += $(KVM)/arm/vgic/vgic-v2.o +obj-y += $(KVM)/arm/vgic/vgic-mmio.o +obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o +obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o +else obj-y += $(KVM)/arm/vgic.o obj-y += $(KVM)/arm/vgic-v2.o obj-y += $(KVM)/arm/vgic-v2-emul.o +endif obj-y += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 237d5d8..893941e 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -455,7 +455,7 @@ static void update_vttbr(struct kvm *kvm) static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - int ret; + int ret = 0; if (likely(vcpu->arch.has_run_once)) return 0; @@ -478,9 +478,9 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) * interrupts from the virtual timer with a userspace gic. */ if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) - kvm_timer_enable(kvm); + ret = kvm_timer_enable(vcpu); - return 0; + return ret; } bool kvm_arch_intc_initialized(struct kvm *kvm) @@ -488,30 +488,37 @@ bool kvm_arch_intc_initialized(struct kvm *kvm) return vgic_initialized(kvm); } -static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused; -static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused; - -static void kvm_arm_halt_guest(struct kvm *kvm) +void kvm_arm_halt_guest(struct kvm *kvm) { int i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) vcpu->arch.pause = true; - force_vm_exit(cpu_all_mask); + kvm_make_all_cpus_request(kvm, KVM_REQ_VCPU_EXIT); +} + +void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pause = true; + kvm_vcpu_kick(vcpu); } -static void kvm_arm_resume_guest(struct kvm *kvm) +void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu) +{ + struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); + + vcpu->arch.pause = false; + swake_up(wq); +} + +void kvm_arm_resume_guest(struct kvm *kvm) { int i; struct kvm_vcpu *vcpu; - kvm_for_each_vcpu(i, vcpu, kvm) { - struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); - - vcpu->arch.pause = false; - swake_up(wq); - } + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_arm_resume_vcpu(vcpu); } static void vcpu_sleep(struct kvm_vcpu *vcpu) diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c index 0f6600f..10f80a6 100644 --- a/arch/arm/kvm/mmio.c +++ b/arch/arm/kvm/mmio.c @@ -23,7 +23,7 @@ #include "trace.h" -static void mmio_write_buf(char *buf, unsigned int len, unsigned long data) +void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data) { void *datap = NULL; union { @@ -55,7 +55,7 @@ static void mmio_write_buf(char *buf, unsigned int len, unsigned long data) memcpy(buf, datap, len); } -static unsigned long mmio_read_buf(char *buf, unsigned int len) +unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) { unsigned long data = 0; union { @@ -66,7 +66,7 @@ static unsigned long mmio_read_buf(char *buf, unsigned int len) switch (len) { case 1: - data = buf[0]; + data = *(u8 *)buf; break; case 2: memcpy(&tmp.hword, buf, len); @@ -87,11 +87,10 @@ static unsigned long mmio_read_buf(char *buf, unsigned int len) /** * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation + * or in-kernel IO emulation + * * @vcpu: The VCPU pointer * @run: The VCPU run struct containing the mmio data - * - * This should only be called after returning from userspace for MMIO load - * emulation. */ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -104,7 +103,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) if (len > sizeof(unsigned long)) return -EINVAL; - data = mmio_read_buf(run->mmio.data, len); + data = kvm_mmio_read_buf(run->mmio.data, len); if (vcpu->arch.mmio_decode.sign_extend && len < sizeof(unsigned long)) { @@ -190,7 +189,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, len); trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data); - mmio_write_buf(data_buf, len, data); + kvm_mmio_write_buf(data_buf, len, data); ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); @@ -206,18 +205,19 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, run->mmio.is_write = is_write; run->mmio.phys_addr = fault_ipa; run->mmio.len = len; - if (is_write) - memcpy(run->mmio.data, data_buf, len); if (!ret) { /* We handled the access successfully in the kernel. */ + if (!is_write) + memcpy(run->mmio.data, data_buf, len); vcpu->stat.mmio_exit_kernel++; kvm_handle_mmio_return(vcpu, run); return 1; - } else { - vcpu->stat.mmio_exit_user++; } + if (is_write) + memcpy(run->mmio.data, data_buf, len); + vcpu->stat.mmio_exit_user++; run->exit_reason = KVM_EXIT_MMIO; return 0; } diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e63d23b..49095fc 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -43,6 +43,8 @@ #define KVM_VCPU_MAX_FEATURES 4 +#define KVM_REQ_VCPU_EXIT 8 + int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); int kvm_arch_dev_ioctl_check_extension(long ext); @@ -327,6 +329,10 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); +void kvm_arm_halt_guest(struct kvm *kvm); +void kvm_arm_resume_guest(struct kvm *kvm); +void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu); +void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu); u64 __kvm_call_hyp(void *hypfn, ...); #define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) diff --git a/arch/arm64/include/asm/kvm_mmio.h b/arch/arm64/include/asm/kvm_mmio.h index fe612a9..75ea420 100644 --- a/arch/arm64/include/asm/kvm_mmio.h +++ b/arch/arm64/include/asm/kvm_mmio.h @@ -30,6 +30,9 @@ struct kvm_decode { bool sign_extend; }; +void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); +unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); + int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, phys_addr_t fault_ipa); diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index aa2e34e..c4f26ef 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -54,6 +54,13 @@ config KVM_ARM_PMU Adds support for a virtual Performance Monitoring Unit (PMU) in virtual machines. +config KVM_NEW_VGIC + bool "New VGIC implementation" + depends on KVM + default y + ---help--- + uses the new VGIC implementation + source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 122cff4..a7a958c 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -20,10 +20,22 @@ kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o +ifeq ($(CONFIG_KVM_NEW_VGIC),y) +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v2.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v3.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o +else kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o +endif kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 4d1ac81..e9e0e6d 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -162,7 +162,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr esr |= (ESR_ELx_EC_IABT_CUR << ESR_ELx_EC_SHIFT); if (!is_iabt) - esr |= ESR_ELx_EC_DABT_LOW; + esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT; vcpu_sys_reg(vcpu, ESR_EL1) = esr | ESR_ELx_FSC_EXTABT; } diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index b9e9bb2..3725e14 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -2,10 +2,12 @@ #define _UAPI__SVM_H #define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR2 0x002 #define SVM_EXIT_READ_CR3 0x003 #define SVM_EXIT_READ_CR4 0x004 #define SVM_EXIT_READ_CR8 0x008 #define SVM_EXIT_WRITE_CR0 0x010 +#define SVM_EXIT_WRITE_CR2 0x012 #define SVM_EXIT_WRITE_CR3 0x013 #define SVM_EXIT_WRITE_CR4 0x014 #define SVM_EXIT_WRITE_CR8 0x018 @@ -80,10 +82,12 @@ #define SVM_EXIT_REASONS \ { SVM_EXIT_READ_CR0, "read_cr0" }, \ + { SVM_EXIT_READ_CR2, "read_cr2" }, \ { SVM_EXIT_READ_CR3, "read_cr3" }, \ { SVM_EXIT_READ_CR4, "read_cr4" }, \ { SVM_EXIT_READ_CR8, "read_cr8" }, \ { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ + { SVM_EXIT_WRITE_CR2, "write_cr2" }, \ { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ @@ -91,26 +95,57 @@ { SVM_EXIT_READ_DR1, "read_dr1" }, \ { SVM_EXIT_READ_DR2, "read_dr2" }, \ { SVM_EXIT_READ_DR3, "read_dr3" }, \ + { SVM_EXIT_READ_DR4, "read_dr4" }, \ + { SVM_EXIT_READ_DR5, "read_dr5" }, \ + { SVM_EXIT_READ_DR6, "read_dr6" }, \ + { SVM_EXIT_READ_DR7, "read_dr7" }, \ { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ + { SVM_EXIT_WRITE_DR4, "write_dr4" }, \ { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ + { SVM_EXIT_WRITE_DR6, "write_dr6" }, \ { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ + { SVM_EXIT_EXCP_BASE + DE_VECTOR, "DE excp" }, \ { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ + { SVM_EXIT_EXCP_BASE + OF_VECTOR, "OF excp" }, \ + { SVM_EXIT_EXCP_BASE + BR_VECTOR, "BR excp" }, \ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ - { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + DF_VECTOR, "DF excp" }, \ + { SVM_EXIT_EXCP_BASE + TS_VECTOR, "TS excp" }, \ + { SVM_EXIT_EXCP_BASE + NP_VECTOR, "NP excp" }, \ + { SVM_EXIT_EXCP_BASE + SS_VECTOR, "SS excp" }, \ + { SVM_EXIT_EXCP_BASE + GP_VECTOR, "GP excp" }, \ + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ + { SVM_EXIT_EXCP_BASE + MF_VECTOR, "MF excp" }, \ { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ + { SVM_EXIT_EXCP_BASE + XM_VECTOR, "XF excp" }, \ { SVM_EXIT_INTR, "interrupt" }, \ { SVM_EXIT_NMI, "nmi" }, \ { SVM_EXIT_SMI, "smi" }, \ { SVM_EXIT_INIT, "init" }, \ { SVM_EXIT_VINTR, "vintr" }, \ { SVM_EXIT_CR0_SEL_WRITE, "cr0_sel_write" }, \ + { SVM_EXIT_IDTR_READ, "read_idtr" }, \ + { SVM_EXIT_GDTR_READ, "read_gdtr" }, \ + { SVM_EXIT_LDTR_READ, "read_ldtr" }, \ + { SVM_EXIT_TR_READ, "read_rt" }, \ + { SVM_EXIT_IDTR_WRITE, "write_idtr" }, \ + { SVM_EXIT_GDTR_WRITE, "write_gdtr" }, \ + { SVM_EXIT_LDTR_WRITE, "write_ldtr" }, \ + { SVM_EXIT_TR_WRITE, "write_rt" }, \ + { SVM_EXIT_RDTSC, "rdtsc" }, \ + { SVM_EXIT_RDPMC, "rdpmc" }, \ + { SVM_EXIT_PUSHF, "pushf" }, \ + { SVM_EXIT_POPF, "popf" }, \ { SVM_EXIT_CPUID, "cpuid" }, \ + { SVM_EXIT_RSM, "rsm" }, \ + { SVM_EXIT_IRET, "iret" }, \ + { SVM_EXIT_SWINT, "swint" }, \ { SVM_EXIT_INVD, "invd" }, \ { SVM_EXIT_PAUSE, "pause" }, \ { SVM_EXIT_HLT, "hlt" }, \ @@ -119,6 +154,7 @@ { SVM_EXIT_IOIO, "io" }, \ { SVM_EXIT_MSR, "msr" }, \ { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ + { SVM_EXIT_FERR_FREEZE, "ferr_freeze" }, \ { SVM_EXIT_SHUTDOWN, "shutdown" }, \ { SVM_EXIT_VMRUN, "vmrun" }, \ { SVM_EXIT_VMMCALL, "hypercall" }, \ @@ -127,14 +163,16 @@ { SVM_EXIT_STGI, "stgi" }, \ { SVM_EXIT_CLGI, "clgi" }, \ { SVM_EXIT_SKINIT, "skinit" }, \ + { SVM_EXIT_RDTSCP, "rdtscp" }, \ + { SVM_EXIT_ICEBP, "icebp" }, \ { SVM_EXIT_WBINVD, "wbinvd" }, \ { SVM_EXIT_MONITOR, "monitor" }, \ { SVM_EXIT_MWAIT, "mwait" }, \ { SVM_EXIT_XSETBV, "xsetbv" }, \ { SVM_EXIT_NPF, "npf" }, \ - { SVM_EXIT_RSM, "rsm" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ - { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" } + { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \ + { SVM_EXIT_ERR, "invalid_guest_state" } #endif /* _UAPI__SVM_H */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2214214..1163e81 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -84,7 +84,7 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define TSC_RATIO_MIN 0x0000000000000001ULL #define TSC_RATIO_MAX 0x000000ffffffffffULL -#define AVIC_HPA_MASK ~((0xFFFULL << 52) || 0xFFF) +#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) /* * 0xff is broadcast, so the max index allowed for physical APIC ID @@ -3597,7 +3597,7 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) u32 icrh = svm->vmcb->control.exit_info_1 >> 32; u32 icrl = svm->vmcb->control.exit_info_1; u32 id = svm->vmcb->control.exit_info_2 >> 32; - u32 index = svm->vmcb->control.exit_info_2 && 0xFF; + u32 index = svm->vmcb->control.exit_info_2 & 0xFF; struct kvm_lapic *apic = svm->vcpu.arch.apic; trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e605d1e..fb93010 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2418,7 +2418,9 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_nested; - else if (vcpu->arch.apic_base & X2APIC_ENABLE) { + else if (cpu_has_secondary_exec_ctrls() && + (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else @@ -4787,6 +4789,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + if (cpu_has_secondary_exec_ctrls()) { + if (kvm_vcpu_apicv_active(vcpu)) + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + else + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + } + + if (cpu_has_vmx_msr_bitmap()) + vmx_set_msr_bitmap(vcpu); } static u32 vmx_exec_control(struct vcpu_vmx *vmx) @@ -6333,23 +6348,20 @@ static __init int hardware_setup(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - if (enable_apicv) { - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr); - - /* According SDM, in x2apic mode, the whole id reg is used. - * But in KVM, it only use the highest eight bits. Need to - * intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808); - /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b); - /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f); - } + for (msr = 0x800; msr <= 0x8ff; msr++) + vmx_disable_intercept_msr_read_x2apic(msr); + + /* According SDM, in x2apic mode, the whole id reg is used. But in + * KVM, it only use the highest eight bits. Need to intercept it */ + vmx_enable_intercept_msr_read_x2apic(0x802); + /* TMCCT */ + vmx_enable_intercept_msr_read_x2apic(0x839); + /* TPR */ + vmx_disable_intercept_msr_write_x2apic(0x808); + /* EOI */ + vmx_disable_intercept_msr_write_x2apic(0x80b); + /* SELF-IPI */ + vmx_disable_intercept_msr_write_x2apic(0x83f); if (enable_ept) { kvm_mmu_set_mask_ptes(0ull, diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index b651aed..dda39d8 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -24,9 +24,6 @@ #include <linux/workqueue.h> struct arch_timer_kvm { - /* Is the timer enabled */ - bool enabled; - /* Virtual offset */ cycle_t cntvoff; }; @@ -53,15 +50,15 @@ struct arch_timer_cpu { /* Timer IRQ */ struct kvm_irq_level irq; - /* VGIC mapping */ - struct irq_phys_map *map; - /* Active IRQ state caching */ bool active_cleared_last; + + /* Is the timer enabled */ + bool enabled; }; int kvm_timer_hyp_init(void); -void kvm_timer_enable(struct kvm *kvm); +int kvm_timer_enable(struct kvm_vcpu *vcpu); void kvm_timer_init(struct kvm *kvm); int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, const struct kvm_irq_level *irq); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index be6037a..da0a5248 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -19,6 +19,10 @@ #ifndef __ASM_ARM_KVM_VGIC_H #define __ASM_ARM_KVM_VGIC_H +#ifdef CONFIG_KVM_NEW_VGIC +#include <kvm/vgic/vgic.h> +#else + #include <linux/kernel.h> #include <linux/kvm.h> #include <linux/irqreturn.h> @@ -158,7 +162,6 @@ struct vgic_io_device { struct irq_phys_map { u32 virt_irq; u32 phys_irq; - u32 irq; }; struct irq_phys_map_entry { @@ -305,9 +308,6 @@ struct vgic_cpu { unsigned long *active_shared; unsigned long *pend_act_shared; - /* Number of list registers on this CPU */ - int nr_lr; - /* CPU vif control registers for world switch */ union { struct vgic_v2_cpu_if vgic_v2; @@ -342,17 +342,18 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, bool level); int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, - struct irq_phys_map *map, bool level); + unsigned int virt_irq, bool level); void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); -struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, - int virt_irq, int irq); -int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map); -bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, struct irq_phys_map *map); +int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq); +int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq); +bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus)) #define vgic_ready(k) ((k)->arch.vgic.ready) +#define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \ + ((i) < (k)->arch.vgic.nr_irqs)) int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info, const struct vgic_ops **ops, @@ -370,4 +371,5 @@ static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, } #endif +#endif /* old VGIC include */ #endif diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h new file mode 100644 index 0000000..3fbd175 --- /dev/null +++ b/include/kvm/vgic/vgic.h @@ -0,0 +1,246 @@ +/* + * Copyright (C) 2015, 2016 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef __ASM_ARM_KVM_VGIC_VGIC_H +#define __ASM_ARM_KVM_VGIC_VGIC_H + +#include <linux/kernel.h> +#include <linux/kvm.h> +#include <linux/irqreturn.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <kvm/iodev.h> + +#define VGIC_V3_MAX_CPUS 255 +#define VGIC_V2_MAX_CPUS 8 +#define VGIC_NR_IRQS_LEGACY 256 +#define VGIC_NR_SGIS 16 +#define VGIC_NR_PPIS 16 +#define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) +#define VGIC_MAX_PRIVATE (VGIC_NR_PRIVATE_IRQS - 1) +#define VGIC_MAX_SPI 1019 +#define VGIC_MAX_RESERVED 1023 +#define VGIC_MIN_LPI 8192 + +enum vgic_type { + VGIC_V2, /* Good ol' GICv2 */ + VGIC_V3, /* New fancy GICv3 */ +}; + +/* same for all guests, as depending only on the _host's_ GIC model */ +struct vgic_global { + /* type of the host GIC */ + enum vgic_type type; + + /* Physical address of vgic virtual cpu interface */ + phys_addr_t vcpu_base; + + /* virtual control interface mapping */ + void __iomem *vctrl_base; + + /* Number of implemented list registers */ + int nr_lr; + + /* Maintenance IRQ number */ + unsigned int maint_irq; + + /* maximum number of VCPUs allowed (GICv2 limits us to 8) */ + int max_gic_vcpus; + + /* Only needed for the legacy KVM_CREATE_IRQCHIP */ + bool can_emulate_gicv2; +}; + +extern struct vgic_global kvm_vgic_global_state; + +#define VGIC_V2_MAX_LRS (1 << 6) +#define VGIC_V3_MAX_LRS 16 +#define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr) + +enum vgic_irq_config { + VGIC_CONFIG_EDGE = 0, + VGIC_CONFIG_LEVEL +}; + +struct vgic_irq { + spinlock_t irq_lock; /* Protects the content of the struct */ + struct list_head ap_list; + + struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU + * SPIs and LPIs: The VCPU whose ap_list + * this is queued on. + */ + + struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should + * be sent to, as a result of the + * targets reg (v2) or the + * affinity reg (v3). + */ + + u32 intid; /* Guest visible INTID */ + bool pending; + bool line_level; /* Level only */ + bool soft_pending; /* Level only */ + bool active; /* not used for LPIs */ + bool enabled; + bool hw; /* Tied to HW IRQ */ + u32 hwintid; /* HW INTID number */ + union { + u8 targets; /* GICv2 target VCPUs mask */ + u32 mpidr; /* GICv3 target VCPU */ + }; + u8 source; /* GICv2 SGIs only */ + u8 priority; + enum vgic_irq_config config; /* Level or edge */ +}; + +struct vgic_register_region; + +struct vgic_io_device { + gpa_t base_addr; + struct kvm_vcpu *redist_vcpu; + const struct vgic_register_region *regions; + int nr_regions; + struct kvm_io_device dev; +}; + +struct vgic_dist { + bool in_kernel; + bool ready; + bool initialized; + + /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */ + u32 vgic_model; + + int nr_spis; + + /* TODO: Consider moving to global state */ + /* Virtual control interface mapping */ + void __iomem *vctrl_base; + + /* base addresses in guest physical address space: */ + gpa_t vgic_dist_base; /* distributor */ + union { + /* either a GICv2 CPU interface */ + gpa_t vgic_cpu_base; + /* or a number of GICv3 redistributor regions */ + gpa_t vgic_redist_base; + }; + + /* distributor enabled */ + bool enabled; + + struct vgic_irq *spis; + + struct vgic_io_device dist_iodev; + struct vgic_io_device *redist_iodevs; +}; + +struct vgic_v2_cpu_if { + u32 vgic_hcr; + u32 vgic_vmcr; + u32 vgic_misr; /* Saved only */ + u64 vgic_eisr; /* Saved only */ + u64 vgic_elrsr; /* Saved only */ + u32 vgic_apr; + u32 vgic_lr[VGIC_V2_MAX_LRS]; +}; + +struct vgic_v3_cpu_if { +#ifdef CONFIG_KVM_ARM_VGIC_V3 + u32 vgic_hcr; + u32 vgic_vmcr; + u32 vgic_sre; /* Restored only, change ignored */ + u32 vgic_misr; /* Saved only */ + u32 vgic_eisr; /* Saved only */ + u32 vgic_elrsr; /* Saved only */ + u32 vgic_ap0r[4]; + u32 vgic_ap1r[4]; + u64 vgic_lr[VGIC_V3_MAX_LRS]; +#endif +}; + +struct vgic_cpu { + /* CPU vif control registers for world switch */ + union { + struct vgic_v2_cpu_if vgic_v2; + struct vgic_v3_cpu_if vgic_v3; + }; + + unsigned int used_lrs; + struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS]; + + spinlock_t ap_list_lock; /* Protects the ap_list */ + + /* + * List of IRQs that this VCPU should consider because they are either + * Active or Pending (hence the name; AP list), or because they recently + * were one of the two and need to be migrated off this list to another + * VCPU. + */ + struct list_head ap_list_head; + + u64 live_lrs; +}; + +int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); +void kvm_vgic_early_init(struct kvm *kvm); +int kvm_vgic_create(struct kvm *kvm, u32 type); +void kvm_vgic_destroy(struct kvm *kvm); +void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu); +void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); +int kvm_vgic_map_resources(struct kvm *kvm); +int kvm_vgic_hyp_init(void); + +int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, + bool level); +int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid, + bool level); +int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq); +int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq); +bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); + +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); + +#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) +#define vgic_initialized(k) ((k)->arch.vgic.initialized) +#define vgic_ready(k) ((k)->arch.vgic.ready) +#define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \ + ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) + +bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); +void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); +void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); + +#ifdef CONFIG_KVM_ARM_VGIC_V3 +void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); +#else +static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) +{ +} +#endif + +/** + * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW + * + * The host's GIC naturally limits the maximum amount of VCPUs a guest + * can use. + */ +static inline int kvm_vgic_get_max_vcpus(void) +{ + return kvm_vgic_global_state.max_gic_vcpus; +} + +#endif /* __ASM_ARM_KVM_VGIC_VGIC_H */ diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 9e6fdd3..bfbd707 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -273,6 +273,12 @@ #define ICH_LR_ACTIVE_BIT (1ULL << 63) #define ICH_LR_PHYS_ID_SHIFT 32 #define ICH_LR_PHYS_ID_MASK (0x3ffULL << ICH_LR_PHYS_ID_SHIFT) +#define ICH_LR_PRIORITY_SHIFT 48 + +/* These are for GICv2 emulation only */ +#define GICH_LR_VIRTUALID (0x3ffUL << 0) +#define GICH_LR_PHYSID_CPUID_SHIFT (10) +#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) #define ICH_MISR_EOI (1 << 0) #define ICH_MISR_U (1 << 1) diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 9c94026..fd05185 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -33,6 +33,7 @@ #define GIC_DIST_CTRL 0x000 #define GIC_DIST_CTR 0x004 +#define GIC_DIST_IIDR 0x008 #define GIC_DIST_IGROUP 0x080 #define GIC_DIST_ENABLE_SET 0x100 #define GIC_DIST_ENABLE_CLEAR 0x180 @@ -76,6 +77,7 @@ #define GICH_LR_VIRTUALID (0x3ff << 0) #define GICH_LR_PHYSID_CPUID_SHIFT (10) #define GICH_LR_PHYSID_CPUID (0x3ff << GICH_LR_PHYSID_CPUID_SHIFT) +#define GICH_LR_PRIORITY_SHIFT 23 #define GICH_LR_STATE (3 << 28) #define GICH_LR_PENDING_BIT (1 << 28) #define GICH_LR_ACTIVE_BIT (1 << 29) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b1fa8f1..1c9c973 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -412,6 +412,8 @@ struct kvm { #endif long tlbs_dirty; struct list_head devices; + struct dentry *debugfs_dentry; + struct kvm_stat_data **debugfs_stat_data; }; #define kvm_err(fmt, ...) \ @@ -991,6 +993,11 @@ enum kvm_stat_kind { KVM_STAT_VCPU, }; +struct kvm_stat_data { + int offset; + struct kvm *kvm; +}; + struct kvm_stats_debugfs_item { const char *name; int offset; diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 526fb3d..f28292d 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -108,7 +108,7 @@ TRACE_EVENT(kvm_ioapic_set_irq, __entry->coalesced = coalesced; ), - TP_printk("pin %u dst %x vec=%u (%s|%s|%s%s)%s", + TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s", __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), (__entry->e & (1<<11)) ? "logical" : "physical", @@ -129,7 +129,7 @@ TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, __entry->e = e; ), - TP_printk("dst %x vec=%u (%s|%s|%s%s)", + TP_printk("dst %x vec %u (%s|%s|%s%s)", (u8)(__entry->e >> 56), (u8)__entry->e, __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), (__entry->e & (1<<11)) ? "logical" : "physical", @@ -151,7 +151,7 @@ TRACE_EVENT(kvm_msi_set_irq, __entry->data = data; ), - TP_printk("dst %u vec %x (%s|%s|%s%s)", + TP_printk("dst %u vec %u (%s|%s|%s%s)", (u8)(__entry->address >> 12), (u8)__entry->data, __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), (__entry->address & (1<<2)) ? "logical" : "physical", diff --git a/tools/Makefile b/tools/Makefile index 6bf68fe..f10b64d8 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -16,6 +16,7 @@ help: @echo ' gpio - GPIO tools' @echo ' hv - tools used when in Hyper-V clients' @echo ' iio - IIO tools' + @echo ' kvm_stat - top-like utility for displaying kvm statistics' @echo ' lguest - a minimal 32-bit x86 hypervisor' @echo ' net - misc networking tools' @echo ' perf - Linux performance measurement and analysis tool' @@ -110,10 +111,13 @@ tmon_install: freefall_install: $(call descend,laptop/$(@:_install=),install) +kvm_stat_install: + $(call descend,kvm/$(@:_install=),install) + install: acpi_install cgroup_install cpupower_install hv_install firewire_install lguest_install \ perf_install selftests_install turbostat_install usb_install \ virtio_install vm_install net_install x86_energy_perf_policy_install \ - tmon_install freefall_install objtool_install + tmon_install freefall_install objtool_install kvm_stat_install acpi_clean: $(call descend,power/acpi,clean) diff --git a/tools/kvm/kvm_stat/Makefile b/tools/kvm/kvm_stat/Makefile new file mode 100644 index 0000000..5b1cba5 --- /dev/null +++ b/tools/kvm/kvm_stat/Makefile @@ -0,0 +1,41 @@ +include ../../scripts/Makefile.include +include ../../scripts/utilities.mak +BINDIR=usr/bin +MANDIR=usr/share/man +MAN1DIR=$(MANDIR)/man1 + +MAN1=kvm_stat.1 + +A2X=a2x +a2x_path := $(call get-executable,$(A2X)) + +all: man + +ifneq ($(findstring $(MAKEFLAGS),s),s) + ifneq ($(V),1) + QUIET_A2X = @echo ' A2X '$@; + endif +endif + +%.1: %.txt +ifeq ($(a2x_path),) + $(error "You need to install asciidoc for man pages") +else + $(QUIET_A2X)$(A2X) --doctype manpage --format manpage $< +endif + +clean: + rm -f $(MAN1) + +man: $(MAN1) + +install-man: man + install -d -m 755 $(INSTALL_ROOT)/$(MAN1DIR) + install -m 644 kvm_stat.1 $(INSTALL_ROOT)/$(MAN1DIR) + +install-tools: + install -d -m 755 $(INSTALL_ROOT)/$(BINDIR) + install -m 755 -p "kvm_stat" "$(INSTALL_ROOT)/$(BINDIR)/$(TARGET)" + +install: install-tools install-man +.PHONY: all clean man install-tools install-man install diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat new file mode 100755 index 0000000..581278c --- /dev/null +++ b/tools/kvm/kvm_stat/kvm_stat @@ -0,0 +1,1127 @@ +#!/usr/bin/python +# +# top-like utility for displaying kvm statistics +# +# Copyright 2006-2008 Qumranet Technologies +# Copyright 2008-2011 Red Hat, Inc. +# +# Authors: +# Avi Kivity <avi@redhat.com> +# +# This work is licensed under the terms of the GNU GPL, version 2. See +# the COPYING file in the top-level directory. +"""The kvm_stat module outputs statistics about running KVM VMs + +Three different ways of output formatting are available: +- as a top-like text ui +- in a key -> value format +- in an all keys, all values format + +The data is sampled from the KVM's debugfs entries and its perf events. +""" + +import curses +import sys +import os +import time +import optparse +import ctypes +import fcntl +import resource +import struct +import re +from collections import defaultdict +from time import sleep + +VMX_EXIT_REASONS = { + 'EXCEPTION_NMI': 0, + 'EXTERNAL_INTERRUPT': 1, + 'TRIPLE_FAULT': 2, + 'PENDING_INTERRUPT': 7, + 'NMI_WINDOW': 8, + 'TASK_SWITCH': 9, + 'CPUID': 10, + 'HLT': 12, + 'INVLPG': 14, + 'RDPMC': 15, + 'RDTSC': 16, + 'VMCALL': 18, + 'VMCLEAR': 19, + 'VMLAUNCH': 20, + 'VMPTRLD': 21, + 'VMPTRST': 22, + 'VMREAD': 23, + 'VMRESUME': 24, + 'VMWRITE': 25, + 'VMOFF': 26, + 'VMON': 27, + 'CR_ACCESS': 28, + 'DR_ACCESS': 29, + 'IO_INSTRUCTION': 30, + 'MSR_READ': 31, + 'MSR_WRITE': 32, + 'INVALID_STATE': 33, + 'MWAIT_INSTRUCTION': 36, + 'MONITOR_INSTRUCTION': 39, + 'PAUSE_INSTRUCTION': 40, + 'MCE_DURING_VMENTRY': 41, + 'TPR_BELOW_THRESHOLD': 43, + 'APIC_ACCESS': 44, + 'EPT_VIOLATION': 48, + 'EPT_MISCONFIG': 49, + 'WBINVD': 54, + 'XSETBV': 55, + 'APIC_WRITE': 56, + 'INVPCID': 58, +} + +SVM_EXIT_REASONS = { + 'READ_CR0': 0x000, + 'READ_CR3': 0x003, + 'READ_CR4': 0x004, + 'READ_CR8': 0x008, + 'WRITE_CR0': 0x010, + 'WRITE_CR3': 0x013, + 'WRITE_CR4': 0x014, + 'WRITE_CR8': 0x018, + 'READ_DR0': 0x020, + 'READ_DR1': 0x021, + 'READ_DR2': 0x022, + 'READ_DR3': 0x023, + 'READ_DR4': 0x024, + 'READ_DR5': 0x025, + 'READ_DR6': 0x026, + 'READ_DR7': 0x027, + 'WRITE_DR0': 0x030, + 'WRITE_DR1': 0x031, + 'WRITE_DR2': 0x032, + 'WRITE_DR3': 0x033, + 'WRITE_DR4': 0x034, + 'WRITE_DR5': 0x035, + 'WRITE_DR6': 0x036, + 'WRITE_DR7': 0x037, + 'EXCP_BASE': 0x040, + 'INTR': 0x060, + 'NMI': 0x061, + 'SMI': 0x062, + 'INIT': 0x063, + 'VINTR': 0x064, + 'CR0_SEL_WRITE': 0x065, + 'IDTR_READ': 0x066, + 'GDTR_READ': 0x067, + 'LDTR_READ': 0x068, + 'TR_READ': 0x069, + 'IDTR_WRITE': 0x06a, + 'GDTR_WRITE': 0x06b, + 'LDTR_WRITE': 0x06c, + 'TR_WRITE': 0x06d, + 'RDTSC': 0x06e, + 'RDPMC': 0x06f, + 'PUSHF': 0x070, + 'POPF': 0x071, + 'CPUID': 0x072, + 'RSM': 0x073, + 'IRET': 0x074, + 'SWINT': 0x075, + 'INVD': 0x076, + 'PAUSE': 0x077, + 'HLT': 0x078, + 'INVLPG': 0x079, + 'INVLPGA': 0x07a, + 'IOIO': 0x07b, + 'MSR': 0x07c, + 'TASK_SWITCH': 0x07d, + 'FERR_FREEZE': 0x07e, + 'SHUTDOWN': 0x07f, + 'VMRUN': 0x080, + 'VMMCALL': 0x081, + 'VMLOAD': 0x082, + 'VMSAVE': 0x083, + 'STGI': 0x084, + 'CLGI': 0x085, + 'SKINIT': 0x086, + 'RDTSCP': 0x087, + 'ICEBP': 0x088, + 'WBINVD': 0x089, + 'MONITOR': 0x08a, + 'MWAIT': 0x08b, + 'MWAIT_COND': 0x08c, + 'XSETBV': 0x08d, + 'NPF': 0x400, +} + +# EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h) +AARCH64_EXIT_REASONS = { + 'UNKNOWN': 0x00, + 'WFI': 0x01, + 'CP15_32': 0x03, + 'CP15_64': 0x04, + 'CP14_MR': 0x05, + 'CP14_LS': 0x06, + 'FP_ASIMD': 0x07, + 'CP10_ID': 0x08, + 'CP14_64': 0x0C, + 'ILL_ISS': 0x0E, + 'SVC32': 0x11, + 'HVC32': 0x12, + 'SMC32': 0x13, + 'SVC64': 0x15, + 'HVC64': 0x16, + 'SMC64': 0x17, + 'SYS64': 0x18, + 'IABT': 0x20, + 'IABT_HYP': 0x21, + 'PC_ALIGN': 0x22, + 'DABT': 0x24, + 'DABT_HYP': 0x25, + 'SP_ALIGN': 0x26, + 'FP_EXC32': 0x28, + 'FP_EXC64': 0x2C, + 'SERROR': 0x2F, + 'BREAKPT': 0x30, + 'BREAKPT_HYP': 0x31, + 'SOFTSTP': 0x32, + 'SOFTSTP_HYP': 0x33, + 'WATCHPT': 0x34, + 'WATCHPT_HYP': 0x35, + 'BKPT32': 0x38, + 'VECTOR32': 0x3A, + 'BRK64': 0x3C, +} + +# From include/uapi/linux/kvm.h, KVM_EXIT_xxx +USERSPACE_EXIT_REASONS = { + 'UNKNOWN': 0, + 'EXCEPTION': 1, + 'IO': 2, + 'HYPERCALL': 3, + 'DEBUG': 4, + 'HLT': 5, + 'MMIO': 6, + 'IRQ_WINDOW_OPEN': 7, + 'SHUTDOWN': 8, + 'FAIL_ENTRY': 9, + 'INTR': 10, + 'SET_TPR': 11, + 'TPR_ACCESS': 12, + 'S390_SIEIC': 13, + 'S390_RESET': 14, + 'DCR': 15, + 'NMI': 16, + 'INTERNAL_ERROR': 17, + 'OSI': 18, + 'PAPR_HCALL': 19, + 'S390_UCONTROL': 20, + 'WATCHDOG': 21, + 'S390_TSCH': 22, + 'EPR': 23, + 'SYSTEM_EVENT': 24, +} + +IOCTL_NUMBERS = { + 'SET_FILTER': 0x40082406, + 'ENABLE': 0x00002400, + 'DISABLE': 0x00002401, + 'RESET': 0x00002403, +} + +class Arch(object): + """Encapsulates global architecture specific data. + + Contains the performance event open syscall and ioctl numbers, as + well as the VM exit reasons for the architecture it runs on. + + """ + @staticmethod + def get_arch(): + machine = os.uname()[4] + + if machine.startswith('ppc'): + return ArchPPC() + elif machine.startswith('aarch64'): + return ArchA64() + elif machine.startswith('s390'): + return ArchS390() + else: + # X86_64 + for line in open('/proc/cpuinfo'): + if not line.startswith('flags'): + continue + + flags = line.split() + if 'vmx' in flags: + return ArchX86(VMX_EXIT_REASONS) + if 'svm' in flags: + return ArchX86(SVM_EXIT_REASONS) + return + +class ArchX86(Arch): + def __init__(self, exit_reasons): + self.sc_perf_evt_open = 298 + self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reasons = exit_reasons + +class ArchPPC(Arch): + def __init__(self): + self.sc_perf_evt_open = 319 + self.ioctl_numbers = IOCTL_NUMBERS + self.ioctl_numbers['ENABLE'] = 0x20002400 + self.ioctl_numbers['DISABLE'] = 0x20002401 + self.ioctl_numbers['RESET'] = 0x20002403 + + # PPC comes in 32 and 64 bit and some generated ioctl + # numbers depend on the wordsize. + char_ptr_size = ctypes.sizeof(ctypes.c_char_p) + self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16 + self.exit_reasons = {} + +class ArchA64(Arch): + def __init__(self): + self.sc_perf_evt_open = 241 + self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reasons = AARCH64_EXIT_REASONS + +class ArchS390(Arch): + def __init__(self): + self.sc_perf_evt_open = 331 + self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reasons = None + +ARCH = Arch.get_arch() + + +def walkdir(path): + """Returns os.walk() data for specified directory. + + As it is only a wrapper it returns the same 3-tuple of (dirpath, + dirnames, filenames). + """ + return next(os.walk(path)) + + +def parse_int_list(list_string): + """Returns an int list from a string of comma separated integers and + integer ranges.""" + integers = [] + members = list_string.split(',') + + for member in members: + if '-' not in member: + integers.append(int(member)) + else: + int_range = member.split('-') + integers.extend(range(int(int_range[0]), + int(int_range[1]) + 1)) + + return integers + + +def get_online_cpus(): + """Returns a list of cpu id integers.""" + with open('/sys/devices/system/cpu/online') as cpu_list: + cpu_string = cpu_list.readline() + return parse_int_list(cpu_string) + + +def get_filters(): + """Returns a dict of trace events, their filter ids and + the values that can be filtered. + + Trace events can be filtered for special values by setting a + filter string via an ioctl. The string normally has the format + identifier==value. For each filter a new event will be created, to + be able to distinguish the events. + + """ + filters = {} + filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS) + if ARCH.exit_reasons: + filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons) + return filters + +libc = ctypes.CDLL('libc.so.6', use_errno=True) +syscall = libc.syscall + +class perf_event_attr(ctypes.Structure): + """Struct that holds the necessary data to set up a trace event. + + For an extensive explanation see perf_event_open(2) and + include/uapi/linux/perf_event.h, struct perf_event_attr + + All fields that are not initialized in the constructor are 0. + + """ + _fields_ = [('type', ctypes.c_uint32), + ('size', ctypes.c_uint32), + ('config', ctypes.c_uint64), + ('sample_freq', ctypes.c_uint64), + ('sample_type', ctypes.c_uint64), + ('read_format', ctypes.c_uint64), + ('flags', ctypes.c_uint64), + ('wakeup_events', ctypes.c_uint32), + ('bp_type', ctypes.c_uint32), + ('bp_addr', ctypes.c_uint64), + ('bp_len', ctypes.c_uint64), + ] + + def __init__(self): + super(self.__class__, self).__init__() + self.type = PERF_TYPE_TRACEPOINT + self.size = ctypes.sizeof(self) + self.read_format = PERF_FORMAT_GROUP + +def perf_event_open(attr, pid, cpu, group_fd, flags): + """Wrapper for the sys_perf_evt_open() syscall. + + Used to set up performance events, returns a file descriptor or -1 + on error. + + Attributes are: + - syscall number + - struct perf_event_attr * + - pid or -1 to monitor all pids + - cpu number or -1 to monitor all cpus + - The file descriptor of the group leader or -1 to create a group. + - flags + + """ + return syscall(ARCH.sc_perf_evt_open, ctypes.pointer(attr), + ctypes.c_int(pid), ctypes.c_int(cpu), + ctypes.c_int(group_fd), ctypes.c_long(flags)) + +PERF_TYPE_TRACEPOINT = 2 +PERF_FORMAT_GROUP = 1 << 3 + +PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing' +PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm' + +class Group(object): + """Represents a perf event group.""" + + def __init__(self): + self.events = [] + + def add_event(self, event): + self.events.append(event) + + def read(self): + """Returns a dict with 'event name: value' for all events in the + group. + + Values are read by reading from the file descriptor of the + event that is the group leader. See perf_event_open(2) for + details. + + Read format for the used event configuration is: + struct read_format { + u64 nr; /* The number of events */ + struct { + u64 value; /* The value of the event */ + } values[nr]; + }; + + """ + length = 8 * (1 + len(self.events)) + read_format = 'xxxxxxxx' + 'Q' * len(self.events) + return dict(zip([event.name for event in self.events], + struct.unpack(read_format, + os.read(self.events[0].fd, length)))) + +class Event(object): + """Represents a performance event and manages its life cycle.""" + def __init__(self, name, group, trace_cpu, trace_pid, trace_point, + trace_filter, trace_set='kvm'): + self.name = name + self.fd = None + self.setup_event(group, trace_cpu, trace_pid, trace_point, + trace_filter, trace_set) + + def __del__(self): + """Closes the event's file descriptor. + + As no python file object was created for the file descriptor, + python will not reference count the descriptor and will not + close it itself automatically, so we do it. + + """ + if self.fd: + os.close(self.fd) + + def setup_event_attribute(self, trace_set, trace_point): + """Returns an initialized ctype perf_event_attr struct.""" + + id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set, + trace_point, 'id') + + event_attr = perf_event_attr() + event_attr.config = int(open(id_path).read()) + return event_attr + + def setup_event(self, group, trace_cpu, trace_pid, trace_point, + trace_filter, trace_set): + """Sets up the perf event in Linux. + + Issues the syscall to register the event in the kernel and + then sets the optional filter. + + """ + + event_attr = self.setup_event_attribute(trace_set, trace_point) + + # First event will be group leader. + group_leader = -1 + + # All others have to pass the leader's descriptor instead. + if group.events: + group_leader = group.events[0].fd + + fd = perf_event_open(event_attr, trace_pid, + trace_cpu, group_leader, 0) + if fd == -1: + err = ctypes.get_errno() + raise OSError(err, os.strerror(err), + 'while calling sys_perf_event_open().') + + if trace_filter: + fcntl.ioctl(fd, ARCH.ioctl_numbers['SET_FILTER'], + trace_filter) + + self.fd = fd + + def enable(self): + """Enables the trace event in the kernel. + + Enabling the group leader makes reading counters from it and the + events under it possible. + + """ + fcntl.ioctl(self.fd, ARCH.ioctl_numbers['ENABLE'], 0) + + def disable(self): + """Disables the trace event in the kernel. + + Disabling the group leader makes reading all counters under it + impossible. + + """ + fcntl.ioctl(self.fd, ARCH.ioctl_numbers['DISABLE'], 0) + + def reset(self): + """Resets the count of the trace event in the kernel.""" + fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0) + +class TracepointProvider(object): + """Data provider for the stats class. + + Manages the events/groups from which it acquires its data. + + """ + def __init__(self): + self.group_leaders = [] + self.filters = get_filters() + self._fields = self.get_available_fields() + self._pid = 0 + + def get_available_fields(self): + """Returns a list of available event's of format 'event name(filter + name)'. + + All available events have directories under + /sys/kernel/debug/tracing/events/ which export information + about the specific event. Therefore, listing the dirs gives us + a list of all available events. + + Some events like the vm exit reasons can be filtered for + specific values. To take account for that, the routine below + creates special fields with the following format: + event name(filter name) + + """ + path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm') + fields = walkdir(path)[1] + extra = [] + for field in fields: + if field in self.filters: + filter_name_, filter_dicts = self.filters[field] + for name in filter_dicts: + extra.append(field + '(' + name + ')') + fields += extra + return fields + + def setup_traces(self): + """Creates all event and group objects needed to be able to retrieve + data.""" + if self._pid > 0: + # Fetch list of all threads of the monitored pid, as qemu + # starts a thread for each vcpu. + path = os.path.join('/proc', str(self._pid), 'task') + groupids = walkdir(path)[1] + else: + groupids = get_online_cpus() + + # The constant is needed as a buffer for python libs, std + # streams and other files that the script opens. + newlim = len(groupids) * len(self._fields) + 50 + try: + softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE) + + if hardlim < newlim: + # Now we need CAP_SYS_RESOURCE, to increase the hard limit. + resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, newlim)) + else: + # Raising the soft limit is sufficient. + resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, hardlim)) + + except ValueError: + sys.exit("NOFILE rlimit could not be raised to {0}".format(newlim)) + + for groupid in groupids: + group = Group() + for name in self._fields: + tracepoint = name + tracefilter = None + match = re.match(r'(.*)\((.*)\)', name) + if match: + tracepoint, sub = match.groups() + tracefilter = ('%s==%d\0' % + (self.filters[tracepoint][0], + self.filters[tracepoint][1][sub])) + + # From perf_event_open(2): + # pid > 0 and cpu == -1 + # This measures the specified process/thread on any CPU. + # + # pid == -1 and cpu >= 0 + # This measures all processes/threads on the specified CPU. + trace_cpu = groupid if self._pid == 0 else -1 + trace_pid = int(groupid) if self._pid != 0 else -1 + + group.add_event(Event(name=name, + group=group, + trace_cpu=trace_cpu, + trace_pid=trace_pid, + trace_point=tracepoint, + trace_filter=tracefilter)) + + self.group_leaders.append(group) + + def available_fields(self): + return self.get_available_fields() + + @property + def fields(self): + return self._fields + + @fields.setter + def fields(self, fields): + """Enables/disables the (un)wanted events""" + self._fields = fields + for group in self.group_leaders: + for index, event in enumerate(group.events): + if event.name in fields: + event.reset() + event.enable() + else: + # Do not disable the group leader. + # It would disable all of its events. + if index != 0: + event.disable() + + @property + def pid(self): + return self._pid + + @pid.setter + def pid(self, pid): + """Changes the monitored pid by setting new traces.""" + self._pid = pid + # The garbage collector will get rid of all Event/Group + # objects and open files after removing the references. + self.group_leaders = [] + self.setup_traces() + self.fields = self._fields + + def read(self): + """Returns 'event name: current value' for all enabled events.""" + ret = defaultdict(int) + for group in self.group_leaders: + for name, val in group.read().iteritems(): + if name in self._fields: + ret[name] += val + return ret + +class DebugfsProvider(object): + """Provides data from the files that KVM creates in the kvm debugfs + folder.""" + def __init__(self): + self._fields = self.get_available_fields() + self._pid = 0 + self.do_read = True + + def get_available_fields(self): + """"Returns a list of available fields. + + The fields are all available KVM debugfs files + + """ + return walkdir(PATH_DEBUGFS_KVM)[2] + + @property + def fields(self): + return self._fields + + @fields.setter + def fields(self, fields): + self._fields = fields + + @property + def pid(self): + return self._pid + + @pid.setter + def pid(self, pid): + if pid != 0: + self._pid = pid + + vms = walkdir(PATH_DEBUGFS_KVM)[1] + if len(vms) == 0: + self.do_read = False + + self.paths = filter(lambda x: "{}-".format(pid) in x, vms) + + else: + self.paths = [''] + self.do_read = True + + def read(self): + """Returns a dict with format:'file name / field -> current value'.""" + results = {} + + # If no debugfs filtering support is available, then don't read. + if not self.do_read: + return results + + for path in self.paths: + for field in self._fields: + results[field] = results.get(field, 0) \ + + self.read_field(field, path) + + return results + + def read_field(self, field, path): + """Returns the value of a single field from a specific VM.""" + try: + return int(open(os.path.join(PATH_DEBUGFS_KVM, + path, + field)) + .read()) + except IOError: + return 0 + +class Stats(object): + """Manages the data providers and the data they provide. + + It is used to set filters on the provider's data and collect all + provider data. + + """ + def __init__(self, providers, pid, fields=None): + self.providers = providers + self._pid_filter = pid + self._fields_filter = fields + self.values = {} + self.update_provider_pid() + self.update_provider_filters() + + def update_provider_filters(self): + """Propagates fields filters to providers.""" + def wanted(key): + if not self._fields_filter: + return True + return re.match(self._fields_filter, key) is not None + + # As we reset the counters when updating the fields we can + # also clear the cache of old values. + self.values = {} + for provider in self.providers: + provider_fields = [key for key in provider.get_available_fields() + if wanted(key)] + provider.fields = provider_fields + + def update_provider_pid(self): + """Propagates pid filters to providers.""" + for provider in self.providers: + provider.pid = self._pid_filter + + @property + def fields_filter(self): + return self._fields_filter + + @fields_filter.setter + def fields_filter(self, fields_filter): + self._fields_filter = fields_filter + self.update_provider_filters() + + @property + def pid_filter(self): + return self._pid_filter + + @pid_filter.setter + def pid_filter(self, pid): + self._pid_filter = pid + self.values = {} + self.update_provider_pid() + + def get(self): + """Returns a dict with field -> (value, delta to last value) of all + provider data.""" + for provider in self.providers: + new = provider.read() + for key in provider.fields: + oldval = self.values.get(key, (0, 0)) + newval = new.get(key, 0) + newdelta = None + if oldval is not None: + newdelta = newval - oldval[0] + self.values[key] = (newval, newdelta) + return self.values + +LABEL_WIDTH = 40 +NUMBER_WIDTH = 10 + +class Tui(object): + """Instruments curses to draw a nice text ui.""" + def __init__(self, stats): + self.stats = stats + self.screen = None + self.drilldown = False + self.update_drilldown() + + def __enter__(self): + """Initialises curses for later use. Based on curses.wrapper + implementation from the Python standard library.""" + self.screen = curses.initscr() + curses.noecho() + curses.cbreak() + + # The try/catch works around a minor bit of + # over-conscientiousness in the curses module, the error + # return from C start_color() is ignorable. + try: + curses.start_color() + except: + pass + + curses.use_default_colors() + return self + + def __exit__(self, *exception): + """Resets the terminal to its normal state. Based on curses.wrappre + implementation from the Python standard library.""" + if self.screen: + self.screen.keypad(0) + curses.echo() + curses.nocbreak() + curses.endwin() + + def update_drilldown(self): + """Sets or removes a filter that only allows fields without braces.""" + if not self.stats.fields_filter: + self.stats.fields_filter = r'^[^\(]*$' + + elif self.stats.fields_filter == r'^[^\(]*$': + self.stats.fields_filter = None + + def update_pid(self, pid): + """Propagates pid selection to stats object.""" + self.stats.pid_filter = pid + + def refresh(self, sleeptime): + """Refreshes on-screen data.""" + self.screen.erase() + if self.stats.pid_filter > 0: + self.screen.addstr(0, 0, 'kvm statistics - pid {0}' + .format(self.stats.pid_filter), + curses.A_BOLD) + else: + self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) + self.screen.addstr(2, 1, 'Event') + self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH - + len('Total'), 'Total') + self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 - + len('Current'), 'Current') + row = 3 + stats = self.stats.get() + def sortkey(x): + if stats[x][1]: + return (-stats[x][1], -stats[x][0]) + else: + return (0, -stats[x][0]) + for key in sorted(stats.keys(), key=sortkey): + + if row >= self.screen.getmaxyx()[0]: + break + values = stats[key] + if not values[0] and not values[1]: + break + col = 1 + self.screen.addstr(row, col, key) + col += LABEL_WIDTH + self.screen.addstr(row, col, '%10d' % (values[0],)) + col += NUMBER_WIDTH + if values[1] is not None: + self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,)) + row += 1 + self.screen.refresh() + + def show_filter_selection(self): + """Draws filter selection mask. + + Asks for a valid regex and sets the fields filter accordingly. + + """ + while True: + self.screen.erase() + self.screen.addstr(0, 0, + "Show statistics for events matching a regex.", + curses.A_BOLD) + self.screen.addstr(2, 0, + "Current regex: {0}" + .format(self.stats.fields_filter)) + self.screen.addstr(3, 0, "New regex: ") + curses.echo() + regex = self.screen.getstr() + curses.noecho() + if len(regex) == 0: + return + try: + re.compile(regex) + self.stats.fields_filter = regex + return + except re.error: + continue + + def show_vm_selection(self): + """Draws PID selection mask. + + Asks for a pid until a valid pid or 0 has been entered. + + """ + while True: + self.screen.erase() + self.screen.addstr(0, 0, + 'Show statistics for specific pid.', + curses.A_BOLD) + self.screen.addstr(1, 0, + 'This might limit the shown data to the trace ' + 'statistics.') + + curses.echo() + self.screen.addstr(3, 0, "Pid [0 or pid]: ") + pid = self.screen.getstr() + curses.noecho() + + try: + pid = int(pid) + + if pid == 0: + self.update_pid(pid) + break + else: + if not os.path.isdir(os.path.join('/proc/', str(pid))): + continue + else: + self.update_pid(pid) + break + + except ValueError: + continue + + def show_stats(self): + """Refreshes the screen and processes user input.""" + sleeptime = 0.25 + while True: + self.refresh(sleeptime) + curses.halfdelay(int(sleeptime * 10)) + sleeptime = 3 + try: + char = self.screen.getkey() + if char == 'x': + self.drilldown = not self.drilldown + self.update_drilldown() + if char == 'q': + break + if char == 'f': + self.show_filter_selection() + if char == 'p': + self.show_vm_selection() + except KeyboardInterrupt: + break + except curses.error: + continue + +def batch(stats): + """Prints statistics in a key, value format.""" + s = stats.get() + time.sleep(1) + s = stats.get() + for key in sorted(s.keys()): + values = s[key] + print '%-42s%10d%10d' % (key, values[0], values[1]) + +def log(stats): + """Prints statistics as reiterating key block, multiple value blocks.""" + keys = sorted(stats.get().iterkeys()) + def banner(): + for k in keys: + print '%s' % k, + print + def statline(): + s = stats.get() + for k in keys: + print ' %9d' % s[k][1], + print + line = 0 + banner_repeat = 20 + while True: + time.sleep(1) + if line % banner_repeat == 0: + banner() + statline() + line += 1 + +def get_options(): + """Returns processed program arguments.""" + description_text = """ +This script displays various statistics about VMs running under KVM. +The statistics are gathered from the KVM debugfs entries and / or the +currently available perf traces. + +The monitoring takes additional cpu cycles and might affect the VM's +performance. + +Requirements: +- Access to: + /sys/kernel/debug/kvm + /sys/kernel/debug/trace/events/* + /proc/pid/task +- /proc/sys/kernel/perf_event_paranoid < 1 if user has no + CAP_SYS_ADMIN and perf events are used. +- CAP_SYS_RESOURCE if the hard limit is not high enough to allow + the large number of files that are possibly opened. +""" + + class PlainHelpFormatter(optparse.IndentedHelpFormatter): + def format_description(self, description): + if description: + return description + "\n" + else: + return "" + + optparser = optparse.OptionParser(description=description_text, + formatter=PlainHelpFormatter()) + optparser.add_option('-1', '--once', '--batch', + action='store_true', + default=False, + dest='once', + help='run in batch mode for one second', + ) + optparser.add_option('-l', '--log', + action='store_true', + default=False, + dest='log', + help='run in logging mode (like vmstat)', + ) + optparser.add_option('-t', '--tracepoints', + action='store_true', + default=False, + dest='tracepoints', + help='retrieve statistics from tracepoints', + ) + optparser.add_option('-d', '--debugfs', + action='store_true', + default=False, + dest='debugfs', + help='retrieve statistics from debugfs', + ) + optparser.add_option('-f', '--fields', + action='store', + default=None, + dest='fields', + help='fields to display (regex)', + ) + optparser.add_option('-p', '--pid', + action='store', + default=0, + type=int, + dest='pid', + help='restrict statistics to pid', + ) + (options, _) = optparser.parse_args(sys.argv) + return options + +def get_providers(options): + """Returns a list of data providers depending on the passed options.""" + providers = [] + + if options.tracepoints: + providers.append(TracepointProvider()) + if options.debugfs: + providers.append(DebugfsProvider()) + if len(providers) == 0: + providers.append(TracepointProvider()) + + return providers + +def check_access(options): + """Exits if the current user can't access all needed directories.""" + if not os.path.exists('/sys/kernel/debug'): + sys.stderr.write('Please enable CONFIG_DEBUG_FS in your kernel.') + sys.exit(1) + + if not os.path.exists(PATH_DEBUGFS_KVM): + sys.stderr.write("Please make sure, that debugfs is mounted and " + "readable by the current user:\n" + "('mount -t debugfs debugfs /sys/kernel/debug')\n" + "Also ensure, that the kvm modules are loaded.\n") + sys.exit(1) + + if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints + or not options.debugfs): + sys.stderr.write("Please enable CONFIG_TRACING in your kernel " + "when using the option -t (default).\n" + "If it is enabled, make {0} readable by the " + "current user.\n" + .format(PATH_DEBUGFS_TRACING)) + if options.tracepoints: + sys.exit(1) + + sys.stderr.write("Falling back to debugfs statistics!\n") + options.debugfs = True + sleep(5) + + return options + +def main(): + options = get_options() + options = check_access(options) + + if (options.pid > 0 and + not os.path.isdir(os.path.join('/proc/', + str(options.pid)))): + sys.stderr.write('Did you use a (unsupported) tid instead of a pid?\n') + sys.exit('Specified pid does not exist.') + + providers = get_providers(options) + stats = Stats(providers, options.pid, fields=options.fields) + + if options.log: + log(stats) + elif not options.once: + with Tui(stats) as tui: + tui.show_stats() + else: + batch(stats) + +if __name__ == "__main__": + main() diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt new file mode 100644 index 0000000..b92a153 --- /dev/null +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -0,0 +1,63 @@ +kvm_stat(1) +=========== + +NAME +---- +kvm_stat - Report KVM kernel module event counters + +SYNOPSIS +-------- +[verse] +'kvm_stat' [OPTION]... + +DESCRIPTION +----------- +kvm_stat prints counts of KVM kernel module trace events. These events signify +state transitions such as guest mode entry and exit. + +This tool is useful for observing guest behavior from the host perspective. +Often conclusions about performance or buggy behavior can be drawn from the +output. + +The set of KVM kernel module trace events may be specific to the kernel version +or architecture. It is best to check the KVM kernel module source code for the +meaning of events. + +OPTIONS +------- +-1:: +--once:: +--batch:: + run in batch mode for one second + +-l:: +--log:: + run in logging mode (like vmstat) + +-t:: +--tracepoints:: + retrieve statistics from tracepoints + +-d:: +--debugfs:: + retrieve statistics from debugfs + +-p<pid>:: +--pid=<pid>:: + limit statistics to one virtual machine (pid) + +-f<fields>:: +--fields=<fields>:: + fields to display (regex) + +-h:: +--help:: + show help message + +SEE ALSO +-------- +'perf'(1), 'trace-cmd'(1) + +AUTHOR +------ +Stefan Hajnoczi <stefanha@redhat.com> diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 409db33..e2d5b6f 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -20,6 +20,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <clocksource/arm_arch_timer.h> #include <asm/arch_timer.h> @@ -174,10 +175,10 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) timer->active_cleared_last = false; timer->irq.level = new_level; - trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->map->virt_irq, + trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->irq.irq, timer->irq.level); ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, - timer->map, + timer->irq.irq, timer->irq.level); WARN_ON(ret); } @@ -196,7 +197,7 @@ static int kvm_timer_update_state(struct kvm_vcpu *vcpu) * because the guest would never see the interrupt. Instead wait * until we call this function from kvm_timer_flush_hwstate. */ - if (!vgic_initialized(vcpu->kvm)) + if (!vgic_initialized(vcpu->kvm) || !timer->enabled) return -ENODEV; if (kvm_timer_should_fire(vcpu) != timer->irq.level) @@ -274,10 +275,8 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) * to ensure that hardware interrupts from the timer triggers a guest * exit. */ - if (timer->irq.level || kvm_vgic_map_is_active(vcpu, timer->map)) - phys_active = true; - else - phys_active = false; + phys_active = timer->irq.level || + kvm_vgic_map_is_active(vcpu, timer->irq.irq); /* * We want to avoid hitting the (re)distributor as much as @@ -302,7 +301,7 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) if (timer->active_cleared_last && !phys_active) return; - ret = irq_set_irqchip_state(timer->map->irq, + ret = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, phys_active); WARN_ON(ret); @@ -334,7 +333,6 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, const struct kvm_irq_level *irq) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct irq_phys_map *map; /* * The vcpu timer irq number cannot be determined in @@ -353,15 +351,6 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, timer->cntv_ctl = 0; kvm_timer_update_state(vcpu); - /* - * Tell the VGIC that the virtual interrupt is tied to a - * physical interrupt. We do that once per VCPU. - */ - map = kvm_vgic_map_phys_irq(vcpu, irq->irq, host_vtimer_irq); - if (WARN_ON(IS_ERR(map))) - return PTR_ERR(map); - - timer->map = map; return 0; } @@ -487,14 +476,43 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; timer_disarm(timer); - if (timer->map) - kvm_vgic_unmap_phys_irq(vcpu, timer->map); + kvm_vgic_unmap_phys_irq(vcpu, timer->irq.irq); } -void kvm_timer_enable(struct kvm *kvm) +int kvm_timer_enable(struct kvm_vcpu *vcpu) { - if (kvm->arch.timer.enabled) - return; + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct irq_desc *desc; + struct irq_data *data; + int phys_irq; + int ret; + + if (timer->enabled) + return 0; + + /* + * Find the physical IRQ number corresponding to the host_vtimer_irq + */ + desc = irq_to_desc(host_vtimer_irq); + if (!desc) { + kvm_err("%s: no interrupt descriptor\n", __func__); + return -EINVAL; + } + + data = irq_desc_get_irq_data(desc); + while (data->parent_data) + data = data->parent_data; + + phys_irq = data->hwirq; + + /* + * Tell the VGIC that the virtual interrupt is tied to a + * physical interrupt. We do that once per VCPU. + */ + ret = kvm_vgic_map_phys_irq(vcpu, timer->irq.irq, phys_irq); + if (ret) + return ret; + /* * There is a potential race here between VCPUs starting for the first @@ -505,7 +523,9 @@ void kvm_timer_enable(struct kvm *kvm) * the arch timers are enabled. */ if (timecounter && wqueue) - kvm->arch.timer.enabled = 1; + timer->enabled = 1; + + return 0; } void kvm_timer_init(struct kvm *kvm) diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c index ea00d69..798866a 100644 --- a/virt/kvm/arm/hyp/timer-sr.c +++ b/virt/kvm/arm/hyp/timer-sr.c @@ -24,11 +24,10 @@ /* vcpu is already in the HYP VA space */ void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu) { - struct kvm *kvm = kern_hyp_va(vcpu->kvm); struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; u64 val; - if (kvm->arch.timer.enabled) { + if (timer->enabled) { timer->cntv_ctl = read_sysreg_el0(cntv_ctl); timer->cntv_cval = read_sysreg_el0(cntv_cval); } @@ -60,7 +59,7 @@ void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) val |= CNTHCTL_EL1PCTEN; write_sysreg(val, cnthctl_el2); - if (kvm->arch.timer.enabled) { + if (timer->enabled) { write_sysreg(kvm->arch.timer.cntvoff, cntvoff_el2); write_sysreg_el0(timer->cntv_cval, cntv_cval); isb(); diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index 674bdf8..a3f12b3 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -21,11 +21,18 @@ #include <asm/kvm_hyp.h> +#ifdef CONFIG_KVM_NEW_VGIC +extern struct vgic_global kvm_vgic_global_state; +#define vgic_v2_params kvm_vgic_global_state +#else +extern struct vgic_params vgic_v2_params; +#endif + static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = vcpu->arch.vgic_cpu.nr_lr; + int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; u32 eisr0, eisr1; int i; bool expect_mi; @@ -67,7 +74,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = vcpu->arch.vgic_cpu.nr_lr; + int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; u32 elrsr0, elrsr1; elrsr0 = readl_relaxed(base + GICH_ELRSR0); @@ -86,7 +93,7 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = vcpu->arch.vgic_cpu.nr_lr; + int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; int i; for (i = 0; i < nr_lr; i++) { @@ -141,13 +148,13 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu) struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; struct vgic_dist *vgic = &kvm->arch.vgic; void __iomem *base = kern_hyp_va(vgic->vctrl_base); - int i, nr_lr; + int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; + int i; u64 live_lrs = 0; if (!base) return; - nr_lr = vcpu->arch.vgic_cpu.nr_lr; for (i = 0; i < nr_lr; i++) if (cpu_if->vgic_lr[i] & GICH_LR_STATE) diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index 575c7aa..a027569 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -436,7 +436,14 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) return 0; } -static bool irq_is_valid(struct kvm *kvm, int irq, bool is_ppi) +#define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS) + +/* + * For one VM the interrupt type must be same for each vcpu. + * As a PPI, the interrupt number is the same for all vcpus, + * while as an SPI it must be a separate number per vcpu. + */ +static bool pmu_irq_is_valid(struct kvm *kvm, int irq) { int i; struct kvm_vcpu *vcpu; @@ -445,7 +452,7 @@ static bool irq_is_valid(struct kvm *kvm, int irq, bool is_ppi) if (!kvm_arm_pmu_irq_initialized(vcpu)) continue; - if (is_ppi) { + if (irq_is_ppi(irq)) { if (vcpu->arch.pmu.irq_num != irq) return false; } else { @@ -457,7 +464,6 @@ static bool irq_is_valid(struct kvm *kvm, int irq, bool is_ppi) return true; } - int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { switch (attr->attr) { @@ -471,14 +477,11 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (get_user(irq, uaddr)) return -EFAULT; - /* - * The PMU overflow interrupt could be a PPI or SPI, but for one - * VM the interrupt type must be same for each vcpu. As a PPI, - * the interrupt number is the same for all vcpus, while as an - * SPI it must be a separate number per vcpu. - */ - if (irq < VGIC_NR_SGIS || irq >= vcpu->kvm->arch.vgic.nr_irqs || - !irq_is_valid(vcpu->kvm, irq, irq < VGIC_NR_PRIVATE_IRQS)) + /* The PMU overflow interrupt can be a PPI or a valid SPI. */ + if (!(irq_is_ppi(irq) || vgic_valid_spi(vcpu->kvm, irq))) + return -EINVAL; + + if (!pmu_irq_is_valid(vcpu->kvm, irq)) return -EINVAL; if (kvm_arm_pmu_irq_initialized(vcpu)) diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c index 7e826c9..334cd7a 100644 --- a/virt/kvm/arm/vgic-v2.c +++ b/virt/kvm/arm/vgic-v2.c @@ -171,7 +171,7 @@ static const struct vgic_ops vgic_v2_ops = { .enable = vgic_v2_enable, }; -static struct vgic_params vgic_v2_params; +struct vgic_params __section(.hyp.text) vgic_v2_params; static void vgic_cpu_init_lrs(void *params) { @@ -201,6 +201,8 @@ int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info, const struct resource *vctrl_res = &gic_kvm_info->vctrl; const struct resource *vcpu_res = &gic_kvm_info->vcpu; + memset(vgic, 0, sizeof(*vgic)); + if (!gic_kvm_info->maint_irq) { kvm_err("error getting vgic maintenance irq\n"); ret = -ENXIO; diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c index c02a1b1..75b02fa 100644 --- a/virt/kvm/arm/vgic-v3.c +++ b/virt/kvm/arm/vgic-v3.c @@ -29,12 +29,6 @@ #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> -/* These are for GICv2 emulation only */ -#define GICH_LR_VIRTUALID (0x3ffUL << 0) -#define GICH_LR_PHYSID_CPUID_SHIFT (10) -#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) -#define ICH_LR_VIRTUALID_MASK (BIT_ULL(32) - 1) - static u32 ich_vtr_el2; static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr) @@ -43,7 +37,7 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr) u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr]; if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) - lr_desc.irq = val & ICH_LR_VIRTUALID_MASK; + lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK; else lr_desc.irq = val & GICH_LR_VIRTUALID; diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 60668a7..c3bfbb9 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -690,12 +690,11 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio, */ void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; u64 elrsr = vgic_get_elrsr(vcpu); unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); int i; - for_each_clear_bit(i, elrsr_ptr, vgic_cpu->nr_lr) { + for_each_clear_bit(i, elrsr_ptr, vgic->nr_lr) { struct vgic_lr lr = vgic_get_lr(vcpu, i); /* @@ -820,7 +819,6 @@ static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu, struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct vgic_io_device *iodev = container_of(this, struct vgic_io_device, dev); - struct kvm_run *run = vcpu->run; const struct vgic_io_range *range; struct kvm_exit_mmio mmio; bool updated_state; @@ -849,12 +847,6 @@ static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu, updated_state = false; } spin_unlock(&dist->lock); - run->mmio.is_write = is_write; - run->mmio.len = len; - run->mmio.phys_addr = addr; - memcpy(run->mmio.data, val, len); - - kvm_handle_mmio_return(vcpu, run); if (updated_state) vgic_kick_vcpus(vcpu->kvm); @@ -1102,18 +1094,18 @@ static bool dist_active_irq(struct kvm_vcpu *vcpu) return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu); } -bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, struct irq_phys_map *map) +bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq) { int i; - for (i = 0; i < vcpu->arch.vgic_cpu.nr_lr; i++) { + for (i = 0; i < vgic->nr_lr; i++) { struct vgic_lr vlr = vgic_get_lr(vcpu, i); - if (vlr.irq == map->virt_irq && vlr.state & LR_STATE_ACTIVE) + if (vlr.irq == virt_irq && vlr.state & LR_STATE_ACTIVE) return true; } - return vgic_irq_is_active(vcpu, map->virt_irq); + return vgic_irq_is_active(vcpu, virt_irq); } /* @@ -1521,7 +1513,6 @@ static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) } static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, - struct irq_phys_map *map, unsigned int irq_num, bool level) { struct vgic_dist *dist = &kvm->arch.vgic; @@ -1660,14 +1651,14 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, if (map) return -EINVAL; - return vgic_update_irq_pending(kvm, cpuid, NULL, irq_num, level); + return vgic_update_irq_pending(kvm, cpuid, irq_num, level); } /** * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic * @kvm: The VM structure pointer * @cpuid: The CPU for PPIs - * @map: Pointer to a irq_phys_map structure describing the mapping + * @virt_irq: The virtual IRQ to be injected * @level: Edge-triggered: true: to trigger the interrupt * false: to ignore the call * Level-sensitive true: raise the input signal @@ -1678,7 +1669,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, * being HIGH and 0 being LOW and all devices being active-HIGH. */ int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, - struct irq_phys_map *map, bool level) + unsigned int virt_irq, bool level) { int ret; @@ -1686,7 +1677,7 @@ int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, if (ret) return ret; - return vgic_update_irq_pending(kvm, cpuid, map, map->virt_irq, level); + return vgic_update_irq_pending(kvm, cpuid, virt_irq, level); } static irqreturn_t vgic_maintenance_handler(int irq, void *data) @@ -1712,43 +1703,28 @@ static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu, /** * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ * @vcpu: The VCPU pointer - * @virt_irq: The virtual irq number - * @irq: The Linux IRQ number + * @virt_irq: The virtual IRQ number for the guest + * @phys_irq: The hardware IRQ number of the host * * Establish a mapping between a guest visible irq (@virt_irq) and a - * Linux irq (@irq). On injection, @virt_irq will be associated with - * the physical interrupt represented by @irq. This mapping can be + * hardware irq (@phys_irq). On injection, @virt_irq will be associated with + * the physical interrupt represented by @phys_irq. This mapping can be * established multiple times as long as the parameters are the same. * - * Returns a valid pointer on success, and an error pointer otherwise + * Returns 0 on success or an error value otherwise. */ -struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, - int virt_irq, int irq) +int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq); struct irq_phys_map *map; struct irq_phys_map_entry *entry; - struct irq_desc *desc; - struct irq_data *data; - int phys_irq; - - desc = irq_to_desc(irq); - if (!desc) { - kvm_err("%s: no interrupt descriptor\n", __func__); - return ERR_PTR(-EINVAL); - } - - data = irq_desc_get_irq_data(desc); - while (data->parent_data) - data = data->parent_data; - - phys_irq = data->hwirq; + int ret = 0; /* Create a new mapping */ entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) - return ERR_PTR(-ENOMEM); + return -ENOMEM; spin_lock(&dist->irq_phys_map_lock); @@ -1756,9 +1732,8 @@ struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, map = vgic_irq_map_search(vcpu, virt_irq); if (map) { /* Make sure this mapping matches */ - if (map->phys_irq != phys_irq || - map->irq != irq) - map = ERR_PTR(-EINVAL); + if (map->phys_irq != phys_irq) + ret = -EINVAL; /* Found an existing, valid mapping */ goto out; @@ -1767,7 +1742,6 @@ struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, map = &entry->map; map->virt_irq = virt_irq; map->phys_irq = phys_irq; - map->irq = irq; list_add_tail_rcu(&entry->entry, root); @@ -1775,9 +1749,9 @@ out: spin_unlock(&dist->irq_phys_map_lock); /* If we've found a hit in the existing list, free the useless * entry */ - if (IS_ERR(map) || map != &entry->map) + if (ret || map != &entry->map) kfree(entry); - return map; + return ret; } static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu, @@ -1813,25 +1787,22 @@ static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu) /** * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping * @vcpu: The VCPU pointer - * @map: The pointer to a mapping obtained through kvm_vgic_map_phys_irq + * @virt_irq: The virtual IRQ number to be unmapped * * Remove an existing mapping between virtual and physical interrupts. */ -int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map) +int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct irq_phys_map_entry *entry; struct list_head *root; - if (!map) - return -EINVAL; - - root = vgic_get_irq_phys_map_list(vcpu, map->virt_irq); + root = vgic_get_irq_phys_map_list(vcpu, virt_irq); spin_lock(&dist->irq_phys_map_lock); list_for_each_entry(entry, root, entry) { - if (&entry->map == map) { + if (entry->map.virt_irq == virt_irq) { list_del_rcu(&entry->entry); call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu); break; @@ -1887,13 +1858,6 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) return -ENOMEM; } - /* - * Store the number of LRs per vcpu, so we don't have to go - * all the way to the distributor structure to find out. Only - * assembly code should use this one. - */ - vgic_cpu->nr_lr = vgic->nr_lr; - return 0; } diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c new file mode 100644 index 0000000..a1442f7 --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -0,0 +1,452 @@ +/* + * Copyright (C) 2015, 2016 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/uaccess.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/kvm_host.h> +#include <kvm/arm_vgic.h> +#include <asm/kvm_mmu.h> +#include "vgic.h" + +/* + * Initialization rules: there are multiple stages to the vgic + * initialization, both for the distributor and the CPU interfaces. + * + * Distributor: + * + * - kvm_vgic_early_init(): initialization of static data that doesn't + * depend on any sizing information or emulation type. No allocation + * is allowed there. + * + * - vgic_init(): allocation and initialization of the generic data + * structures that depend on sizing information (number of CPUs, + * number of interrupts). Also initializes the vcpu specific data + * structures. Can be executed lazily for GICv2. + * + * CPU Interface: + * + * - kvm_vgic_cpu_early_init(): initialization of static data that + * doesn't depend on any sizing information or emulation type. No + * allocation is allowed there. + */ + +/* EARLY INIT */ + +/* + * Those 2 functions should not be needed anymore but they + * still are called from arm.c + */ +void kvm_vgic_early_init(struct kvm *kvm) +{ +} + +void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu) +{ +} + +/* CREATION */ + +/** + * kvm_vgic_create: triggered by the instantiation of the VGIC device by + * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) + * or through the generic KVM_CREATE_DEVICE API ioctl. + * irqchip_in_kernel() tells you if this function succeeded or not. + * @kvm: kvm struct pointer + * @type: KVM_DEV_TYPE_ARM_VGIC_V[23] + */ +int kvm_vgic_create(struct kvm *kvm, u32 type) +{ + int i, vcpu_lock_idx = -1, ret; + struct kvm_vcpu *vcpu; + + mutex_lock(&kvm->lock); + + if (irqchip_in_kernel(kvm)) { + ret = -EEXIST; + goto out; + } + + /* + * This function is also called by the KVM_CREATE_IRQCHIP handler, + * which had no chance yet to check the availability of the GICv2 + * emulation. So check this here again. KVM_CREATE_DEVICE does + * the proper checks already. + */ + if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && + !kvm_vgic_global_state.can_emulate_gicv2) { + ret = -ENODEV; + goto out; + } + + /* + * Any time a vcpu is run, vcpu_load is called which tries to grab the + * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure + * that no other VCPUs are run while we create the vgic. + */ + ret = -EBUSY; + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!mutex_trylock(&vcpu->mutex)) + goto out_unlock; + vcpu_lock_idx = i; + } + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.has_run_once) + goto out_unlock; + } + ret = 0; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) + kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS; + else + kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS; + + if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) { + ret = -E2BIG; + goto out_unlock; + } + + kvm->arch.vgic.in_kernel = true; + kvm->arch.vgic.vgic_model = type; + + /* + * kvm_vgic_global_state.vctrl_base is set on vgic probe (kvm_arch_init) + * it is stored in distributor struct for asm save/restore purpose + */ + kvm->arch.vgic.vctrl_base = kvm_vgic_global_state.vctrl_base; + + kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; + kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; + kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF; + +out_unlock: + for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { + vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); + mutex_unlock(&vcpu->mutex); + } + +out: + mutex_unlock(&kvm->lock); + return ret; +} + +/* INIT/DESTROY */ + +/** + * kvm_vgic_dist_init: initialize the dist data structures + * @kvm: kvm struct pointer + * @nr_spis: number of spis, frozen by caller + */ +static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); + int i; + + dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); + if (!dist->spis) + return -ENOMEM; + + /* + * In the following code we do not take the irq struct lock since + * no other action on irq structs can happen while the VGIC is + * not initialized yet: + * If someone wants to inject an interrupt or does a MMIO access, we + * require prior initialization in case of a virtual GICv3 or trigger + * initialization when using a virtual GICv2. + */ + for (i = 0; i < nr_spis; i++) { + struct vgic_irq *irq = &dist->spis[i]; + + irq->intid = i + VGIC_NR_PRIVATE_IRQS; + INIT_LIST_HEAD(&irq->ap_list); + spin_lock_init(&irq->irq_lock); + irq->vcpu = NULL; + irq->target_vcpu = vcpu0; + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) + irq->targets = 0; + else + irq->mpidr = 0; + } + return 0; +} + +/** + * kvm_vgic_vcpu_init: initialize the vcpu data structures and + * enable the VCPU interface + * @vcpu: the VCPU which's VGIC should be initialized + */ +static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + int i; + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + spin_lock_init(&vgic_cpu->ap_list_lock); + + /* + * Enable and configure all SGIs to be edge-triggered and + * configure all PPIs as level-triggered. + */ + for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { + struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; + + INIT_LIST_HEAD(&irq->ap_list); + spin_lock_init(&irq->irq_lock); + irq->intid = i; + irq->vcpu = NULL; + irq->target_vcpu = vcpu; + irq->targets = 1U << vcpu->vcpu_id; + if (vgic_irq_is_sgi(i)) { + /* SGIs */ + irq->enabled = 1; + irq->config = VGIC_CONFIG_EDGE; + } else { + /* PPIs */ + irq->config = VGIC_CONFIG_LEVEL; + } + } + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_enable(vcpu); + else + vgic_v3_enable(vcpu); +} + +/* + * vgic_init: allocates and initializes dist and vcpu data structures + * depending on two dimensioning parameters: + * - the number of spis + * - the number of vcpus + * The function is generally called when nr_spis has been explicitly set + * by the guest through the KVM DEVICE API. If not nr_spis is set to 256. + * vgic_initialized() returns true when this function has succeeded. + * Must be called with kvm->lock held! + */ +int vgic_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int ret = 0, i; + + if (vgic_initialized(kvm)) + return 0; + + /* freeze the number of spis */ + if (!dist->nr_spis) + dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS; + + ret = kvm_vgic_dist_init(kvm, dist->nr_spis); + if (ret) + goto out; + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vgic_vcpu_init(vcpu); + + dist->initialized = true; +out: + return ret; +} + +static void kvm_vgic_dist_destroy(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + mutex_lock(&kvm->lock); + + dist->ready = false; + dist->initialized = false; + + kfree(dist->spis); + kfree(dist->redist_iodevs); + dist->nr_spis = 0; + + mutex_unlock(&kvm->lock); +} + +void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); +} + +void kvm_vgic_destroy(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_vgic_dist_destroy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vgic_vcpu_destroy(vcpu); +} + +/** + * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest + * is a GICv2. A GICv3 must be explicitly initialized by the guest using the + * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group. + * @kvm: kvm struct pointer + */ +int vgic_lazy_init(struct kvm *kvm) +{ + int ret = 0; + + if (unlikely(!vgic_initialized(kvm))) { + /* + * We only provide the automatic initialization of the VGIC + * for the legacy case of a GICv2. Any other type must + * be explicitly initialized once setup with the respective + * KVM device call. + */ + if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) + return -EBUSY; + + mutex_lock(&kvm->lock); + ret = vgic_init(kvm); + mutex_unlock(&kvm->lock); + } + + return ret; +} + +/* RESOURCE MAPPING */ + +/** + * Map the MMIO regions depending on the VGIC model exposed to the guest + * called on the first VCPU run. + * Also map the virtual CPU interface into the VM. + * v2/v3 derivatives call vgic_init if not already done. + * vgic_ready() returns true if this function has succeeded. + * @kvm: kvm struct pointer + */ +int kvm_vgic_map_resources(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret = 0; + + mutex_lock(&kvm->lock); + if (!irqchip_in_kernel(kvm)) + goto out; + + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) + ret = vgic_v2_map_resources(kvm); + else + ret = vgic_v3_map_resources(kvm); +out: + mutex_unlock(&kvm->lock); + return ret; +} + +/* GENERIC PROBE */ + +static void vgic_init_maintenance_interrupt(void *info) +{ + enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0); +} + +static int vgic_cpu_notify(struct notifier_block *self, + unsigned long action, void *cpu) +{ + switch (action) { + case CPU_STARTING: + case CPU_STARTING_FROZEN: + vgic_init_maintenance_interrupt(NULL); + break; + case CPU_DYING: + case CPU_DYING_FROZEN: + disable_percpu_irq(kvm_vgic_global_state.maint_irq); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block vgic_cpu_nb = { + .notifier_call = vgic_cpu_notify, +}; + +static irqreturn_t vgic_maintenance_handler(int irq, void *data) +{ + /* + * We cannot rely on the vgic maintenance interrupt to be + * delivered synchronously. This means we can only use it to + * exit the VM, and we perform the handling of EOIed + * interrupts on the exit path (see vgic_process_maintenance). + */ + return IRQ_HANDLED; +} + +/** + * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable + * according to the host GIC model. Accordingly calls either + * vgic_v2/v3_probe which registers the KVM_DEVICE that can be + * instantiated by a guest later on . + */ +int kvm_vgic_hyp_init(void) +{ + const struct gic_kvm_info *gic_kvm_info; + int ret; + + gic_kvm_info = gic_get_kvm_info(); + if (!gic_kvm_info) + return -ENODEV; + + if (!gic_kvm_info->maint_irq) { + kvm_err("No vgic maintenance irq\n"); + return -ENXIO; + } + + switch (gic_kvm_info->type) { + case GIC_V2: + ret = vgic_v2_probe(gic_kvm_info); + break; + case GIC_V3: + ret = vgic_v3_probe(gic_kvm_info); + break; + default: + ret = -ENODEV; + }; + + if (ret) + return ret; + + kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq; + ret = request_percpu_irq(kvm_vgic_global_state.maint_irq, + vgic_maintenance_handler, + "vgic", kvm_get_running_vcpus()); + if (ret) { + kvm_err("Cannot register interrupt %d\n", + kvm_vgic_global_state.maint_irq); + return ret; + } + + ret = __register_cpu_notifier(&vgic_cpu_nb); + if (ret) { + kvm_err("Cannot register vgic CPU notifier\n"); + goto out_free_irq; + } + + on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); + + kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq); + return 0; + +out_free_irq: + free_percpu_irq(kvm_vgic_global_state.maint_irq, + kvm_get_running_vcpus()); + return ret; +} diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c new file mode 100644 index 0000000..c675513 --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-irqfd.c @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2015, 2016 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <trace/events/kvm.h> + +int kvm_irq_map_gsi(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *entries, + int gsi) +{ + return 0; +} + +int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned int irqchip, + unsigned int pin) +{ + return pin; +} + +int kvm_set_irq(struct kvm *kvm, int irq_source_id, + u32 irq, int level, bool line_status) +{ + unsigned int spi = irq + VGIC_NR_PRIVATE_IRQS; + + trace_kvm_set_irq(irq, level, irq_source_id); + + BUG_ON(!vgic_initialized(kvm)); + + return kvm_vgic_inject_irq(kvm, 0, spi, level); +} + +/* MSI not implemented yet */ +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + return 0; +} diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c new file mode 100644 index 0000000..0130c4b --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -0,0 +1,431 @@ +/* + * VGIC: KVM DEVICE API + * + * Copyright (C) 2015 ARM Ltd. + * Author: Marc Zyngier <marc.zyngier@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <linux/kvm_host.h> +#include <kvm/arm_vgic.h> +#include <linux/uaccess.h> +#include <asm/kvm_mmu.h> +#include "vgic.h" + +/* common helpers */ + +static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment) +{ + if (addr & ~KVM_PHYS_MASK) + return -E2BIG; + + if (!IS_ALIGNED(addr, alignment)) + return -EINVAL; + + if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) + return -EEXIST; + + return 0; +} + +/** + * kvm_vgic_addr - set or get vgic VM base addresses + * @kvm: pointer to the vm struct + * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX + * @addr: pointer to address value + * @write: if true set the address in the VM address space, if false read the + * address + * + * Set or get the vgic base addresses for the distributor and the virtual CPU + * interface in the VM physical address space. These addresses are properties + * of the emulated core/SoC and therefore user space initially knows this + * information. + * Check them for sanity (alignment, double assignment). We can't check for + * overlapping regions in case of a virtual GICv3 here, since we don't know + * the number of VCPUs yet, so we defer this check to map_resources(). + */ +int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) +{ + int r = 0; + struct vgic_dist *vgic = &kvm->arch.vgic; + int type_needed; + phys_addr_t *addr_ptr, alignment; + + mutex_lock(&kvm->lock); + switch (type) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + type_needed = KVM_DEV_TYPE_ARM_VGIC_V2; + addr_ptr = &vgic->vgic_dist_base; + alignment = SZ_4K; + break; + case KVM_VGIC_V2_ADDR_TYPE_CPU: + type_needed = KVM_DEV_TYPE_ARM_VGIC_V2; + addr_ptr = &vgic->vgic_cpu_base; + alignment = SZ_4K; + break; +#ifdef CONFIG_KVM_ARM_VGIC_V3 + case KVM_VGIC_V3_ADDR_TYPE_DIST: + type_needed = KVM_DEV_TYPE_ARM_VGIC_V3; + addr_ptr = &vgic->vgic_dist_base; + alignment = SZ_64K; + break; + case KVM_VGIC_V3_ADDR_TYPE_REDIST: + type_needed = KVM_DEV_TYPE_ARM_VGIC_V3; + addr_ptr = &vgic->vgic_redist_base; + alignment = SZ_64K; + break; +#endif + default: + r = -ENODEV; + goto out; + } + + if (vgic->vgic_model != type_needed) { + r = -ENODEV; + goto out; + } + + if (write) { + r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment); + if (!r) + *addr_ptr = *addr; + } else { + *addr = *addr_ptr; + } + +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int vgic_set_common_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int r; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 addr; + unsigned long type = (unsigned long)attr->attr; + + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + r = kvm_vgic_addr(dev->kvm, type, &addr, true); + return (r == -ENODEV) ? -ENXIO : r; + } + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 val; + int ret = 0; + + if (get_user(val, uaddr)) + return -EFAULT; + + /* + * We require: + * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs + * - at most 1024 interrupts + * - a multiple of 32 interrupts + */ + if (val < (VGIC_NR_PRIVATE_IRQS + 32) || + val > VGIC_MAX_RESERVED || + (val & 31)) + return -EINVAL; + + mutex_lock(&dev->kvm->lock); + + if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis) + ret = -EBUSY; + else + dev->kvm->arch.vgic.nr_spis = + val - VGIC_NR_PRIVATE_IRQS; + + mutex_unlock(&dev->kvm->lock); + + return ret; + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: { + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + mutex_lock(&dev->kvm->lock); + r = vgic_init(dev->kvm); + mutex_unlock(&dev->kvm->lock); + return r; + } + break; + } + } + + return -ENXIO; +} + +static int vgic_get_common_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int r = -ENXIO; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 addr; + unsigned long type = (unsigned long)attr->attr; + + r = kvm_vgic_addr(dev->kvm, type, &addr, false); + if (r) + return (r == -ENODEV) ? -ENXIO : r; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + break; + } + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + + r = put_user(dev->kvm->arch.vgic.nr_spis + + VGIC_NR_PRIVATE_IRQS, uaddr); + break; + } + } + + return r; +} + +static int vgic_create(struct kvm_device *dev, u32 type) +{ + return kvm_vgic_create(dev->kvm, type); +} + +static void vgic_destroy(struct kvm_device *dev) +{ + kfree(dev); +} + +void kvm_register_vgic_device(unsigned long type) +{ + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V2: + kvm_register_device_ops(&kvm_arm_vgic_v2_ops, + KVM_DEV_TYPE_ARM_VGIC_V2); + break; +#ifdef CONFIG_KVM_ARM_VGIC_V3 + case KVM_DEV_TYPE_ARM_VGIC_V3: + kvm_register_device_ops(&kvm_arm_vgic_v3_ops, + KVM_DEV_TYPE_ARM_VGIC_V3); + break; +#endif + } +} + +/** vgic_attr_regs_access: allows user space to read/write VGIC registers + * + * @dev: kvm device handle + * @attr: kvm device attribute + * @reg: address the value is read or written + * @is_write: write flag + * + */ +static int vgic_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + u32 *reg, bool is_write) +{ + gpa_t addr; + int cpuid, ret, c; + struct kvm_vcpu *vcpu, *tmp_vcpu; + int vcpu_lock_idx = -1; + + cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> + KVM_DEV_ARM_VGIC_CPUID_SHIFT; + vcpu = kvm_get_vcpu(dev->kvm, cpuid); + addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + + mutex_lock(&dev->kvm->lock); + + ret = vgic_init(dev->kvm); + if (ret) + goto out; + + if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) { + ret = -EINVAL; + goto out; + } + + /* + * Any time a vcpu is run, vcpu_load is called which tries to grab the + * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure + * that no other VCPUs are run and fiddle with the vgic state while we + * access it. + */ + ret = -EBUSY; + kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) { + if (!mutex_trylock(&tmp_vcpu->mutex)) + goto out; + vcpu_lock_idx = c; + } + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg); + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg); + break; + default: + ret = -EINVAL; + break; + } + +out: + for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { + tmp_vcpu = kvm_get_vcpu(dev->kvm, vcpu_lock_idx); + mutex_unlock(&tmp_vcpu->mutex); + } + + mutex_unlock(&dev->kvm->lock); + return ret; +} + +/* V2 ops */ + +static int vgic_v2_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = vgic_set_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 reg; + + if (get_user(reg, uaddr)) + return -EFAULT; + + return vgic_attr_regs_access(dev, attr, ®, true); + } + } + + return -ENXIO; +} + +static int vgic_v2_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret; + + ret = vgic_get_common_attr(dev, attr); + if (ret != -ENXIO) + return ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 reg = 0; + + ret = vgic_attr_regs_access(dev, attr, ®, false); + if (ret) + return ret; + return put_user(reg, uaddr); + } + } + + return -ENXIO; +} + +static int vgic_v2_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + case KVM_VGIC_V2_ADDR_TYPE_CPU: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_has_attr_regs(dev, attr); + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + return 0; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + } + } + return -ENXIO; +} + +struct kvm_device_ops kvm_arm_vgic_v2_ops = { + .name = "kvm-arm-vgic-v2", + .create = vgic_create, + .destroy = vgic_destroy, + .set_attr = vgic_v2_set_attr, + .get_attr = vgic_v2_get_attr, + .has_attr = vgic_v2_has_attr, +}; + +/* V3 ops */ + +#ifdef CONFIG_KVM_ARM_VGIC_V3 + +static int vgic_v3_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + return vgic_set_common_attr(dev, attr); +} + +static int vgic_v3_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + return vgic_get_common_attr(dev, attr); +} + +static int vgic_v3_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_V3_ADDR_TYPE_DIST: + case KVM_VGIC_V3_ADDR_TYPE_REDIST: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + return 0; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + } + } + return -ENXIO; +} + +struct kvm_device_ops kvm_arm_vgic_v3_ops = { + .name = "kvm-arm-vgic-v3", + .create = vgic_create, + .destroy = vgic_destroy, + .set_attr = vgic_v3_set_attr, + .get_attr = vgic_v3_get_attr, + .has_attr = vgic_v3_has_attr, +}; + +#endif /* CONFIG_KVM_ARM_VGIC_V3 */ + diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c new file mode 100644 index 0000000..a213936 --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -0,0 +1,446 @@ +/* + * VGICv2 MMIO handling functions + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/irqchip/arm-gic.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <kvm/iodev.h> +#include <kvm/arm_vgic.h> + +#include "vgic.h" +#include "vgic-mmio.h" + +static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 value; + + switch (addr & 0x0c) { + case GIC_DIST_CTRL: + value = vcpu->kvm->arch.vgic.enabled ? GICD_ENABLE : 0; + break; + case GIC_DIST_CTR: + value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + value = (value >> 5) - 1; + value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; + break; + case GIC_DIST_IIDR: + value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); + break; + default: + return 0; + } + + return value; +} + +static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + bool was_enabled = dist->enabled; + + switch (addr & 0x0c) { + case GIC_DIST_CTRL: + dist->enabled = val & GICD_ENABLE; + if (!was_enabled && dist->enabled) + vgic_kick_vcpus(vcpu->kvm); + break; + case GIC_DIST_CTR: + case GIC_DIST_IIDR: + /* Nothing to do */ + return; + } +} + +static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus); + int intid = val & 0xf; + int targets = (val >> 16) & 0xff; + int mode = (val >> 24) & 0x03; + int c; + struct kvm_vcpu *vcpu; + + switch (mode) { + case 0x0: /* as specified by targets */ + break; + case 0x1: + targets = (1U << nr_vcpus) - 1; /* all, ... */ + targets &= ~(1U << source_vcpu->vcpu_id); /* but self */ + break; + case 0x2: /* this very vCPU only */ + targets = (1U << source_vcpu->vcpu_id); + break; + case 0x3: /* reserved */ + return; + } + + kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) { + struct vgic_irq *irq; + + if (!(targets & (1U << c))) + continue; + + irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); + + spin_lock(&irq->irq_lock); + irq->pending = true; + irq->source |= 1U << source_vcpu->vcpu_id; + + vgic_queue_irq_unlock(source_vcpu->kvm, irq); + } +} + +static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + u64 val = 0; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + val |= (u64)irq->targets << (i * 8); + } + + return val; +} + +static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + + /* GICD_ITARGETSR[0-7] are read-only */ + if (intid < VGIC_NR_PRIVATE_IRQS) + return; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i); + int target; + + spin_lock(&irq->irq_lock); + + irq->targets = (val >> (i * 8)) & 0xff; + target = irq->targets ? __ffs(irq->targets) : 0; + irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target); + + spin_unlock(&irq->irq_lock); + } +} + +static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = addr & 0x0f; + int i; + u64 val = 0; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + val |= (u64)irq->source << (i * 8); + } + return val; +} + +static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = addr & 0x0f; + int i; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + spin_lock(&irq->irq_lock); + + irq->source &= ~((val >> (i * 8)) & 0xff); + if (!irq->source) + irq->pending = false; + + spin_unlock(&irq->irq_lock); + } +} + +static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = addr & 0x0f; + int i; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + spin_lock(&irq->irq_lock); + + irq->source |= (val >> (i * 8)) & 0xff; + + if (irq->source) { + irq->pending = true; + vgic_queue_irq_unlock(vcpu->kvm, irq); + } else { + spin_unlock(&irq->irq_lock); + } + } +} + +static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_vmcr(vcpu, vmcr); + else + vgic_v3_set_vmcr(vcpu, vmcr); +} + +static void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_get_vmcr(vcpu, vmcr); + else + vgic_v3_get_vmcr(vcpu, vmcr); +} + +#define GICC_ARCH_VERSION_V2 0x2 + +/* These are for userland accesses only, there is no guest-facing emulation. */ +static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_vmcr vmcr; + u32 val; + + vgic_get_vmcr(vcpu, &vmcr); + + switch (addr & 0xff) { + case GIC_CPU_CTRL: + val = vmcr.ctlr; + break; + case GIC_CPU_PRIMASK: + val = vmcr.pmr; + break; + case GIC_CPU_BINPOINT: + val = vmcr.bpr; + break; + case GIC_CPU_ALIAS_BINPOINT: + val = vmcr.abpr; + break; + case GIC_CPU_IDENT: + val = ((PRODUCT_ID_KVM << 20) | + (GICC_ARCH_VERSION_V2 << 16) | + IMPLEMENTER_ARM); + break; + default: + return 0; + } + + return val; +} + +static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + + switch (addr & 0xff) { + case GIC_CPU_CTRL: + vmcr.ctlr = val; + break; + case GIC_CPU_PRIMASK: + vmcr.pmr = val; + break; + case GIC_CPU_BINPOINT: + vmcr.bpr = val; + break; + case GIC_CPU_ALIAS_BINPOINT: + vmcr.abpr = val; + break; + } + + vgic_set_vmcr(vcpu, &vmcr); +} + +static const struct vgic_register_region vgic_v2_dist_registers[] = { + REGISTER_DESC_WITH_LENGTH(GIC_DIST_CTRL, + vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP, + vgic_mmio_read_rao, vgic_mmio_write_wi, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET, + vgic_mmio_read_enable, vgic_mmio_write_senable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR, + vgic_mmio_read_enable, vgic_mmio_write_cenable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET, + vgic_mmio_read_pending, vgic_mmio_write_spending, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR, + vgic_mmio_read_pending, vgic_mmio_write_cpending, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET, + vgic_mmio_read_active, vgic_mmio_write_sactive, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR, + vgic_mmio_read_active, vgic_mmio_write_cactive, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI, + vgic_mmio_read_priority, vgic_mmio_write_priority, 8, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET, + vgic_mmio_read_target, vgic_mmio_write_target, 8, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG, + vgic_mmio_read_config, vgic_mmio_write_config, 2, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT, + vgic_mmio_read_raz, vgic_mmio_write_sgir, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR, + vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET, + vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), +}; + +static const struct vgic_register_region vgic_v2_cpu_registers[] = { + REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO, + vgic_mmio_read_raz, vgic_mmio_write_wi, 16, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, + vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, + VGIC_ACCESS_32bit), +}; + +unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) +{ + dev->regions = vgic_v2_dist_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return SZ_4K; +} + +int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + int nr_irqs = dev->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + const struct vgic_register_region *regions; + gpa_t addr; + int nr_regions, i, len; + + addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + regions = vgic_v2_dist_registers; + nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); + break; + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + regions = vgic_v2_cpu_registers; + nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); + break; + default: + return -ENXIO; + } + + /* We only support aligned 32-bit accesses. */ + if (addr & 3) + return -ENXIO; + + for (i = 0; i < nr_regions; i++) { + if (regions[i].bits_per_irq) + len = (regions[i].bits_per_irq * nr_irqs) / 8; + else + len = regions[i].len; + + if (regions[i].reg_offset <= addr && + regions[i].reg_offset + len > addr) + return 0; + } + + return -ENXIO; +} + +/* + * When userland tries to access the VGIC register handlers, we need to + * create a usable struct vgic_io_device to be passed to the handlers and we + * have to set up a buffer similar to what would have happened if a guest MMIO + * access occurred, including doing endian conversions on BE systems. + */ +static int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, + bool is_write, int offset, u32 *val) +{ + unsigned int len = 4; + u8 buf[4]; + int ret; + + if (is_write) { + vgic_data_host_to_mmio_bus(buf, len, *val); + ret = kvm_io_gic_ops.write(vcpu, &dev->dev, offset, len, buf); + } else { + ret = kvm_io_gic_ops.read(vcpu, &dev->dev, offset, len, buf); + if (!ret) + *val = vgic_data_mmio_bus_to_host(buf, len); + } + + return ret; +} + +int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v2_cpu_registers, + .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers), + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} + +int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val) +{ + struct vgic_io_device dev = { + .regions = vgic_v2_dist_registers, + .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers), + }; + + return vgic_uaccess(vcpu, &dev, is_write, offset, val); +} diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c new file mode 100644 index 0000000..a0c515a --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -0,0 +1,455 @@ +/* + * VGICv3 MMIO handling functions + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/irqchip/arm-gic-v3.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <kvm/iodev.h> +#include <kvm/arm_vgic.h> + +#include <asm/kvm_emulate.h> + +#include "vgic.h" +#include "vgic-mmio.h" + +/* extract @num bytes at @offset bytes offset in data */ +static unsigned long extract_bytes(unsigned long data, unsigned int offset, + unsigned int num) +{ + return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); +} + +static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 value = 0; + + switch (addr & 0x0c) { + case GICD_CTLR: + if (vcpu->kvm->arch.vgic.enabled) + value |= GICD_CTLR_ENABLE_SS_G1; + value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS; + break; + case GICD_TYPER: + value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + value = (value >> 5) - 1; + value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19; + break; + case GICD_IIDR: + value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); + break; + default: + return 0; + } + + return value; +} + +static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + bool was_enabled = dist->enabled; + + switch (addr & 0x0c) { + case GICD_CTLR: + dist->enabled = val & GICD_CTLR_ENABLE_SS_G1; + + if (!was_enabled && dist->enabled) + vgic_kick_vcpus(vcpu->kvm); + break; + case GICD_TYPER: + case GICD_IIDR: + return; + } +} + +static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + int intid = VGIC_ADDR_TO_INTID(addr, 64); + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); + + if (!irq) + return 0; + + /* The upper word is RAZ for us. */ + if (addr & 4) + return 0; + + return extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len); +} + +static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + int intid = VGIC_ADDR_TO_INTID(addr, 64); + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); + + if (!irq) + return; + + /* The upper word is WI for us since we don't implement Aff3. */ + if (addr & 4) + return; + + spin_lock(&irq->irq_lock); + + /* We only care about and preserve Aff0, Aff1 and Aff2. */ + irq->mpidr = val & GENMASK(23, 0); + irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); + + spin_unlock(&irq->irq_lock); +} + +static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + int target_vcpu_id = vcpu->vcpu_id; + u64 value; + + value = (mpidr & GENMASK(23, 0)) << 32; + value |= ((target_vcpu_id & 0xffff) << 8); + if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1) + value |= GICR_TYPER_LAST; + + return extract_bytes(value, addr & 7, len); +} + +static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); +} + +static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + switch (addr & 0xffff) { + case GICD_PIDR2: + /* report a GICv3 compliant implementation */ + return 0x3b; + } + + return 0; +} + +/* + * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the + * redistributors, while SPIs are covered by registers in the distributor + * block. Trying to set private IRQs in this block gets ignored. + * We take some special care here to fix the calculation of the register + * offset. + */ +#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, bpi, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = bpi, \ + .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ + .access_flags = acc, \ + .read = vgic_mmio_read_raz, \ + .write = vgic_mmio_write_wi, \ + }, { \ + .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \ + .bits_per_irq = bpi, \ + .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + } + +static const struct vgic_register_region vgic_v3_dist_registers[] = { + REGISTER_DESC_WITH_LENGTH(GICD_CTLR, + vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, 16, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR, + vgic_mmio_read_rao, vgic_mmio_write_wi, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER, + vgic_mmio_read_enable, vgic_mmio_write_senable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER, + vgic_mmio_read_enable, vgic_mmio_write_cenable, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, + vgic_mmio_read_pending, vgic_mmio_write_spending, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, + vgic_mmio_read_pending, vgic_mmio_write_cpending, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, + vgic_mmio_read_active, vgic_mmio_write_sactive, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER, + vgic_mmio_read_active, vgic_mmio_write_cactive, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR, + vgic_mmio_read_priority, vgic_mmio_write_priority, 8, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR, + vgic_mmio_read_raz, vgic_mmio_write_wi, 8, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR, + vgic_mmio_read_config, vgic_mmio_write_config, 2, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR, + vgic_mmio_read_raz, vgic_mmio_write_wi, 1, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER, + vgic_mmio_read_irouter, vgic_mmio_write_irouter, 64, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICD_IDREGS, + vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, + VGIC_ACCESS_32bit), +}; + +static const struct vgic_register_region vgic_v3_rdbase_registers[] = { + REGISTER_DESC_WITH_LENGTH(GICR_CTLR, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_IIDR, + vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_TYPER, + vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER, + vgic_mmio_read_raz, vgic_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER, + vgic_mmio_read_raz, vgic_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, + vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, + VGIC_ACCESS_32bit), +}; + +static const struct vgic_register_region vgic_v3_sgibase_registers[] = { + REGISTER_DESC_WITH_LENGTH(GICR_IGROUPR0, + vgic_mmio_read_rao, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_ISENABLER0, + vgic_mmio_read_enable, vgic_mmio_write_senable, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_ICENABLER0, + vgic_mmio_read_enable, vgic_mmio_write_cenable, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_ISPENDR0, + vgic_mmio_read_pending, vgic_mmio_write_spending, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_ICPENDR0, + vgic_mmio_read_pending, vgic_mmio_write_cpending, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_ISACTIVER0, + vgic_mmio_read_active, vgic_mmio_write_sactive, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_ICACTIVER0, + vgic_mmio_read_active, vgic_mmio_write_cactive, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_IPRIORITYR0, + vgic_mmio_read_priority, vgic_mmio_write_priority, 32, + VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), + REGISTER_DESC_WITH_LENGTH(GICR_ICFGR0, + vgic_mmio_read_config, vgic_mmio_write_config, 8, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_IGRPMODR0, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH(GICR_NSACR, + vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + VGIC_ACCESS_32bit), +}; + +unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev) +{ + dev->regions = vgic_v3_dist_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return SZ_64K; +} + +int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address) +{ + int nr_vcpus = atomic_read(&kvm->online_vcpus); + struct kvm_vcpu *vcpu; + struct vgic_io_device *devices; + int c, ret = 0; + + devices = kmalloc(sizeof(struct vgic_io_device) * nr_vcpus * 2, + GFP_KERNEL); + if (!devices) + return -ENOMEM; + + kvm_for_each_vcpu(c, vcpu, kvm) { + gpa_t rd_base = redist_base_address + c * SZ_64K * 2; + gpa_t sgi_base = rd_base + SZ_64K; + struct vgic_io_device *rd_dev = &devices[c * 2]; + struct vgic_io_device *sgi_dev = &devices[c * 2 + 1]; + + kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); + rd_dev->base_addr = rd_base; + rd_dev->regions = vgic_v3_rdbase_registers; + rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers); + rd_dev->redist_vcpu = vcpu; + + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base, + SZ_64K, &rd_dev->dev); + mutex_unlock(&kvm->slots_lock); + + if (ret) + break; + + kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops); + sgi_dev->base_addr = sgi_base; + sgi_dev->regions = vgic_v3_sgibase_registers; + sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers); + sgi_dev->redist_vcpu = vcpu; + + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, sgi_base, + SZ_64K, &sgi_dev->dev); + mutex_unlock(&kvm->slots_lock); + if (ret) { + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, + &rd_dev->dev); + break; + } + } + + if (ret) { + /* The current c failed, so we start with the previous one. */ + for (c--; c >= 0; c--) { + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, + &devices[c * 2].dev); + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, + &devices[c * 2 + 1].dev); + } + kfree(devices); + } else { + kvm->arch.vgic.redist_iodevs = devices; + } + + return ret; +} + +/* + * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI + * generation register ICC_SGI1R_EL1) with a given VCPU. + * If the VCPU's MPIDR matches, return the level0 affinity, otherwise + * return -1. + */ +static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) +{ + unsigned long affinity; + int level0; + + /* + * Split the current VCPU's MPIDR into affinity level 0 and the + * rest as this is what we have to compare against. + */ + affinity = kvm_vcpu_get_mpidr_aff(vcpu); + level0 = MPIDR_AFFINITY_LEVEL(affinity, 0); + affinity &= ~MPIDR_LEVEL_MASK; + + /* bail out if the upper three levels don't match */ + if (sgi_aff != affinity) + return -1; + + /* Is this VCPU's bit set in the mask ? */ + if (!(sgi_cpu_mask & BIT(level0))) + return -1; + + return level0; +} + +/* + * The ICC_SGI* registers encode the affinity differently from the MPIDR, + * so provide a wrapper to use the existing defines to isolate a certain + * affinity level. + */ +#define SGI_AFFINITY_LEVEL(reg, level) \ + ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ + >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) + +/** + * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs + * @vcpu: The VCPU requesting a SGI + * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU + * + * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register. + * This will trap in sys_regs.c and call this function. + * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the + * target processors as well as a bitmask of 16 Aff0 CPUs. + * If the interrupt routing mode bit is not set, we iterate over all VCPUs to + * check for matching ones. If this bit is set, we signal all, but not the + * calling VCPU. + */ +void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *c_vcpu; + u16 target_cpus; + u64 mpidr; + int sgi, c; + int vcpu_id = vcpu->vcpu_id; + bool broadcast; + + sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; + broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); + target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT; + mpidr = SGI_AFFINITY_LEVEL(reg, 3); + mpidr |= SGI_AFFINITY_LEVEL(reg, 2); + mpidr |= SGI_AFFINITY_LEVEL(reg, 1); + + /* + * We iterate over all VCPUs to find the MPIDRs matching the request. + * If we have handled one CPU, we clear its bit to detect early + * if we are already finished. This avoids iterating through all + * VCPUs when most of the times we just signal a single VCPU. + */ + kvm_for_each_vcpu(c, c_vcpu, kvm) { + struct vgic_irq *irq; + + /* Exit early if we have dealt with all requested CPUs */ + if (!broadcast && target_cpus == 0) + break; + + /* Don't signal the calling VCPU */ + if (broadcast && c == vcpu_id) + continue; + + if (!broadcast) { + int level0; + + level0 = match_mpidr(mpidr, target_cpus, c_vcpu); + if (level0 == -1) + continue; + + /* remove this matching VCPU from the mask */ + target_cpus &= ~BIT(level0); + } + + irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); + + spin_lock(&irq->irq_lock); + irq->pending = true; + + vgic_queue_irq_unlock(vcpu->kvm, irq); + } +} diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c new file mode 100644 index 0000000..059595e --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -0,0 +1,526 @@ +/* + * VGIC MMIO handling functions + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/bitops.h> +#include <linux/bsearch.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <kvm/iodev.h> +#include <kvm/arm_vgic.h> + +#include "vgic.h" +#include "vgic-mmio.h" + +unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return 0; +} + +unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return -1UL; +} + +void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val) +{ + /* Ignore */ +} + +/* + * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value + * of the enabled bit, so there is only one function for both here. + */ +unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->enabled) + value |= (1U << i); + } + + return value; +} + +void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + spin_lock(&irq->irq_lock); + irq->enabled = true; + vgic_queue_irq_unlock(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + spin_lock(&irq->irq_lock); + + irq->enabled = false; + + spin_unlock(&irq->irq_lock); + } +} + +unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->pending) + value |= (1U << i); + } + + return value; +} + +void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + spin_lock(&irq->irq_lock); + irq->pending = true; + if (irq->config == VGIC_CONFIG_LEVEL) + irq->soft_pending = true; + + vgic_queue_irq_unlock(vcpu->kvm, irq); + } +} + +void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + spin_lock(&irq->irq_lock); + + if (irq->config == VGIC_CONFIG_LEVEL) { + irq->soft_pending = false; + irq->pending = irq->line_level; + } else { + irq->pending = false; + } + + spin_unlock(&irq->irq_lock); + } +} + +unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 value = 0; + int i; + + /* Loop over all IRQs affected by this read */ + for (i = 0; i < len * 8; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->active) + value |= (1U << i); + } + + return value; +} + +static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool new_active_state) +{ + spin_lock(&irq->irq_lock); + /* + * If this virtual IRQ was written into a list register, we + * have to make sure the CPU that runs the VCPU thread has + * synced back LR state to the struct vgic_irq. We can only + * know this for sure, when either this irq is not assigned to + * anyone's AP list anymore, or the VCPU thread is not + * running on any CPUs. + * + * In the opposite case, we know the VCPU thread may be on its + * way back from the guest and still has to sync back this + * IRQ, so we release and re-acquire the spin_lock to let the + * other thread sync back the IRQ. + */ + while (irq->vcpu && /* IRQ may have state in an LR somewhere */ + irq->vcpu->cpu != -1) { /* VCPU thread is running */ + BUG_ON(irq->intid < VGIC_NR_PRIVATE_IRQS); + cond_resched_lock(&irq->irq_lock); + } + + irq->active = new_active_state; + if (new_active_state) + vgic_queue_irq_unlock(vcpu->kvm, irq); + else + spin_unlock(&irq->irq_lock); +} + +/* + * If we are fiddling with an IRQ's active state, we have to make sure the IRQ + * is not queued on some running VCPU's LRs, because then the change to the + * active state can be overwritten when the VCPU's state is synced coming back + * from the guest. + * + * For shared interrupts, we have to stop all the VCPUs because interrupts can + * be migrated while we don't hold the IRQ locks and we don't want to be + * chasing moving targets. + * + * For private interrupts, we only have to make sure the single and only VCPU + * that can potentially queue the IRQ is stopped. + */ +static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) +{ + if (intid < VGIC_NR_PRIVATE_IRQS) + kvm_arm_halt_vcpu(vcpu); + else + kvm_arm_halt_guest(vcpu->kvm); +} + +/* See vgic_change_active_prepare */ +static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) +{ + if (intid < VGIC_NR_PRIVATE_IRQS) + kvm_arm_resume_vcpu(vcpu); + else + kvm_arm_resume_guest(vcpu->kvm); +} + +void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + vgic_change_active_prepare(vcpu, intid); + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + vgic_mmio_change_active(vcpu, irq, false); + } + vgic_change_active_finish(vcpu, intid); +} + +void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + + vgic_change_active_prepare(vcpu, intid); + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + vgic_mmio_change_active(vcpu, irq, true); + } + vgic_change_active_finish(vcpu, intid); +} + +unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + u64 val = 0; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + val |= (u64)irq->priority << (i * 8); + } + + return val; +} + +/* + * We currently don't handle changing the priority of an interrupt that + * is already pending on a VCPU. If there is a need for this, we would + * need to make this VCPU exit and re-evaluate the priorities, potentially + * leading to this interrupt getting presented now to the guest (if it has + * been masked by the priority mask before). + */ +void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 8); + int i; + + for (i = 0; i < len; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + spin_lock(&irq->irq_lock); + /* Narrow the priority range to what we actually support */ + irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); + spin_unlock(&irq->irq_lock); + } +} + +unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 2); + u32 value = 0; + int i; + + for (i = 0; i < len * 4; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + if (irq->config == VGIC_CONFIG_EDGE) + value |= (2U << (i * 2)); + } + + return value; +} + +void vgic_mmio_write_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 2); + int i; + + for (i = 0; i < len * 4; i++) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + /* + * The configuration cannot be changed for SGIs in general, + * for PPIs this is IMPLEMENTATION DEFINED. The arch timer + * code relies on PPIs being level triggered, so we also + * make them read-only here. + */ + if (intid + i < VGIC_NR_PRIVATE_IRQS) + continue; + + spin_lock(&irq->irq_lock); + if (test_bit(i * 2 + 1, &val)) { + irq->config = VGIC_CONFIG_EDGE; + } else { + irq->config = VGIC_CONFIG_LEVEL; + irq->pending = irq->line_level | irq->soft_pending; + } + spin_unlock(&irq->irq_lock); + } +} + +static int match_region(const void *key, const void *elt) +{ + const unsigned int offset = (unsigned long)key; + const struct vgic_register_region *region = elt; + + if (offset < region->reg_offset) + return -1; + + if (offset >= region->reg_offset + region->len) + return 1; + + return 0; +} + +/* Find the proper register handler entry given a certain address offset. */ +static const struct vgic_register_region * +vgic_find_mmio_region(const struct vgic_register_region *region, int nr_regions, + unsigned int offset) +{ + return bsearch((void *)(uintptr_t)offset, region, nr_regions, + sizeof(region[0]), match_region); +} + +/* + * kvm_mmio_read_buf() returns a value in a format where it can be converted + * to a byte array and be directly observed as the guest wanted it to appear + * in memory if it had done the store itself, which is LE for the GIC, as the + * guest knows the GIC is always LE. + * + * We convert this value to the CPUs native format to deal with it as a data + * value. + */ +unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len) +{ + unsigned long data = kvm_mmio_read_buf(val, len); + + switch (len) { + case 1: + return data; + case 2: + return le16_to_cpu(data); + case 4: + return le32_to_cpu(data); + default: + return le64_to_cpu(data); + } +} + +/* + * kvm_mmio_write_buf() expects a value in a format such that if converted to + * a byte array it is observed as the guest would see it if it could perform + * the load directly. Since the GIC is LE, and the guest knows this, the + * guest expects a value in little endian format. + * + * We convert the data value from the CPUs native format to LE so that the + * value is returned in the proper format. + */ +void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, + unsigned long data) +{ + switch (len) { + case 1: + break; + case 2: + data = cpu_to_le16(data); + break; + case 4: + data = cpu_to_le32(data); + break; + default: + data = cpu_to_le64(data); + } + + kvm_mmio_write_buf(buf, len, data); +} + +static +struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev) +{ + return container_of(dev, struct vgic_io_device, dev); +} + +static bool check_region(const struct vgic_register_region *region, + gpa_t addr, int len) +{ + if ((region->access_flags & VGIC_ACCESS_8bit) && len == 1) + return true; + if ((region->access_flags & VGIC_ACCESS_32bit) && + len == sizeof(u32) && !(addr & 3)) + return true; + if ((region->access_flags & VGIC_ACCESS_64bit) && + len == sizeof(u64) && !(addr & 7)) + return true; + + return false; +} + +static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + struct kvm_vcpu *r_vcpu; + unsigned long data; + + region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, + addr - iodev->base_addr); + if (!region || !check_region(region, addr, len)) { + memset(val, 0, len); + return 0; + } + + r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; + data = region->read(r_vcpu, addr, len); + vgic_data_host_to_mmio_bus(val, len, data); + return 0; +} + +static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); + const struct vgic_register_region *region; + struct kvm_vcpu *r_vcpu; + unsigned long data = vgic_data_mmio_bus_to_host(val, len); + + region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, + addr - iodev->base_addr); + if (!region) + return 0; + + if (!check_region(region, addr, len)) + return 0; + + r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; + region->write(r_vcpu, addr, len, data); + return 0; +} + +struct kvm_io_device_ops kvm_io_gic_ops = { + .read = dispatch_mmio_read, + .write = dispatch_mmio_write, +}; + +int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, + enum vgic_type type) +{ + struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev; + int ret = 0; + unsigned int len; + + switch (type) { + case VGIC_V2: + len = vgic_v2_init_dist_iodev(io_device); + break; +#ifdef CONFIG_KVM_ARM_VGIC_V3 + case VGIC_V3: + len = vgic_v3_init_dist_iodev(io_device); + break; +#endif + default: + BUG_ON(1); + } + + io_device->base_addr = dist_base_address; + io_device->redist_vcpu = NULL; + + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address, + len, &io_device->dev); + mutex_unlock(&kvm->slots_lock); + + return ret; +} diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h new file mode 100644 index 0000000..8509014 --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2015, 2016 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef __KVM_ARM_VGIC_MMIO_H__ +#define __KVM_ARM_VGIC_MMIO_H__ + +struct vgic_register_region { + unsigned int reg_offset; + unsigned int len; + unsigned int bits_per_irq; + unsigned int access_flags; + unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, + unsigned long val); +}; + +extern struct kvm_io_device_ops kvm_io_gic_ops; + +#define VGIC_ACCESS_8bit 1 +#define VGIC_ACCESS_32bit 2 +#define VGIC_ACCESS_64bit 4 + +/* + * Generate a mask that covers the number of bytes required to address + * up to 1024 interrupts, each represented by <bits> bits. This assumes + * that <bits> is a power of two. + */ +#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1) + +/* + * (addr & mask) gives us the byte offset for the INT ID, so we want to + * divide this with 'bytes per irq' to get the INT ID, which is given + * by '(bits) / 8'. But we do this with fixed-point-arithmetic and + * take advantage of the fact that division by a fraction equals + * multiplication with the inverted fraction, and scale up both the + * numerator and denominator with 8 to support at most 64 bits per IRQ: + */ +#define VGIC_ADDR_TO_INTID(addr, bits) (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \ + 64 / (bits) / 8) + +/* + * Some VGIC registers store per-IRQ information, with a different number + * of bits per IRQ. For those registers this macro is used. + * The _WITH_LENGTH version instantiates registers with a fixed length + * and is mutually exclusive with the _PER_IRQ version. + */ +#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, bpi, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = bpi, \ + .len = bpi * 1024 / 8, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + } + +#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc) \ + { \ + .reg_offset = off, \ + .bits_per_irq = 0, \ + .len = length, \ + .access_flags = acc, \ + .read = rd, \ + .write = wr, \ + } + +int kvm_vgic_register_mmio_region(struct kvm *kvm, struct kvm_vcpu *vcpu, + struct vgic_register_region *reg_desc, + struct vgic_io_device *region, + int nr_irqs, bool offset_private); + +unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); + +void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, + unsigned long data); + +unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + +unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + +void vgic_mmio_write_config(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); + +unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); + +#endif diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c new file mode 100644 index 0000000..8ad42c2 --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -0,0 +1,352 @@ +/* + * Copyright (C) 2015, 2016 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/irqchip/arm-gic.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <kvm/arm_vgic.h> +#include <asm/kvm_mmu.h> + +#include "vgic.h" + +/* + * Call this function to convert a u64 value to an unsigned long * bitmask + * in a way that works on both 32-bit and 64-bit LE and BE platforms. + * + * Warning: Calling this function may modify *val. + */ +static unsigned long *u64_to_bitmask(u64 *val) +{ +#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32 + *val = (*val >> 32) | (*val << 32); +#endif + return (unsigned long *)val; +} + +void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + + if (cpuif->vgic_misr & GICH_MISR_EOI) { + u64 eisr = cpuif->vgic_eisr; + unsigned long *eisr_bmap = u64_to_bitmask(&eisr); + int lr; + + for_each_set_bit(lr, eisr_bmap, kvm_vgic_global_state.nr_lr) { + u32 intid = cpuif->vgic_lr[lr] & GICH_LR_VIRTUALID; + + WARN_ON(cpuif->vgic_lr[lr] & GICH_LR_STATE); + + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + } + } + + /* check and disable underflow maintenance IRQ */ + cpuif->vgic_hcr &= ~GICH_HCR_UIE; + + /* + * In the next iterations of the vcpu loop, if we sync the + * vgic state after flushing it, but before entering the guest + * (this happens for pending signals and vmid rollovers), then + * make sure we don't pick up any old maintenance interrupts + * here. + */ + cpuif->vgic_eisr = 0; +} + +void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + + cpuif->vgic_hcr |= GICH_HCR_UIE; +} + +/* + * transfer the content of the LRs back into the corresponding ap_list: + * - active bit is transferred as is + * - pending bit is + * - transferred as is in case of edge sensitive IRQs + * - set to the line-level (resample time) for level sensitive IRQs + */ +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + int lr; + + for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { + u32 val = cpuif->vgic_lr[lr]; + u32 intid = val & GICH_LR_VIRTUALID; + struct vgic_irq *irq; + + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + + spin_lock(&irq->irq_lock); + + /* Always preserve the active bit */ + irq->active = !!(val & GICH_LR_ACTIVE_BIT); + + /* Edge is the only case where we preserve the pending bit */ + if (irq->config == VGIC_CONFIG_EDGE && + (val & GICH_LR_PENDING_BIT)) { + irq->pending = true; + + if (vgic_irq_is_sgi(intid)) { + u32 cpuid = val & GICH_LR_PHYSID_CPUID; + + cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; + irq->source |= (1 << cpuid); + } + } + + /* Clear soft pending state when level IRQs have been acked */ + if (irq->config == VGIC_CONFIG_LEVEL && + !(val & GICH_LR_PENDING_BIT)) { + irq->soft_pending = false; + irq->pending = irq->line_level; + } + + spin_unlock(&irq->irq_lock); + } +} + +/* + * Populates the particular LR with the state of a given IRQ: + * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq + * - for a level sensitive IRQ the pending state value is unchanged; + * it is dictated directly by the input level + * + * If @irq describes an SGI with multiple sources, we choose the + * lowest-numbered source VCPU and clear that bit in the source bitmap. + * + * The irq_lock must be held by the caller. + */ +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 val = irq->intid; + + if (irq->pending) { + val |= GICH_LR_PENDING_BIT; + + if (irq->config == VGIC_CONFIG_EDGE) + irq->pending = false; + + if (vgic_irq_is_sgi(irq->intid)) { + u32 src = ffs(irq->source); + + BUG_ON(!src); + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + irq->source &= ~(1 << (src - 1)); + if (irq->source) + irq->pending = true; + } + } + + if (irq->active) + val |= GICH_LR_ACTIVE_BIT; + + if (irq->hw) { + val |= GICH_LR_HW; + val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT; + } else { + if (irq->config == VGIC_CONFIG_LEVEL) + val |= GICH_LR_EOI; + } + + /* The GICv2 LR only holds five bits of priority. */ + val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; + + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; +} + +void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0; +} + +void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + u32 vmcr; + + vmcr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK; + vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & + GICH_VMCR_ALIAS_BINPOINT_MASK; + vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & + GICH_VMCR_BINPOINT_MASK; + vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & + GICH_VMCR_PRIMASK_MASK; + + vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr; +} + +void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr; + + vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> + GICH_VMCR_CTRL_SHIFT; + vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> + GICH_VMCR_ALIAS_BINPOINT_SHIFT; + vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> + GICH_VMCR_BINPOINT_SHIFT; + vmcrp->pmr = (vmcr & GICH_VMCR_PRIMASK_MASK) >> + GICH_VMCR_PRIMASK_SHIFT; +} + +void vgic_v2_enable(struct kvm_vcpu *vcpu) +{ + /* + * By forcing VMCR to zero, the GIC will restore the binary + * points to their reset values. Anything else resets to zero + * anyway. + */ + vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; + vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0; + + /* Get the show on the road... */ + vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; +} + +/* check for overlapping regions and for regions crossing the end of memory */ +static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base) +{ + if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base) + return false; + if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base) + return false; + + if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base) + return true; + if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base) + return true; + + return false; +} + +int vgic_v2_map_resources(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret = 0; + + if (vgic_ready(kvm)) + goto out; + + if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || + IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { + kvm_err("Need to set vgic cpu and dist addresses first\n"); + ret = -ENXIO; + goto out; + } + + if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) { + kvm_err("VGIC CPU and dist frames overlap\n"); + ret = -EINVAL; + goto out; + } + + /* + * Initialize the vgic if this hasn't already been done on demand by + * accessing the vgic state from userspace. + */ + ret = vgic_init(kvm); + if (ret) { + kvm_err("Unable to initialize VGIC dynamic data structures\n"); + goto out; + } + + ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2); + if (ret) { + kvm_err("Unable to register VGIC MMIO regions\n"); + goto out; + } + + ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, + kvm_vgic_global_state.vcpu_base, + KVM_VGIC_V2_CPU_SIZE, true); + if (ret) { + kvm_err("Unable to remap VGIC CPU to VCPU\n"); + goto out; + } + + dist->ready = true; + +out: + if (ret) + kvm_vgic_destroy(kvm); + return ret; +} + +/** + * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT + * @node: pointer to the DT node + * + * Returns 0 if a GICv2 has been found, returns an error code otherwise + */ +int vgic_v2_probe(const struct gic_kvm_info *info) +{ + int ret; + u32 vtr; + + if (!info->vctrl.start) { + kvm_err("GICH not present in the firmware table\n"); + return -ENXIO; + } + + if (!PAGE_ALIGNED(info->vcpu.start)) { + kvm_err("GICV physical address 0x%llx not page aligned\n", + (unsigned long long)info->vcpu.start); + return -ENXIO; + } + + if (!PAGE_ALIGNED(resource_size(&info->vcpu))) { + kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n", + (unsigned long long)resource_size(&info->vcpu), + PAGE_SIZE); + return -ENXIO; + } + + kvm_vgic_global_state.vctrl_base = ioremap(info->vctrl.start, + resource_size(&info->vctrl)); + if (!kvm_vgic_global_state.vctrl_base) { + kvm_err("Cannot ioremap GICH\n"); + return -ENOMEM; + } + + vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR); + kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1; + + ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base, + kvm_vgic_global_state.vctrl_base + + resource_size(&info->vctrl), + info->vctrl.start); + + if (ret) { + kvm_err("Cannot map VCTRL into hyp\n"); + iounmap(kvm_vgic_global_state.vctrl_base); + return ret; + } + + kvm_vgic_global_state.can_emulate_gicv2 = true; + kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + + kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.type = VGIC_V2; + kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; + + kvm_info("vgic-v2@%llx\n", info->vctrl.start); + + return 0; +} diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c new file mode 100644 index 0000000..336a461 --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -0,0 +1,330 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/irqchip/arm-gic-v3.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <kvm/arm_vgic.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_asm.h> + +#include "vgic.h" + +void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + + if (cpuif->vgic_misr & ICH_MISR_EOI) { + unsigned long eisr_bmap = cpuif->vgic_eisr; + int lr; + + for_each_set_bit(lr, &eisr_bmap, kvm_vgic_global_state.nr_lr) { + u32 intid; + u64 val = cpuif->vgic_lr[lr]; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V3) + intid = val & ICH_LR_VIRTUAL_ID_MASK; + else + intid = val & GICH_LR_VIRTUALID; + + WARN_ON(cpuif->vgic_lr[lr] & ICH_LR_STATE); + + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + } + + /* + * In the next iterations of the vcpu loop, if we sync + * the vgic state after flushing it, but before + * entering the guest (this happens for pending + * signals and vmid rollovers), then make sure we + * don't pick up any old maintenance interrupts here. + */ + cpuif->vgic_eisr = 0; + } + + cpuif->vgic_hcr &= ~ICH_HCR_UIE; +} + +void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + + cpuif->vgic_hcr |= ICH_HCR_UIE; +} + +void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; + int lr; + + for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { + u64 val = cpuif->vgic_lr[lr]; + u32 intid; + struct vgic_irq *irq; + + if (model == KVM_DEV_TYPE_ARM_VGIC_V3) + intid = val & ICH_LR_VIRTUAL_ID_MASK; + else + intid = val & GICH_LR_VIRTUALID; + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + + spin_lock(&irq->irq_lock); + + /* Always preserve the active bit */ + irq->active = !!(val & ICH_LR_ACTIVE_BIT); + + /* Edge is the only case where we preserve the pending bit */ + if (irq->config == VGIC_CONFIG_EDGE && + (val & ICH_LR_PENDING_BIT)) { + irq->pending = true; + + if (vgic_irq_is_sgi(intid) && + model == KVM_DEV_TYPE_ARM_VGIC_V2) { + u32 cpuid = val & GICH_LR_PHYSID_CPUID; + + cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; + irq->source |= (1 << cpuid); + } + } + + /* Clear soft pending state when level irqs have been acked */ + if (irq->config == VGIC_CONFIG_LEVEL && + !(val & ICH_LR_PENDING_BIT)) { + irq->soft_pending = false; + irq->pending = irq->line_level; + } + + spin_unlock(&irq->irq_lock); + } +} + +/* Requires the irq to be locked already */ +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u64 val = irq->intid; + + if (irq->pending) { + val |= ICH_LR_PENDING_BIT; + + if (irq->config == VGIC_CONFIG_EDGE) + irq->pending = false; + + if (vgic_irq_is_sgi(irq->intid) && + model == KVM_DEV_TYPE_ARM_VGIC_V2) { + u32 src = ffs(irq->source); + + BUG_ON(!src); + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + irq->source &= ~(1 << (src - 1)); + if (irq->source) + irq->pending = true; + } + } + + if (irq->active) + val |= ICH_LR_ACTIVE_BIT; + + if (irq->hw) { + val |= ICH_LR_HW; + val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; + } else { + if (irq->config == VGIC_CONFIG_LEVEL) + val |= ICH_LR_EOI; + } + + /* + * We currently only support Group1 interrupts, which is a + * known defect. This needs to be addressed at some point. + */ + if (model == KVM_DEV_TYPE_ARM_VGIC_V3) + val |= ICH_LR_GROUP; + + val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; + + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; +} + +void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0; +} + +void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + u32 vmcr; + + vmcr = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK; + vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; + vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; + vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; + + vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr; +} + +void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr; + + vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT; + vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; + vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; + vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; +} + +void vgic_v3_enable(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + + /* + * By forcing VMCR to zero, the GIC will restore the binary + * points to their reset values. Anything else resets to zero + * anyway. + */ + vgic_v3->vgic_vmcr = 0; + vgic_v3->vgic_elrsr = ~0; + + /* + * If we are emulating a GICv3, we do it in an non-GICv2-compatible + * way, so we force SRE to 1 to demonstrate this to the guest. + * This goes with the spec allowing the value to be RAO/WI. + */ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + vgic_v3->vgic_sre = ICC_SRE_EL1_SRE; + else + vgic_v3->vgic_sre = 0; + + /* Get the show on the road... */ + vgic_v3->vgic_hcr = ICH_HCR_EN; +} + +/* check for overlapping regions and for regions crossing the end of memory */ +static bool vgic_v3_check_base(struct kvm *kvm) +{ + struct vgic_dist *d = &kvm->arch.vgic; + gpa_t redist_size = KVM_VGIC_V3_REDIST_SIZE; + + redist_size *= atomic_read(&kvm->online_vcpus); + + if (d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base) + return false; + if (d->vgic_redist_base + redist_size < d->vgic_redist_base) + return false; + + if (d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE <= d->vgic_redist_base) + return true; + if (d->vgic_redist_base + redist_size <= d->vgic_dist_base) + return true; + + return false; +} + +int vgic_v3_map_resources(struct kvm *kvm) +{ + int ret = 0; + struct vgic_dist *dist = &kvm->arch.vgic; + + if (vgic_ready(kvm)) + goto out; + + if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || + IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) { + kvm_err("Need to set vgic distributor addresses first\n"); + ret = -ENXIO; + goto out; + } + + if (!vgic_v3_check_base(kvm)) { + kvm_err("VGIC redist and dist frames overlap\n"); + ret = -EINVAL; + goto out; + } + + /* + * For a VGICv3 we require the userland to explicitly initialize + * the VGIC before we need to use it. + */ + if (!vgic_initialized(kvm)) { + ret = -EBUSY; + goto out; + } + + ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3); + if (ret) { + kvm_err("Unable to register VGICv3 dist MMIO regions\n"); + goto out; + } + + ret = vgic_register_redist_iodevs(kvm, dist->vgic_redist_base); + if (ret) { + kvm_err("Unable to register VGICv3 redist MMIO regions\n"); + goto out; + } + + dist->ready = true; + +out: + if (ret) + kvm_vgic_destroy(kvm); + return ret; +} + +/** + * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT + * @node: pointer to the DT node + * + * Returns 0 if a GICv3 has been found, returns an error code otherwise + */ +int vgic_v3_probe(const struct gic_kvm_info *info) +{ + u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); + + /* + * The ListRegs field is 5 bits, but there is a architectural + * maximum of 16 list registers. Just ignore bit 4... + */ + kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; + kvm_vgic_global_state.can_emulate_gicv2 = false; + + if (!info->vcpu.start) { + kvm_info("GICv3: no GICV resource entry\n"); + kvm_vgic_global_state.vcpu_base = 0; + } else if (!PAGE_ALIGNED(info->vcpu.start)) { + pr_warn("GICV physical address 0x%llx not page aligned\n", + (unsigned long long)info->vcpu.start); + kvm_vgic_global_state.vcpu_base = 0; + } else if (!PAGE_ALIGNED(resource_size(&info->vcpu))) { + pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n", + (unsigned long long)resource_size(&info->vcpu), + PAGE_SIZE); + kvm_vgic_global_state.vcpu_base = 0; + } else { + kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.can_emulate_gicv2 = true; + kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + kvm_info("vgic-v2@%llx\n", info->vcpu.start); + } + if (kvm_vgic_global_state.vcpu_base == 0) + kvm_info("disabling GICv2 emulation\n"); + kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); + + kvm_vgic_global_state.vctrl_base = NULL; + kvm_vgic_global_state.type = VGIC_V3; + kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; + + return 0; +} diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c new file mode 100644 index 0000000..69b61ab --- /dev/null +++ b/virt/kvm/arm/vgic/vgic.c @@ -0,0 +1,619 @@ +/* + * Copyright (C) 2015, 2016 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/list_sort.h> + +#include "vgic.h" + +#define CREATE_TRACE_POINTS +#include "../trace.h" + +#ifdef CONFIG_DEBUG_SPINLOCK +#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p) +#else +#define DEBUG_SPINLOCK_BUG_ON(p) +#endif + +struct vgic_global __section(.hyp.text) kvm_vgic_global_state; + +/* + * Locking order is always: + * vgic_cpu->ap_list_lock + * vgic_irq->irq_lock + * + * (that is, always take the ap_list_lock before the struct vgic_irq lock). + * + * When taking more than one ap_list_lock at the same time, always take the + * lowest numbered VCPU's ap_list_lock first, so: + * vcpuX->vcpu_id < vcpuY->vcpu_id: + * spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock); + * spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); + */ + +struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, + u32 intid) +{ + /* SGIs and PPIs */ + if (intid <= VGIC_MAX_PRIVATE) + return &vcpu->arch.vgic_cpu.private_irqs[intid]; + + /* SPIs */ + if (intid <= VGIC_MAX_SPI) + return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; + + /* LPIs are not yet covered */ + if (intid >= VGIC_MIN_LPI) + return NULL; + + WARN(1, "Looking up struct vgic_irq for reserved INTID"); + return NULL; +} + +/** + * kvm_vgic_target_oracle - compute the target vcpu for an irq + * + * @irq: The irq to route. Must be already locked. + * + * Based on the current state of the interrupt (enabled, pending, + * active, vcpu and target_vcpu), compute the next vcpu this should be + * given to. Return NULL if this shouldn't be injected at all. + * + * Requires the IRQ lock to be held. + */ +static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) +{ + DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); + + /* If the interrupt is active, it must stay on the current vcpu */ + if (irq->active) + return irq->vcpu ? : irq->target_vcpu; + + /* + * If the IRQ is not active but enabled and pending, we should direct + * it to its configured target VCPU. + * If the distributor is disabled, pending interrupts shouldn't be + * forwarded. + */ + if (irq->enabled && irq->pending) { + if (unlikely(irq->target_vcpu && + !irq->target_vcpu->kvm->arch.vgic.enabled)) + return NULL; + + return irq->target_vcpu; + } + + /* If neither active nor pending and enabled, then this IRQ should not + * be queued to any VCPU. + */ + return NULL; +} + +/* + * The order of items in the ap_lists defines how we'll pack things in LRs as + * well, the first items in the list being the first things populated in the + * LRs. + * + * A hard rule is that active interrupts can never be pushed out of the LRs + * (and therefore take priority) since we cannot reliably trap on deactivation + * of IRQs and therefore they have to be present in the LRs. + * + * Otherwise things should be sorted by the priority field and the GIC + * hardware support will take care of preemption of priority groups etc. + * + * Return negative if "a" sorts before "b", 0 to preserve order, and positive + * to sort "b" before "a". + */ +static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list); + struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list); + bool penda, pendb; + int ret; + + spin_lock(&irqa->irq_lock); + spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); + + if (irqa->active || irqb->active) { + ret = (int)irqb->active - (int)irqa->active; + goto out; + } + + penda = irqa->enabled && irqa->pending; + pendb = irqb->enabled && irqb->pending; + + if (!penda || !pendb) { + ret = (int)pendb - (int)penda; + goto out; + } + + /* Both pending and enabled, sort by priority */ + ret = irqa->priority - irqb->priority; +out: + spin_unlock(&irqb->irq_lock); + spin_unlock(&irqa->irq_lock); + return ret; +} + +/* Must be called with the ap_list_lock held */ +static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); + + list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); +} + +/* + * Only valid injection if changing level for level-triggered IRQs or for a + * rising edge. + */ +static bool vgic_validate_injection(struct vgic_irq *irq, bool level) +{ + switch (irq->config) { + case VGIC_CONFIG_LEVEL: + return irq->line_level != level; + case VGIC_CONFIG_EDGE: + return level; + } + + return false; +} + +/* + * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list. + * Do the queuing if necessary, taking the right locks in the right order. + * Returns true when the IRQ was queued, false otherwise. + * + * Needs to be entered with the IRQ lock already held, but will return + * with all locks dropped. + */ +bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq) +{ + struct kvm_vcpu *vcpu; + + DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); + +retry: + vcpu = vgic_target_oracle(irq); + if (irq->vcpu || !vcpu) { + /* + * If this IRQ is already on a VCPU's ap_list, then it + * cannot be moved or modified and there is no more work for + * us to do. + * + * Otherwise, if the irq is not pending and enabled, it does + * not need to be inserted into an ap_list and there is also + * no more work for us to do. + */ + spin_unlock(&irq->irq_lock); + return false; + } + + /* + * We must unlock the irq lock to take the ap_list_lock where + * we are going to insert this new pending interrupt. + */ + spin_unlock(&irq->irq_lock); + + /* someone can do stuff here, which we re-check below */ + + spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); + spin_lock(&irq->irq_lock); + + /* + * Did something change behind our backs? + * + * There are two cases: + * 1) The irq lost its pending state or was disabled behind our + * backs and/or it was queued to another VCPU's ap_list. + * 2) Someone changed the affinity on this irq behind our + * backs and we are now holding the wrong ap_list_lock. + * + * In both cases, drop the locks and retry. + */ + + if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) { + spin_unlock(&irq->irq_lock); + spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); + + spin_lock(&irq->irq_lock); + goto retry; + } + + list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); + irq->vcpu = vcpu; + + spin_unlock(&irq->irq_lock); + spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); + + kvm_vcpu_kick(vcpu); + + return true; +} + +static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, + unsigned int intid, bool level, + bool mapped_irq) +{ + struct kvm_vcpu *vcpu; + struct vgic_irq *irq; + int ret; + + trace_vgic_update_irq_pending(cpuid, intid, level); + + ret = vgic_lazy_init(kvm); + if (ret) + return ret; + + vcpu = kvm_get_vcpu(kvm, cpuid); + if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS) + return -EINVAL; + + irq = vgic_get_irq(kvm, vcpu, intid); + if (!irq) + return -EINVAL; + + if (irq->hw != mapped_irq) + return -EINVAL; + + spin_lock(&irq->irq_lock); + + if (!vgic_validate_injection(irq, level)) { + /* Nothing to see here, move along... */ + spin_unlock(&irq->irq_lock); + return 0; + } + + if (irq->config == VGIC_CONFIG_LEVEL) { + irq->line_level = level; + irq->pending = level || irq->soft_pending; + } else { + irq->pending = true; + } + + vgic_queue_irq_unlock(kvm, irq); + + return 0; +} + +/** + * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic + * @kvm: The VM structure pointer + * @cpuid: The CPU for PPIs + * @intid: The INTID to inject a new state to. + * @level: Edge-triggered: true: to trigger the interrupt + * false: to ignore the call + * Level-sensitive true: raise the input signal + * false: lower the input signal + * + * The VGIC is not concerned with devices being active-LOW or active-HIGH for + * level-sensitive interrupts. You can think of the level parameter as 1 + * being HIGH and 0 being LOW and all devices being active-HIGH. + */ +int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, + bool level) +{ + return vgic_update_irq_pending(kvm, cpuid, intid, level, false); +} + +int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid, + bool level) +{ + return vgic_update_irq_pending(kvm, cpuid, intid, level, true); +} + +int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq) +{ + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); + + BUG_ON(!irq); + + spin_lock(&irq->irq_lock); + + irq->hw = true; + irq->hwintid = phys_irq; + + spin_unlock(&irq->irq_lock); + + return 0; +} + +int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq) +{ + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); + + BUG_ON(!irq); + + if (!vgic_initialized(vcpu->kvm)) + return -EAGAIN; + + spin_lock(&irq->irq_lock); + + irq->hw = false; + irq->hwintid = 0; + + spin_unlock(&irq->irq_lock); + + return 0; +} + +/** + * vgic_prune_ap_list - Remove non-relevant interrupts from the list + * + * @vcpu: The VCPU pointer + * + * Go over the list of "interesting" interrupts, and prune those that we + * won't have to consider in the near future. + */ +static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq, *tmp; + +retry: + spin_lock(&vgic_cpu->ap_list_lock); + + list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { + struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB; + + spin_lock(&irq->irq_lock); + + BUG_ON(vcpu != irq->vcpu); + + target_vcpu = vgic_target_oracle(irq); + + if (!target_vcpu) { + /* + * We don't need to process this interrupt any + * further, move it off the list. + */ + list_del(&irq->ap_list); + irq->vcpu = NULL; + spin_unlock(&irq->irq_lock); + continue; + } + + if (target_vcpu == vcpu) { + /* We're on the right CPU */ + spin_unlock(&irq->irq_lock); + continue; + } + + /* This interrupt looks like it has to be migrated. */ + + spin_unlock(&irq->irq_lock); + spin_unlock(&vgic_cpu->ap_list_lock); + + /* + * Ensure locking order by always locking the smallest + * ID first. + */ + if (vcpu->vcpu_id < target_vcpu->vcpu_id) { + vcpuA = vcpu; + vcpuB = target_vcpu; + } else { + vcpuA = target_vcpu; + vcpuB = vcpu; + } + + spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock); + spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock, + SINGLE_DEPTH_NESTING); + spin_lock(&irq->irq_lock); + + /* + * If the affinity has been preserved, move the + * interrupt around. Otherwise, it means things have + * changed while the interrupt was unlocked, and we + * need to replay this. + * + * In all cases, we cannot trust the list not to have + * changed, so we restart from the beginning. + */ + if (target_vcpu == vgic_target_oracle(irq)) { + struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu; + + list_del(&irq->ap_list); + irq->vcpu = target_vcpu; + list_add_tail(&irq->ap_list, &new_cpu->ap_list_head); + } + + spin_unlock(&irq->irq_lock); + spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock); + spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock); + goto retry; + } + + spin_unlock(&vgic_cpu->ap_list_lock); +} + +static inline void vgic_process_maintenance_interrupt(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_process_maintenance(vcpu); + else + vgic_v3_process_maintenance(vcpu); +} + +static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_fold_lr_state(vcpu); + else + vgic_v3_fold_lr_state(vcpu); +} + +/* Requires the irq_lock to be held. */ +static inline void vgic_populate_lr(struct kvm_vcpu *vcpu, + struct vgic_irq *irq, int lr) +{ + DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_populate_lr(vcpu, irq, lr); + else + vgic_v3_populate_lr(vcpu, irq, lr); +} + +static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_clear_lr(vcpu, lr); + else + vgic_v3_clear_lr(vcpu, lr); +} + +static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_underflow(vcpu); + else + vgic_v3_set_underflow(vcpu); +} + +/* Requires the ap_list_lock to be held. */ +static int compute_ap_list_depth(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq; + int count = 0; + + DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); + + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + spin_lock(&irq->irq_lock); + /* GICv2 SGIs can count for more than one... */ + if (vgic_irq_is_sgi(irq->intid) && irq->source) + count += hweight8(irq->source); + else + count++; + spin_unlock(&irq->irq_lock); + } + return count; +} + +/* Requires the VCPU's ap_list_lock to be held. */ +static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq; + int count = 0; + + DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); + + if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr) { + vgic_set_underflow(vcpu); + vgic_sort_ap_list(vcpu); + } + + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + spin_lock(&irq->irq_lock); + + if (unlikely(vgic_target_oracle(irq) != vcpu)) + goto next; + + /* + * If we get an SGI with multiple sources, try to get + * them in all at once. + */ + do { + vgic_populate_lr(vcpu, irq, count++); + } while (irq->source && count < kvm_vgic_global_state.nr_lr); + +next: + spin_unlock(&irq->irq_lock); + + if (count == kvm_vgic_global_state.nr_lr) + break; + } + + vcpu->arch.vgic_cpu.used_lrs = count; + + /* Nuke remaining LRs */ + for ( ; count < kvm_vgic_global_state.nr_lr; count++) + vgic_clear_lr(vcpu, count); +} + +/* Sync back the hardware VGIC state into our emulation after a guest's run. */ +void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) +{ + vgic_process_maintenance_interrupt(vcpu); + vgic_fold_lr_state(vcpu); + vgic_prune_ap_list(vcpu); +} + +/* Flush our emulation state into the GIC hardware before entering the guest. */ +void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); + vgic_flush_lr_state(vcpu); + spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); +} + +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq; + bool pending = false; + + if (!vcpu->kvm->arch.vgic.enabled) + return false; + + spin_lock(&vgic_cpu->ap_list_lock); + + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + spin_lock(&irq->irq_lock); + pending = irq->pending && irq->enabled; + spin_unlock(&irq->irq_lock); + + if (pending) + break; + } + + spin_unlock(&vgic_cpu->ap_list_lock); + + return pending; +} + +void vgic_kick_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int c; + + /* + * We've injected an interrupt, time to find out who deserves + * a good kick... + */ + kvm_for_each_vcpu(c, vcpu, kvm) { + if (kvm_vgic_vcpu_pending_irq(vcpu)) + kvm_vcpu_kick(vcpu); + } +} + +bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq) +{ + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); + bool map_is_active; + + spin_lock(&irq->irq_lock); + map_is_active = irq->hw && irq->active; + spin_unlock(&irq->irq_lock); + + return map_is_active; +} diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h new file mode 100644 index 0000000..7b300ca --- /dev/null +++ b/virt/kvm/arm/vgic/vgic.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2015, 2016 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef __KVM_ARM_VGIC_NEW_H__ +#define __KVM_ARM_VGIC_NEW_H__ + +#include <linux/irqchip/arm-gic-common.h> + +#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ +#define IMPLEMENTER_ARM 0x43b + +#define VGIC_ADDR_UNDEF (-1) +#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) + +#define INTERRUPT_ID_BITS_SPIS 10 +#define VGIC_PRI_BITS 5 + +#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) + +struct vgic_vmcr { + u32 ctlr; + u32 abpr; + u32 bpr; + u32 pmr; +}; + +struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, + u32 intid); +bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq); +void vgic_kick_vcpus(struct kvm *kvm); + +void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu); +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); +void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); +int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); +int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val); +int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, + int offset, u32 *val); +void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v2_enable(struct kvm_vcpu *vcpu); +int vgic_v2_probe(const struct gic_kvm_info *info); +int vgic_v2_map_resources(struct kvm *kvm); +int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, + enum vgic_type); + +#ifdef CONFIG_KVM_ARM_VGIC_V3 +void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu); +void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); +void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v3_enable(struct kvm_vcpu *vcpu); +int vgic_v3_probe(const struct gic_kvm_info *info); +int vgic_v3_map_resources(struct kvm *kvm); +int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); +#else +static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) +{ +} + +static inline void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) +{ +} + +static inline void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, + struct vgic_irq *irq, int lr) +{ +} + +static inline void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) +{ +} + +static inline void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +{ +} + +static inline +void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ +} + +static inline +void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ +} + +static inline void vgic_v3_enable(struct kvm_vcpu *vcpu) +{ +} + +static inline int vgic_v3_probe(const struct gic_kvm_info *info) +{ + return -ENODEV; +} + +static inline int vgic_v3_map_resources(struct kvm *kvm) +{ + return -ENODEV; +} + +static inline int vgic_register_redist_iodevs(struct kvm *kvm, + gpa_t dist_base_address) +{ + return -ENODEV; +} +#endif + +void kvm_register_vgic_device(unsigned long type); +int vgic_lazy_init(struct kvm *kvm); +int vgic_init(struct kvm *kvm); + +#endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dd4ac9d..37af230 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -63,6 +63,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> +/* Worst case buffer size needed for holding an integer. */ +#define ITOA_MAX_LEN 12 + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -100,6 +103,9 @@ static __read_mostly struct preempt_ops kvm_preempt_ops; struct dentry *kvm_debugfs_dir; EXPORT_SYMBOL_GPL(kvm_debugfs_dir); +static int kvm_debugfs_num_entries; +static const struct file_operations *stat_fops_per_vm[]; + static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); #ifdef CONFIG_KVM_COMPAT @@ -542,6 +548,58 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) kvfree(slots); } +static void kvm_destroy_vm_debugfs(struct kvm *kvm) +{ + int i; + + if (!kvm->debugfs_dentry) + return; + + debugfs_remove_recursive(kvm->debugfs_dentry); + + for (i = 0; i < kvm_debugfs_num_entries; i++) + kfree(kvm->debugfs_stat_data[i]); + kfree(kvm->debugfs_stat_data); +} + +static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) +{ + char dir_name[ITOA_MAX_LEN * 2]; + struct kvm_stat_data *stat_data; + struct kvm_stats_debugfs_item *p; + + if (!debugfs_initialized()) + return 0; + + snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); + kvm->debugfs_dentry = debugfs_create_dir(dir_name, + kvm_debugfs_dir); + if (!kvm->debugfs_dentry) + return -ENOMEM; + + kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, + sizeof(*kvm->debugfs_stat_data), + GFP_KERNEL); + if (!kvm->debugfs_stat_data) + return -ENOMEM; + + for (p = debugfs_entries; p->name; p++) { + stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL); + if (!stat_data) + return -ENOMEM; + + stat_data->kvm = kvm; + stat_data->offset = p->offset; + kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; + if (!debugfs_create_file(p->name, 0444, + kvm->debugfs_dentry, + stat_data, + stat_fops_per_vm[p->kind])) + return -ENOMEM; + } + return 0; +} + static struct kvm *kvm_create_vm(unsigned long type) { int r, i; @@ -647,6 +705,7 @@ static void kvm_destroy_vm(struct kvm *kvm) int i; struct mm_struct *mm = kvm->mm; + kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); spin_lock(&kvm_lock); list_del(&kvm->vm_list); @@ -2999,8 +3058,15 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) } #endif r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC); - if (r < 0) + if (r < 0) { kvm_put_kvm(kvm); + return r; + } + + if (kvm_create_vm_debugfs(kvm, r) < 0) { + kvm_put_kvm(kvm); + return -ENOMEM; + } return r; } @@ -3425,15 +3491,114 @@ static struct notifier_block kvm_cpu_notifier = { .notifier_call = kvm_cpu_hotplug, }; +static int kvm_debugfs_open(struct inode *inode, struct file *file, + int (*get)(void *, u64 *), int (*set)(void *, u64), + const char *fmt) +{ + struct kvm_stat_data *stat_data = (struct kvm_stat_data *) + inode->i_private; + + /* The debugfs files are a reference to the kvm struct which + * is still valid when kvm_destroy_vm is called. + * To avoid the race between open and the removal of the debugfs + * directory we test against the users count. + */ + if (!atomic_add_unless(&stat_data->kvm->users_count, 1, 0)) + return -ENOENT; + + if (simple_attr_open(inode, file, get, set, fmt)) { + kvm_put_kvm(stat_data->kvm); + return -ENOMEM; + } + + return 0; +} + +static int kvm_debugfs_release(struct inode *inode, struct file *file) +{ + struct kvm_stat_data *stat_data = (struct kvm_stat_data *) + inode->i_private; + + simple_attr_release(inode, file); + kvm_put_kvm(stat_data->kvm); + + return 0; +} + +static int vm_stat_get_per_vm(void *data, u64 *val) +{ + struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; + + *val = *(u32 *)((void *)stat_data->kvm + stat_data->offset); + + return 0; +} + +static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file) +{ + __simple_attr_check_format("%llu\n", 0ull); + return kvm_debugfs_open(inode, file, vm_stat_get_per_vm, + NULL, "%llu\n"); +} + +static const struct file_operations vm_stat_get_per_vm_fops = { + .owner = THIS_MODULE, + .open = vm_stat_get_per_vm_open, + .release = kvm_debugfs_release, + .read = simple_attr_read, + .write = simple_attr_write, + .llseek = generic_file_llseek, +}; + +static int vcpu_stat_get_per_vm(void *data, u64 *val) +{ + int i; + struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; + struct kvm_vcpu *vcpu; + + *val = 0; + + kvm_for_each_vcpu(i, vcpu, stat_data->kvm) + *val += *(u32 *)((void *)vcpu + stat_data->offset); + + return 0; +} + +static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file) +{ + __simple_attr_check_format("%llu\n", 0ull); + return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm, + NULL, "%llu\n"); +} + +static const struct file_operations vcpu_stat_get_per_vm_fops = { + .owner = THIS_MODULE, + .open = vcpu_stat_get_per_vm_open, + .release = kvm_debugfs_release, + .read = simple_attr_read, + .write = simple_attr_write, + .llseek = generic_file_llseek, +}; + +static const struct file_operations *stat_fops_per_vm[] = { + [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops, + [KVM_STAT_VM] = &vm_stat_get_per_vm_fops, +}; + static int vm_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; + struct kvm_stat_data stat_tmp = {.offset = offset}; + u64 tmp_val; *val = 0; spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) - *val += *(u32 *)((void *)kvm + offset); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + *val += tmp_val; + } spin_unlock(&kvm_lock); return 0; } @@ -3444,15 +3609,16 @@ static int vcpu_stat_get(void *_offset, u64 *val) { unsigned offset = (long)_offset; struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; + struct kvm_stat_data stat_tmp = {.offset = offset}; + u64 tmp_val; *val = 0; spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) - kvm_for_each_vcpu(i, vcpu, kvm) - *val += *(u32 *)((void *)vcpu + offset); - + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + *val += tmp_val; + } spin_unlock(&kvm_lock); return 0; } @@ -3473,7 +3639,8 @@ static int kvm_init_debug(void) if (kvm_debugfs_dir == NULL) goto out; - for (p = debugfs_entries; p->name; ++p) { + kvm_debugfs_num_entries = 0; + for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { if (!debugfs_create_file(p->name, 0444, kvm_debugfs_dir, (void *)(long)p->offset, stat_fops[p->kind])) |