diff options
author | Timothy Pearson <tpearson@raptorengineering.com> | 2017-08-23 14:45:25 -0500 |
---|---|---|
committer | Timothy Pearson <tpearson@raptorengineering.com> | 2017-08-23 14:45:25 -0500 |
commit | fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204 (patch) | |
tree | 22962a4387943edc841c72a4e636a068c66d58fd /virt/kvm | |
download | ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.zip ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.tar.gz |
Initial import of modified Linux 2.6.28 tree
Original upstream URL:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git | branch linux-2.6.28.y
Diffstat (limited to 'virt/kvm')
-rw-r--r-- | virt/kvm/coalesced_mmio.c | 156 | ||||
-rw-r--r-- | virt/kvm/coalesced_mmio.h | 23 | ||||
-rw-r--r-- | virt/kvm/ioapic.c | 426 | ||||
-rw-r--r-- | virt/kvm/ioapic.h | 89 | ||||
-rw-r--r-- | virt/kvm/iodev.h | 65 | ||||
-rw-r--r-- | virt/kvm/irq_comm.c | 96 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 2106 | ||||
-rw-r--r-- | virt/kvm/kvm_trace.c | 284 | ||||
-rw-r--r-- | virt/kvm/vtd.c | 191 |
9 files changed, 3436 insertions, 0 deletions
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c new file mode 100644 index 0000000..5ae620d --- /dev/null +++ b/virt/kvm/coalesced_mmio.c @@ -0,0 +1,156 @@ +/* + * KVM coalesced MMIO + * + * Copyright (c) 2008 Bull S.A.S. + * + * Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + */ + +#include "iodev.h" + +#include <linux/kvm_host.h> +#include <linux/kvm.h> + +#include "coalesced_mmio.h" + +static int coalesced_mmio_in_range(struct kvm_io_device *this, + gpa_t addr, int len, int is_write) +{ + struct kvm_coalesced_mmio_dev *dev = + (struct kvm_coalesced_mmio_dev*)this->private; + struct kvm_coalesced_mmio_zone *zone; + int next; + int i; + + if (!is_write) + return 0; + + /* kvm->lock is taken by the caller and must be not released before + * dev.read/write + */ + + /* Are we able to batch it ? */ + + /* last is the first free entry + * check if we don't meet the first used entry + * there is always one unused entry in the buffer + */ + + next = (dev->kvm->coalesced_mmio_ring->last + 1) % + KVM_COALESCED_MMIO_MAX; + if (next == dev->kvm->coalesced_mmio_ring->first) { + /* full */ + return 0; + } + + /* is it in a batchable area ? */ + + for (i = 0; i < dev->nb_zones; i++) { + zone = &dev->zone[i]; + + /* (addr,len) is fully included in + * (zone->addr, zone->size) + */ + + if (zone->addr <= addr && + addr + len <= zone->addr + zone->size) + return 1; + } + return 0; +} + +static void coalesced_mmio_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *val) +{ + struct kvm_coalesced_mmio_dev *dev = + (struct kvm_coalesced_mmio_dev*)this->private; + struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; + + /* kvm->lock must be taken by caller before call to in_range()*/ + + /* copy data in first free entry of the ring */ + + ring->coalesced_mmio[ring->last].phys_addr = addr; + ring->coalesced_mmio[ring->last].len = len; + memcpy(ring->coalesced_mmio[ring->last].data, val, len); + smp_wmb(); + ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; +} + +static void coalesced_mmio_destructor(struct kvm_io_device *this) +{ + kfree(this); +} + +int kvm_coalesced_mmio_init(struct kvm *kvm) +{ + struct kvm_coalesced_mmio_dev *dev; + + dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->dev.write = coalesced_mmio_write; + dev->dev.in_range = coalesced_mmio_in_range; + dev->dev.destructor = coalesced_mmio_destructor; + dev->dev.private = dev; + dev->kvm = kvm; + kvm->coalesced_mmio_dev = dev; + kvm_io_bus_register_dev(&kvm->mmio_bus, &dev->dev); + + return 0; +} + +int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone) +{ + struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; + + if (dev == NULL) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { + mutex_unlock(&kvm->lock); + return -ENOBUFS; + } + + dev->zone[dev->nb_zones] = *zone; + dev->nb_zones++; + + mutex_unlock(&kvm->lock); + return 0; +} + +int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone) +{ + int i; + struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; + struct kvm_coalesced_mmio_zone *z; + + if (dev == NULL) + return -EINVAL; + + mutex_lock(&kvm->lock); + + i = dev->nb_zones; + while(i) { + z = &dev->zone[i - 1]; + + /* unregister all zones + * included in (zone->addr, zone->size) + */ + + if (zone->addr <= z->addr && + z->addr + z->size <= zone->addr + zone->size) { + dev->nb_zones--; + *z = dev->zone[dev->nb_zones]; + } + i--; + } + + mutex_unlock(&kvm->lock); + + return 0; +} diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h new file mode 100644 index 0000000..5ac0ec6 --- /dev/null +++ b/virt/kvm/coalesced_mmio.h @@ -0,0 +1,23 @@ +/* + * KVM coalesced MMIO + * + * Copyright (c) 2008 Bull S.A.S. + * + * Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + */ + +#define KVM_COALESCED_MMIO_ZONE_MAX 100 + +struct kvm_coalesced_mmio_dev { + struct kvm_io_device dev; + struct kvm *kvm; + int nb_zones; + struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; +}; + +int kvm_coalesced_mmio_init(struct kvm *kvm); +int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone); +int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, + struct kvm_coalesced_mmio_zone *zone); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c new file mode 100644 index 0000000..53772bb --- /dev/null +++ b/virt/kvm/ioapic.c @@ -0,0 +1,426 @@ +/* + * Copyright (C) 2001 MandrakeSoft S.A. + * + * MandrakeSoft S.A. + * 43, rue d'Aboukir + * 75002 Paris - France + * http://www.linux-mandrake.com/ + * http://www.mandrakesoft.com/ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Yunhong Jiang <yunhong.jiang@intel.com> + * Yaozu (Eddie) Dong <eddie.dong@intel.com> + * Based on Xen 3.1 code. + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/smp.h> +#include <linux/hrtimer.h> +#include <linux/io.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/current.h> + +#include "ioapic.h" +#include "lapic.h" +#include "irq.h" + +#if 0 +#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) +#else +#define ioapic_debug(fmt, arg...) +#endif +static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq); + +static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, + unsigned long addr, + unsigned long length) +{ + unsigned long result = 0; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16) + | (IOAPIC_VERSION_ID & 0xff)); + break; + + case IOAPIC_REG_APIC_ID: + case IOAPIC_REG_ARB_ID: + result = ((ioapic->id & 0xf) << 24); + break; + + default: + { + u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; + u64 redir_content; + + ASSERT(redir_index < IOAPIC_NUM_PINS); + + redir_content = ioapic->redirtbl[redir_index].bits; + result = (ioapic->ioregsel & 0x1) ? + (redir_content >> 32) & 0xffffffff : + redir_content & 0xffffffff; + break; + } + } + + return result; +} + +static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) +{ + union ioapic_redir_entry *pent; + + pent = &ioapic->redirtbl[idx]; + + if (!pent->fields.mask) { + int injected = ioapic_deliver(ioapic, idx); + if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) + pent->fields.remote_irr = 1; + } + if (!pent->fields.trig_mode) + ioapic->irr &= ~(1 << idx); +} + +static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) +{ + unsigned index; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + /* Writes are ignored. */ + break; + + case IOAPIC_REG_APIC_ID: + ioapic->id = (val >> 24) & 0xf; + break; + + case IOAPIC_REG_ARB_ID: + break; + + default: + index = (ioapic->ioregsel - 0x10) >> 1; + + ioapic_debug("change redir index %x val %x\n", index, val); + if (index >= IOAPIC_NUM_PINS) + return; + if (ioapic->ioregsel & 1) { + ioapic->redirtbl[index].bits &= 0xffffffff; + ioapic->redirtbl[index].bits |= (u64) val << 32; + } else { + ioapic->redirtbl[index].bits &= ~0xffffffffULL; + ioapic->redirtbl[index].bits |= (u32) val; + ioapic->redirtbl[index].fields.remote_irr = 0; + } + if (ioapic->irr & (1 << index)) + ioapic_service(ioapic, index); + break; + } +} + +static int ioapic_inj_irq(struct kvm_ioapic *ioapic, + struct kvm_vcpu *vcpu, + u8 vector, u8 trig_mode, u8 delivery_mode) +{ + ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode, + delivery_mode); + + ASSERT((delivery_mode == IOAPIC_FIXED) || + (delivery_mode == IOAPIC_LOWEST_PRIORITY)); + + return kvm_apic_set_irq(vcpu, vector, trig_mode); +} + +static void ioapic_inj_nmi(struct kvm_vcpu *vcpu) +{ + kvm_inject_nmi(vcpu); +} + +static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, + u8 dest_mode) +{ + u32 mask = 0; + int i; + struct kvm *kvm = ioapic->kvm; + struct kvm_vcpu *vcpu; + + ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode); + + if (dest_mode == 0) { /* Physical mode. */ + if (dest == 0xFF) { /* Broadcast. */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) + if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic) + mask |= 1 << i; + return mask; + } + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) { + if (vcpu->arch.apic) + mask = 1 << i; + break; + } + } + } else if (dest != 0) /* Logical mode, MDA non-zero. */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (vcpu->arch.apic && + kvm_apic_match_logical_addr(vcpu->arch.apic, dest)) + mask |= 1 << vcpu->vcpu_id; + } + ioapic_debug("mask %x\n", mask); + return mask; +} + +static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) +{ + u8 dest = ioapic->redirtbl[irq].fields.dest_id; + u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode; + u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode; + u8 vector = ioapic->redirtbl[irq].fields.vector; + u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; + u32 deliver_bitmask; + struct kvm_vcpu *vcpu; + int vcpu_id, r = 0; + + ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " + "vector=%x trig_mode=%x\n", + dest, dest_mode, delivery_mode, vector, trig_mode); + + deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); + if (!deliver_bitmask) { + ioapic_debug("no target on destination\n"); + return 0; + } + + switch (delivery_mode) { + case IOAPIC_LOWEST_PRIORITY: + vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, + deliver_bitmask); +#ifdef CONFIG_X86 + if (irq == 0) + vcpu = ioapic->kvm->vcpus[0]; +#endif + if (vcpu != NULL) + r = ioapic_inj_irq(ioapic, vcpu, vector, + trig_mode, delivery_mode); + else + ioapic_debug("null lowest prio vcpu: " + "mask=%x vector=%x delivery_mode=%x\n", + deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY); + break; + case IOAPIC_FIXED: +#ifdef CONFIG_X86 + if (irq == 0) + deliver_bitmask = 1; +#endif + for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { + if (!(deliver_bitmask & (1 << vcpu_id))) + continue; + deliver_bitmask &= ~(1 << vcpu_id); + vcpu = ioapic->kvm->vcpus[vcpu_id]; + if (vcpu) { + r = ioapic_inj_irq(ioapic, vcpu, vector, + trig_mode, delivery_mode); + } + } + break; + case IOAPIC_NMI: + for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { + if (!(deliver_bitmask & (1 << vcpu_id))) + continue; + deliver_bitmask &= ~(1 << vcpu_id); + vcpu = ioapic->kvm->vcpus[vcpu_id]; + if (vcpu) + ioapic_inj_nmi(vcpu); + else + ioapic_debug("NMI to vcpu %d failed\n", + vcpu->vcpu_id); + } + break; + default: + printk(KERN_WARNING "Unsupported delivery mode %d\n", + delivery_mode); + break; + } + return r; +} + +void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) +{ + u32 old_irr = ioapic->irr; + u32 mask = 1 << irq; + union ioapic_redir_entry entry; + + if (irq >= 0 && irq < IOAPIC_NUM_PINS) { + entry = ioapic->redirtbl[irq]; + level ^= entry.fields.polarity; + if (!level) + ioapic->irr &= ~mask; + else { + ioapic->irr |= mask; + if ((!entry.fields.trig_mode && old_irr != ioapic->irr) + || !entry.fields.remote_irr) + ioapic_service(ioapic, irq); + } + } +} + +static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi, + int trigger_mode) +{ + union ioapic_redir_entry *ent; + + ent = &ioapic->redirtbl[gsi]; + + kvm_notify_acked_irq(ioapic->kvm, gsi); + + if (trigger_mode == IOAPIC_LEVEL_TRIG) { + ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); + ent->fields.remote_irr = 0; + if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) + ioapic_service(ioapic, gsi); + } +} + +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) + if (ioapic->redirtbl[i].fields.vector == vector) + __kvm_ioapic_update_eoi(ioapic, i, trigger_mode); +} + +static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, + int len, int is_write) +{ + struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + + return ((addr >= ioapic->base_address && + (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); +} + +static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) +{ + struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + u32 result; + + ioapic_debug("addr %lx\n", (unsigned long)addr); + ASSERT(!(addr & 0xf)); /* check alignment */ + + addr &= 0xff; + switch (addr) { + case IOAPIC_REG_SELECT: + result = ioapic->ioregsel; + break; + + case IOAPIC_REG_WINDOW: + result = ioapic_read_indirect(ioapic, addr, len); + break; + + default: + result = 0; + break; + } + switch (len) { + case 8: + *(u64 *) val = result; + break; + case 1: + case 2: + case 4: + memcpy(val, (char *)&result, len); + break; + default: + printk(KERN_WARNING "ioapic: wrong length %d\n", len); + } +} + +static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) +{ + struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + u32 data; + + ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", + (void*)addr, len, val); + ASSERT(!(addr & 0xf)); /* check alignment */ + if (len == 4 || len == 8) + data = *(u32 *) val; + else { + printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); + return; + } + + addr &= 0xff; + switch (addr) { + case IOAPIC_REG_SELECT: + ioapic->ioregsel = data; + break; + + case IOAPIC_REG_WINDOW: + ioapic_write_indirect(ioapic, data); + break; +#ifdef CONFIG_IA64 + case IOAPIC_REG_EOI: + kvm_ioapic_update_eoi(ioapic->kvm, data, IOAPIC_LEVEL_TRIG); + break; +#endif + + default: + break; + } +} + +void kvm_ioapic_reset(struct kvm_ioapic *ioapic) +{ + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) + ioapic->redirtbl[i].fields.mask = 1; + ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; + ioapic->ioregsel = 0; + ioapic->irr = 0; + ioapic->id = 0; +} + +int kvm_ioapic_init(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic; + + ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); + if (!ioapic) + return -ENOMEM; + kvm->arch.vioapic = ioapic; + kvm_ioapic_reset(ioapic); + ioapic->dev.read = ioapic_mmio_read; + ioapic->dev.write = ioapic_mmio_write; + ioapic->dev.in_range = ioapic_in_range; + ioapic->dev.private = ioapic; + ioapic->kvm = kvm; + kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); + return 0; +} diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h new file mode 100644 index 0000000..cd7ae76 --- /dev/null +++ b/virt/kvm/ioapic.h @@ -0,0 +1,89 @@ +#ifndef __KVM_IO_APIC_H +#define __KVM_IO_APIC_H + +#include <linux/kvm_host.h> + +#include "iodev.h" + +struct kvm; +struct kvm_vcpu; + +#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ +#define IOAPIC_EDGE_TRIG 0 +#define IOAPIC_LEVEL_TRIG 1 + +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 +#define IOAPIC_MEM_LENGTH 0x100 + +/* Direct registers. */ +#define IOAPIC_REG_SELECT 0x00 +#define IOAPIC_REG_WINDOW 0x10 +#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ + +/* Indirect registers. */ +#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ +#define IOAPIC_REG_VERSION 0x01 +#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ + +/*ioapic delivery mode*/ +#define IOAPIC_FIXED 0x0 +#define IOAPIC_LOWEST_PRIORITY 0x1 +#define IOAPIC_PMI 0x2 +#define IOAPIC_NMI 0x4 +#define IOAPIC_INIT 0x5 +#define IOAPIC_EXTINT 0x7 + +struct kvm_ioapic { + u64 base_address; + u32 ioregsel; + u32 id; + u32 irr; + u32 pad; + union ioapic_redir_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; + } redirtbl[IOAPIC_NUM_PINS]; + struct kvm_io_device dev; + struct kvm *kvm; + void (*ack_notifier)(void *opaque, int irq); +}; + +#ifdef DEBUG +#define ASSERT(x) \ +do { \ + if (!(x)) { \ + printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ + __FILE__, __LINE__, #x); \ + BUG(); \ + } \ +} while (0) +#else +#define ASSERT(x) do { } while (0) +#endif + +static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) +{ + return kvm->arch.vioapic; +} + +struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, + unsigned long bitmap); +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); +int kvm_ioapic_init(struct kvm *kvm); +void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); +void kvm_ioapic_reset(struct kvm_ioapic *ioapic); + +#endif diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h new file mode 100644 index 0000000..55e8846 --- /dev/null +++ b/virt/kvm/iodev.h @@ -0,0 +1,65 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __KVM_IODEV_H__ +#define __KVM_IODEV_H__ + +#include <linux/kvm_types.h> + +struct kvm_io_device { + void (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + void (*write)(struct kvm_io_device *this, + gpa_t addr, + int len, + const void *val); + int (*in_range)(struct kvm_io_device *this, gpa_t addr, int len, + int is_write); + void (*destructor)(struct kvm_io_device *this); + + void *private; +}; + +static inline void kvm_iodevice_read(struct kvm_io_device *dev, + gpa_t addr, + int len, + void *val) +{ + dev->read(dev, addr, len, val); +} + +static inline void kvm_iodevice_write(struct kvm_io_device *dev, + gpa_t addr, + int len, + const void *val) +{ + dev->write(dev, addr, len, val); +} + +static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, + gpa_t addr, int len, int is_write) +{ + return dev->in_range(dev, addr, len, is_write); +} + +static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) +{ + if (dev->destructor) + dev->destructor(dev); +} + +#endif /* __KVM_IODEV_H__ */ diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c new file mode 100644 index 0000000..55ad76e --- /dev/null +++ b/virt/kvm/irq_comm.c @@ -0,0 +1,96 @@ +/* + * irq_comm.c: Common API for in kernel interrupt controller + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + */ + +#include <linux/kvm_host.h> +#include "irq.h" + +#include "ioapic.h" + +/* This should be called with the kvm->lock mutex held */ +void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) +{ + unsigned long *irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; + + /* Logical OR for level trig interrupt */ + if (level) + set_bit(irq_source_id, irq_state); + else + clear_bit(irq_source_id, irq_state); + + /* Not possible to detect if the guest uses the PIC or the + * IOAPIC. So set the bit in both. The guest will ignore + * writes to the unused one. + */ + kvm_ioapic_set_irq(kvm->arch.vioapic, irq, !!(*irq_state)); +#ifdef CONFIG_X86 + kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state)); +#endif +} + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) +{ + struct kvm_irq_ack_notifier *kian; + struct hlist_node *n; + + hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link) + if (kian->gsi == gsi) + kian->irq_acked(kian); +} + +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); +} + +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + hlist_del(&kian->link); +} + +/* The caller must hold kvm->lock mutex */ +int kvm_request_irq_source_id(struct kvm *kvm) +{ + unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; + int irq_source_id = find_first_zero_bit(bitmap, + sizeof(kvm->arch.irq_sources_bitmap)); + if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { + printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); + irq_source_id = -EFAULT; + } else + set_bit(irq_source_id, bitmap); + return irq_source_id; +} + +void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) +{ + int i; + + if (irq_source_id <= 0 || + irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { + printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); + return; + } + for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) + clear_bit(irq_source_id, &kvm->arch.irq_states[i]); + clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c new file mode 100644 index 0000000..a87f45e --- /dev/null +++ b/virt/kvm/kvm_main.c @@ -0,0 +1,2106 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "iodev.h" + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/miscdevice.h> +#include <linux/vmalloc.h> +#include <linux/reboot.h> +#include <linux/debugfs.h> +#include <linux/highmem.h> +#include <linux/file.h> +#include <linux/sysdev.h> +#include <linux/cpu.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/smp.h> +#include <linux/anon_inodes.h> +#include <linux/profile.h> +#include <linux/kvm_para.h> +#include <linux/pagemap.h> +#include <linux/mman.h> +#include <linux/swap.h> + +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/uaccess.h> +#include <asm/pgtable.h> + +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#include "coalesced_mmio.h" +#endif + +#ifdef KVM_CAP_DEVICE_ASSIGNMENT +#include <linux/pci.h> +#include <linux/interrupt.h> +#include "irq.h" +#endif + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +DEFINE_SPINLOCK(kvm_lock); +LIST_HEAD(vm_list); + +static cpumask_t cpus_hardware_enabled; + +struct kmem_cache *kvm_vcpu_cache; +EXPORT_SYMBOL_GPL(kvm_vcpu_cache); + +static __read_mostly struct preempt_ops kvm_preempt_ops; + +struct dentry *kvm_debugfs_dir; + +static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, + unsigned long arg); + +bool kvm_rebooting; + +#ifdef KVM_CAP_DEVICE_ASSIGNMENT +static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, + int assigned_dev_id) +{ + struct list_head *ptr; + struct kvm_assigned_dev_kernel *match; + + list_for_each(ptr, head) { + match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + if (match->assigned_dev_id == assigned_dev_id) + return match; + } + return NULL; +} + +static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) +{ + struct kvm_assigned_dev_kernel *assigned_dev; + + assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, + interrupt_work); + + /* This is taken to safely inject irq inside the guest. When + * the interrupt injection (or the ioapic code) uses a + * finer-grained lock, update this + */ + mutex_lock(&assigned_dev->kvm->lock); + kvm_set_irq(assigned_dev->kvm, + assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + mutex_unlock(&assigned_dev->kvm->lock); + kvm_put_kvm(assigned_dev->kvm); +} + +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = + (struct kvm_assigned_dev_kernel *) dev_id; + + kvm_get_kvm(assigned_dev->kvm); + schedule_work(&assigned_dev->interrupt_work); + disable_irq_nosync(irq); + return IRQ_HANDLED; +} + +/* Ack the irq line for an assigned device */ +static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_assigned_dev_kernel *dev; + + if (kian->gsi == -1) + return; + + dev = container_of(kian, struct kvm_assigned_dev_kernel, + ack_notifier); + kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); + enable_irq(dev->host_irq); +} + +static void kvm_free_assigned_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) + free_irq(assigned_dev->host_irq, (void *)assigned_dev); + + kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); + kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); + + if (cancel_work_sync(&assigned_dev->interrupt_work)) + /* We had pending work. That means we will have to take + * care of kvm_put_kvm. + */ + kvm_put_kvm(kvm); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); +} + +void kvm_free_all_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + kvm_free_assigned_device(kvm, assigned_dev); + } +} + +static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + + if (match->irq_requested) { + match->guest_irq = assigned_irq->guest_irq; + match->ack_notifier.gsi = assigned_irq->guest_irq; + mutex_unlock(&kvm->lock); + return 0; + } + + INIT_WORK(&match->interrupt_work, + kvm_assigned_dev_interrupt_work_handler); + + if (irqchip_in_kernel(kvm)) { + if (!capable(CAP_SYS_RAWIO)) { + r = -EPERM; + goto out_release; + } + + if (assigned_irq->host_irq) + match->host_irq = assigned_irq->host_irq; + else + match->host_irq = match->dev->irq; + match->guest_irq = assigned_irq->guest_irq; + match->ack_notifier.gsi = assigned_irq->guest_irq; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + kvm_register_irq_ack_notifier(kvm, &match->ack_notifier); + r = kvm_request_irq_source_id(kvm); + if (r < 0) + goto out_release; + else + match->irq_source_id = r; + + /* Even though this is PCI, we don't want to use shared + * interrupts. Sharing host devices with guest-assigned devices + * on the same interrupt line is not a happy situation: there + * are going to be long delays in accepting, acking, etc. + */ + if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0, + "kvm_assigned_device", (void *)match)) { + r = -EIO; + goto out_release; + } + } + + match->irq_requested = true; + mutex_unlock(&kvm->lock); + return r; +out_release: + mutex_unlock(&kvm->lock); + kvm_free_assigned_device(kvm, match); + return r; +} + +static int kvm_vm_ioctl_assign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + struct pci_dev *dev; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (match) { + /* device already assigned */ + r = -EINVAL; + goto out; + } + + match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); + if (match == NULL) { + printk(KERN_INFO "%s: Couldn't allocate memory\n", + __func__); + r = -ENOMEM; + goto out; + } + dev = pci_get_bus_and_slot(assigned_dev->busnr, + assigned_dev->devfn); + if (!dev) { + printk(KERN_INFO "%s: host device not found\n", __func__); + r = -EINVAL; + goto out_free; + } + if (pci_enable_device(dev)) { + printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); + r = -EBUSY; + goto out_put; + } + r = pci_request_regions(dev, "kvm_assigned_device"); + if (r) { + printk(KERN_INFO "%s: Could not get access to device regions\n", + __func__); + goto out_disable; + } + match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_busnr = assigned_dev->busnr; + match->host_devfn = assigned_dev->devfn; + match->dev = dev; + + match->kvm = kvm; + + list_add(&match->list, &kvm->arch.assigned_dev_head); + + if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { + r = kvm_iommu_map_guest(kvm, match); + if (r) + goto out_list_del; + } + +out: + mutex_unlock(&kvm->lock); + return r; +out_list_del: + list_del(&match->list); + pci_release_regions(dev); +out_disable: + pci_disable_device(dev); +out_put: + pci_dev_put(dev); +out_free: + kfree(match); + mutex_unlock(&kvm->lock); + return r; +} +#endif + +static inline int valid_vcpu(int n) +{ + return likely(n >= 0 && n < KVM_MAX_VCPUS); +} + +inline int kvm_is_mmio_pfn(pfn_t pfn) +{ + if (pfn_valid(pfn)) + return PageReserved(pfn_to_page(pfn)); + + return true; +} + +/* + * Switches to specified vcpu, until a matching vcpu_put() + */ +void vcpu_load(struct kvm_vcpu *vcpu) +{ + int cpu; + + mutex_lock(&vcpu->mutex); + cpu = get_cpu(); + preempt_notifier_register(&vcpu->preempt_notifier); + kvm_arch_vcpu_load(vcpu, cpu); + put_cpu(); +} + +void vcpu_put(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + kvm_arch_vcpu_put(vcpu); + preempt_notifier_unregister(&vcpu->preempt_notifier); + preempt_enable(); + mutex_unlock(&vcpu->mutex); +} + +static void ack_flush(void *_completed) +{ +} + +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + int i, cpu, me; + cpumask_t cpus; + struct kvm_vcpu *vcpu; + + me = get_cpu(); + cpus_clear(cpus); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + continue; + cpu = vcpu->cpu; + if (cpu != -1 && cpu != me) + cpu_set(cpu, cpus); + } + if (cpus_empty(cpus)) + goto out; + ++kvm->stat.remote_tlb_flush; + smp_call_function_mask(cpus, ack_flush, NULL, 1); +out: + put_cpu(); +} + +void kvm_reload_remote_mmus(struct kvm *kvm) +{ + int i, cpu, me; + cpumask_t cpus; + struct kvm_vcpu *vcpu; + + me = get_cpu(); + cpus_clear(cpus); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + continue; + cpu = vcpu->cpu; + if (cpu != -1 && cpu != me) + cpu_set(cpu, cpus); + } + if (cpus_empty(cpus)) + goto out; + smp_call_function_mask(cpus, ack_flush, NULL, 1); +out: + put_cpu(); +} + + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +{ + struct page *page; + int r; + + mutex_init(&vcpu->mutex); + vcpu->cpu = -1; + vcpu->kvm = kvm; + vcpu->vcpu_id = id; + init_waitqueue_head(&vcpu->wq); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->run = page_address(page); + + r = kvm_arch_vcpu_init(vcpu); + if (r < 0) + goto fail_free_run; + return 0; + +fail_free_run: + free_page((unsigned long)vcpu->run); +fail: + return r; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_init); + +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvm_arch_vcpu_uninit(vcpu); + free_page((unsigned long)vcpu->run); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); + +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) +{ + return container_of(mn, struct kvm, mmu_notifier); +} + +static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int need_tlb_flush; + + /* + * When ->invalidate_page runs, the linux pte has been zapped + * already but the page is still allocated until + * ->invalidate_page returns. So if we increase the sequence + * here the kvm page fault will notice if the spte can't be + * established because the page is going to be freed. If + * instead the kvm page fault establishes the spte before + * ->invalidate_page runs, kvm_unmap_hva will release it + * before returning. + * + * The sequence increase only need to be seen at spin_unlock + * time, and not at spin_lock time. + * + * Increasing the sequence after the spin_unlock would be + * unsafe because the kvm page fault could then establish the + * pte after kvm_unmap_hva returned, without noticing the page + * is going to be freed. + */ + spin_lock(&kvm->mmu_lock); + kvm->mmu_notifier_seq++; + need_tlb_flush = kvm_unmap_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + + /* we've to flush the tlb before the pages can be freed */ + if (need_tlb_flush) + kvm_flush_remote_tlbs(kvm); + +} + +static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int need_tlb_flush = 0; + + spin_lock(&kvm->mmu_lock); + /* + * The count increase must become visible at unlock time as no + * spte can be established without taking the mmu_lock and + * count is also read inside the mmu_lock critical section. + */ + kvm->mmu_notifier_count++; + for (; start < end; start += PAGE_SIZE) + need_tlb_flush |= kvm_unmap_hva(kvm, start); + spin_unlock(&kvm->mmu_lock); + + /* we've to flush the tlb before the pages can be freed */ + if (need_tlb_flush) + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + + spin_lock(&kvm->mmu_lock); + /* + * This sequence increase will notify the kvm page fault that + * the page that is going to be mapped in the spte could have + * been freed. + */ + kvm->mmu_notifier_seq++; + /* + * The above sequence increase must be visible before the + * below count decrease but both values are read by the kvm + * page fault under mmu_lock spinlock so we don't need to add + * a smb_wmb() here in between the two. + */ + kvm->mmu_notifier_count--; + spin_unlock(&kvm->mmu_lock); + + BUG_ON(kvm->mmu_notifier_count < 0); +} + +static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young; + + spin_lock(&kvm->mmu_lock); + young = kvm_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + + if (young) + kvm_flush_remote_tlbs(kvm); + + return young; +} + +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .invalidate_page = kvm_mmu_notifier_invalidate_page, + .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, + .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, + .clear_flush_young = kvm_mmu_notifier_clear_flush_young, +}; +#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ + +static struct kvm *kvm_create_vm(void) +{ + struct kvm *kvm = kvm_arch_create_vm(); +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + struct page *page; +#endif + + if (IS_ERR(kvm)) + goto out; + +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + kfree(kvm); + return ERR_PTR(-ENOMEM); + } + kvm->coalesced_mmio_ring = + (struct kvm_coalesced_mmio_ring *)page_address(page); +#endif + +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) + { + int err; + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; + err = mmu_notifier_register(&kvm->mmu_notifier, current->mm); + if (err) { +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + put_page(page); +#endif + kfree(kvm); + return ERR_PTR(err); + } + } +#endif + + kvm->mm = current->mm; + atomic_inc(&kvm->mm->mm_count); + spin_lock_init(&kvm->mmu_lock); + kvm_io_bus_init(&kvm->pio_bus); + mutex_init(&kvm->lock); + kvm_io_bus_init(&kvm->mmio_bus); + init_rwsem(&kvm->slots_lock); + atomic_set(&kvm->users_count, 1); + spin_lock(&kvm_lock); + list_add(&kvm->vm_list, &vm_list); + spin_unlock(&kvm_lock); +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + kvm_coalesced_mmio_init(kvm); +#endif +out: + return kvm; +} + +/* + * Free any memory in @free but not in @dont. + */ +static void kvm_free_physmem_slot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + if (!dont || free->rmap != dont->rmap) + vfree(free->rmap); + + if (!dont || free->dirty_bitmap != dont->dirty_bitmap) + vfree(free->dirty_bitmap); + + if (!dont || free->lpage_info != dont->lpage_info) + vfree(free->lpage_info); + + free->npages = 0; + free->dirty_bitmap = NULL; + free->rmap = NULL; + free->lpage_info = NULL; +} + +void kvm_free_physmem(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->nmemslots; ++i) + kvm_free_physmem_slot(&kvm->memslots[i], NULL); +} + +static void kvm_destroy_vm(struct kvm *kvm) +{ + struct mm_struct *mm = kvm->mm; + + spin_lock(&kvm_lock); + list_del(&kvm->vm_list); + spin_unlock(&kvm_lock); + kvm_io_bus_destroy(&kvm->pio_bus); + kvm_io_bus_destroy(&kvm->mmio_bus); +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + if (kvm->coalesced_mmio_ring != NULL) + free_page((unsigned long)kvm->coalesced_mmio_ring); +#endif +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) + mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); +#endif + kvm_arch_destroy_vm(kvm); + mmdrop(mm); +} + +void kvm_get_kvm(struct kvm *kvm) +{ + atomic_inc(&kvm->users_count); +} +EXPORT_SYMBOL_GPL(kvm_get_kvm); + +void kvm_put_kvm(struct kvm *kvm) +{ + if (atomic_dec_and_test(&kvm->users_count)) + kvm_destroy_vm(kvm); +} +EXPORT_SYMBOL_GPL(kvm_put_kvm); + + +static int kvm_vm_release(struct inode *inode, struct file *filp) +{ + struct kvm *kvm = filp->private_data; + + kvm_put_kvm(kvm); + return 0; +} + +/* + * Allocate some memory and give it an address in the guest physical address + * space. + * + * Discontiguous memory is allowed, mostly for framebuffers. + * + * Must be called holding mmap_sem for write. + */ +int __kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc) +{ + int r; + gfn_t base_gfn; + unsigned long npages; + unsigned long i; + struct kvm_memory_slot *memslot; + struct kvm_memory_slot old, new; + + r = -EINVAL; + /* General sanity checks */ + if (mem->memory_size & (PAGE_SIZE - 1)) + goto out; + if (mem->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) + goto out; + if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) + goto out; + + memslot = &kvm->memslots[mem->slot]; + base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; + npages = mem->memory_size >> PAGE_SHIFT; + + if (!npages) + mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + + new = old = *memslot; + + new.base_gfn = base_gfn; + new.npages = npages; + new.flags = mem->flags; + + /* Disallow changing a memory slot's size. */ + r = -EINVAL; + if (npages && old.npages && npages != old.npages) + goto out_free; + + /* Check for overlaps */ + r = -EEXIST; + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *s = &kvm->memslots[i]; + + if (s == memslot) + continue; + if (!((base_gfn + npages <= s->base_gfn) || + (base_gfn >= s->base_gfn + s->npages))) + goto out_free; + } + + /* Free page dirty bitmap if unneeded */ + if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) + new.dirty_bitmap = NULL; + + r = -ENOMEM; + + /* Allocate if a slot is being created */ +#ifndef CONFIG_S390 + if (npages && !new.rmap) { + new.rmap = vmalloc(npages * sizeof(struct page *)); + + if (!new.rmap) + goto out_free; + + memset(new.rmap, 0, npages * sizeof(*new.rmap)); + + new.user_alloc = user_alloc; + /* + * hva_to_rmmap() serialzies with the mmu_lock and to be + * safe it has to ignore memslots with !user_alloc && + * !userspace_addr. + */ + if (user_alloc) + new.userspace_addr = mem->userspace_addr; + else + new.userspace_addr = 0; + } + if (npages && !new.lpage_info) { + int largepages = npages / KVM_PAGES_PER_HPAGE; + if (npages % KVM_PAGES_PER_HPAGE) + largepages++; + if (base_gfn % KVM_PAGES_PER_HPAGE) + largepages++; + + new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); + + if (!new.lpage_info) + goto out_free; + + memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); + + if (base_gfn % KVM_PAGES_PER_HPAGE) + new.lpage_info[0].write_count = 1; + if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) + new.lpage_info[largepages-1].write_count = 1; + } + + /* Allocate page dirty bitmap if needed */ + if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { + unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; + + new.dirty_bitmap = vmalloc(dirty_bytes); + if (!new.dirty_bitmap) + goto out_free; + memset(new.dirty_bitmap, 0, dirty_bytes); + } +#endif /* not defined CONFIG_S390 */ + + if (!npages) + kvm_arch_flush_shadow(kvm); + + spin_lock(&kvm->mmu_lock); + if (mem->slot >= kvm->nmemslots) + kvm->nmemslots = mem->slot + 1; + + *memslot = new; + spin_unlock(&kvm->mmu_lock); + + r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); + if (r) { + spin_lock(&kvm->mmu_lock); + *memslot = old; + spin_unlock(&kvm->mmu_lock); + goto out_free; + } + + kvm_free_physmem_slot(&old, &new); +#ifdef CONFIG_DMAR + /* map the pages in iommu page table */ + r = kvm_iommu_map_pages(kvm, base_gfn, npages); + if (r) + goto out; +#endif + return 0; + +out_free: + kvm_free_physmem_slot(&new, &old); +out: + return r; + +} +EXPORT_SYMBOL_GPL(__kvm_set_memory_region); + +int kvm_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + int user_alloc) +{ + int r; + + down_write(&kvm->slots_lock); + r = __kvm_set_memory_region(kvm, mem, user_alloc); + up_write(&kvm->slots_lock); + return r; +} +EXPORT_SYMBOL_GPL(kvm_set_memory_region); + +int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct + kvm_userspace_memory_region *mem, + int user_alloc) +{ + if (mem->slot >= KVM_MEMORY_SLOTS) + return -EINVAL; + return kvm_set_memory_region(kvm, mem, user_alloc); +} + +int kvm_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log, int *is_dirty) +{ + struct kvm_memory_slot *memslot; + int r, i; + int n; + unsigned long any = 0; + + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + + for (i = 0; !any && i < n/sizeof(long); ++i) + any = memslot->dirty_bitmap[i]; + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; + + if (any) + *is_dirty = 1; + + r = 0; +out: + return r; +} + +int is_error_page(struct page *page) +{ + return page == bad_page; +} +EXPORT_SYMBOL_GPL(is_error_page); + +int is_error_pfn(pfn_t pfn) +{ + return pfn == bad_pfn; +} +EXPORT_SYMBOL_GPL(is_error_pfn); + +static inline unsigned long bad_hva(void) +{ + return PAGE_OFFSET; +} + +int kvm_is_error_hva(unsigned long addr) +{ + return addr == bad_hva(); +} +EXPORT_SYMBOL_GPL(kvm_is_error_hva); + +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + int i; + + for (i = 0; i < kvm->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return memslot; + } + return NULL; +} + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + gfn = unalias_gfn(kvm, gfn); + return __gfn_to_memslot(kvm, gfn); +} + +int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + + gfn = unalias_gfn(kvm, gfn); + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); + +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + gfn = unalias_gfn(kvm, gfn); + slot = __gfn_to_memslot(kvm, gfn); + if (!slot) + return bad_hva(); + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); +} +EXPORT_SYMBOL_GPL(gfn_to_hva); + +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +{ + struct page *page[1]; + unsigned long addr; + int npages; + pfn_t pfn; + + might_sleep(); + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + npages = get_user_pages_fast(addr, 1, 1, page); + + if (unlikely(npages != 1)) { + struct vm_area_struct *vma; + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, addr); + + if (vma == NULL || addr < vma->vm_start || + !(vma->vm_flags & VM_PFNMAP)) { + up_read(¤t->mm->mmap_sem); + get_page(bad_page); + return page_to_pfn(bad_page); + } + + pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + up_read(¤t->mm->mmap_sem); + BUG_ON(!kvm_is_mmio_pfn(pfn)); + } else + pfn = page_to_pfn(page[0]); + + return pfn; +} + +EXPORT_SYMBOL_GPL(gfn_to_pfn); + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + pfn_t pfn; + + pfn = gfn_to_pfn(kvm, gfn); + if (!kvm_is_mmio_pfn(pfn)) + return pfn_to_page(pfn); + + WARN_ON(kvm_is_mmio_pfn(pfn)); + + get_page(bad_page); + return bad_page; +} + +EXPORT_SYMBOL_GPL(gfn_to_page); + +void kvm_release_page_clean(struct page *page) +{ + kvm_release_pfn_clean(page_to_pfn(page)); +} +EXPORT_SYMBOL_GPL(kvm_release_page_clean); + +void kvm_release_pfn_clean(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) + put_page(pfn_to_page(pfn)); +} +EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); + +void kvm_release_page_dirty(struct page *page) +{ + kvm_release_pfn_dirty(page_to_pfn(page)); +} +EXPORT_SYMBOL_GPL(kvm_release_page_dirty); + +void kvm_release_pfn_dirty(pfn_t pfn) +{ + kvm_set_pfn_dirty(pfn); + kvm_release_pfn_clean(pfn); +} +EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); + +void kvm_set_page_dirty(struct page *page) +{ + kvm_set_pfn_dirty(page_to_pfn(page)); +} +EXPORT_SYMBOL_GPL(kvm_set_page_dirty); + +void kvm_set_pfn_dirty(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) + SetPageDirty(page); + } +} +EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); + +void kvm_set_pfn_accessed(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) + mark_page_accessed(pfn_to_page(pfn)); +} +EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); + +void kvm_get_pfn(pfn_t pfn) +{ + if (!kvm_is_mmio_pfn(pfn)) + get_page(pfn_to_page(pfn)); +} +EXPORT_SYMBOL_GPL(kvm_get_pfn); + +static int next_segment(unsigned long len, int offset) +{ + if (len > PAGE_SIZE - offset) + return PAGE_SIZE - offset; + else + return len; +} + +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + r = copy_from_user(data, (void __user *)addr + offset, len); + if (r) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_read_guest_page); + +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_read_guest); + +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len) +{ + int r; + unsigned long addr; + gfn_t gfn = gpa >> PAGE_SHIFT; + int offset = offset_in_page(gpa); + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + pagefault_disable(); + r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); + pagefault_enable(); + if (r) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL(kvm_read_guest_atomic); + +int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, + int offset, int len) +{ + int r; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return -EFAULT; + r = copy_to_user((void __user *)addr + offset, data, len); + if (r) + return -EFAULT; + mark_page_dirty(kvm, gfn); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_page); + +int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, + unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + ++gfn; + } + return 0; +} + +int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) +{ + return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_clear_guest_page); + +int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + int seg; + int offset = offset_in_page(gpa); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + ret = kvm_clear_guest_page(kvm, gfn, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + ++gfn; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_clear_guest); + +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + + gfn = unalias_gfn(kvm, gfn); + memslot = __gfn_to_memslot(kvm, gfn); + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; + + /* avoid RMW */ + if (!test_bit(rel_gfn, memslot->dirty_bitmap)) + set_bit(rel_gfn, memslot->dirty_bitmap); + } +} + +/* + * The vCPU has executed a HLT instruction with in-kernel mode enabled. + */ +void kvm_vcpu_block(struct kvm_vcpu *vcpu) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + + if (kvm_cpu_has_interrupt(vcpu) || + kvm_cpu_has_pending_timer(vcpu) || + kvm_arch_vcpu_runnable(vcpu)) { + set_bit(KVM_REQ_UNHALT, &vcpu->requests); + break; + } + if (signal_pending(current)) + break; + + vcpu_put(vcpu); + schedule(); + vcpu_load(vcpu); + } + + finish_wait(&vcpu->wq, &wait); +} + +void kvm_resched(struct kvm_vcpu *vcpu) +{ + if (!need_resched()) + return; + cond_resched(); +} +EXPORT_SYMBOL_GPL(kvm_resched); + +static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct kvm_vcpu *vcpu = vma->vm_file->private_data; + struct page *page; + + if (vmf->pgoff == 0) + page = virt_to_page(vcpu->run); +#ifdef CONFIG_X86 + else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) + page = virt_to_page(vcpu->arch.pio_data); +#endif +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) + page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); +#endif + else + return VM_FAULT_SIGBUS; + get_page(page); + vmf->page = page; + return 0; +} + +static struct vm_operations_struct kvm_vcpu_vm_ops = { + .fault = kvm_vcpu_fault, +}; + +static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vcpu_vm_ops; + return 0; +} + +static int kvm_vcpu_release(struct inode *inode, struct file *filp) +{ + struct kvm_vcpu *vcpu = filp->private_data; + + kvm_put_kvm(vcpu->kvm); + return 0; +} + +static const struct file_operations kvm_vcpu_fops = { + .release = kvm_vcpu_release, + .unlocked_ioctl = kvm_vcpu_ioctl, + .compat_ioctl = kvm_vcpu_ioctl, + .mmap = kvm_vcpu_mmap, +}; + +/* + * Allocates an inode for the vcpu. + */ +static int create_vcpu_fd(struct kvm_vcpu *vcpu) +{ + int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); + if (fd < 0) + kvm_put_kvm(vcpu->kvm); + return fd; +} + +/* + * Creates some virtual cpus. Good luck creating more than one. + */ +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) +{ + int r; + struct kvm_vcpu *vcpu; + + if (!valid_vcpu(n)) + return -EINVAL; + + vcpu = kvm_arch_vcpu_create(kvm, n); + if (IS_ERR(vcpu)) + return PTR_ERR(vcpu); + + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + + r = kvm_arch_vcpu_setup(vcpu); + if (r) + return r; + + mutex_lock(&kvm->lock); + if (kvm->vcpus[n]) { + r = -EEXIST; + goto vcpu_destroy; + } + kvm->vcpus[n] = vcpu; + mutex_unlock(&kvm->lock); + + /* Now it's all set up, let userspace reach it */ + kvm_get_kvm(kvm); + r = create_vcpu_fd(vcpu); + if (r < 0) + goto unlink; + return r; + +unlink: + mutex_lock(&kvm->lock); + kvm->vcpus[n] = NULL; +vcpu_destroy: + mutex_unlock(&kvm->lock); + kvm_arch_vcpu_destroy(vcpu); + return r; +} + +static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) +{ + if (sigset) { + sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); + vcpu->sigset_active = 1; + vcpu->sigset = *sigset; + } else + vcpu->sigset_active = 0; + return 0; +} + +static long kvm_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int r; + struct kvm_fpu *fpu = NULL; + struct kvm_sregs *kvm_sregs = NULL; + + if (vcpu->kvm->mm != current->mm) + return -EIO; + switch (ioctl) { + case KVM_RUN: + r = -EINVAL; + if (arg) + goto out; + r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + break; + case KVM_GET_REGS: { + struct kvm_regs *kvm_regs; + + r = -ENOMEM; + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); + if (!kvm_regs) + goto out; + r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); + if (r) + goto out_free1; + r = -EFAULT; + if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) + goto out_free1; + r = 0; +out_free1: + kfree(kvm_regs); + break; + } + case KVM_SET_REGS: { + struct kvm_regs *kvm_regs; + + r = -ENOMEM; + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); + if (!kvm_regs) + goto out; + r = -EFAULT; + if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) + goto out_free2; + r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); + if (r) + goto out_free2; + r = 0; +out_free2: + kfree(kvm_regs); + break; + } + case KVM_GET_SREGS: { + kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); + r = -ENOMEM; + if (!kvm_sregs) + goto out; + r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) + goto out; + r = 0; + break; + } + case KVM_SET_SREGS: { + kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); + r = -ENOMEM; + if (!kvm_sregs) + goto out; + r = -EFAULT; + if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) + goto out; + r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_MP_STATE: { + struct kvm_mp_state mp_state; + + r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &mp_state, sizeof mp_state)) + goto out; + r = 0; + break; + } + case KVM_SET_MP_STATE: { + struct kvm_mp_state mp_state; + + r = -EFAULT; + if (copy_from_user(&mp_state, argp, sizeof mp_state)) + goto out; + r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); + if (r) + goto out; + r = 0; + break; + } + case KVM_TRANSLATE: { + struct kvm_translation tr; + + r = -EFAULT; + if (copy_from_user(&tr, argp, sizeof tr)) + goto out; + r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &tr, sizeof tr)) + goto out; + r = 0; + break; + } + case KVM_DEBUG_GUEST: { + struct kvm_debug_guest dbg; + + r = -EFAULT; + if (copy_from_user(&dbg, argp, sizeof dbg)) + goto out; + r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg); + if (r) + goto out; + r = 0; + break; + } + case KVM_SET_SIGNAL_MASK: { + struct kvm_signal_mask __user *sigmask_arg = argp; + struct kvm_signal_mask kvm_sigmask; + sigset_t sigset, *p; + + p = NULL; + if (argp) { + r = -EFAULT; + if (copy_from_user(&kvm_sigmask, argp, + sizeof kvm_sigmask)) + goto out; + r = -EINVAL; + if (kvm_sigmask.len != sizeof sigset) + goto out; + r = -EFAULT; + if (copy_from_user(&sigset, sigmask_arg->sigset, + sizeof sigset)) + goto out; + p = &sigset; + } + r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); + break; + } + case KVM_GET_FPU: { + fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); + r = -ENOMEM; + if (!fpu) + goto out; + r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) + goto out; + r = 0; + break; + } + case KVM_SET_FPU: { + fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); + r = -ENOMEM; + if (!fpu) + goto out; + r = -EFAULT; + if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) + goto out; + r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); + if (r) + goto out; + r = 0; + break; + } + default: + r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); + } +out: + kfree(fpu); + kfree(kvm_sregs); + return r; +} + +static long kvm_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + int r; + + if (kvm->mm != current->mm) + return -EIO; + switch (ioctl) { + case KVM_CREATE_VCPU: + r = kvm_vm_ioctl_create_vcpu(kvm, arg); + if (r < 0) + goto out; + break; + case KVM_SET_USER_MEMORY_REGION: { + struct kvm_userspace_memory_region kvm_userspace_mem; + + r = -EFAULT; + if (copy_from_user(&kvm_userspace_mem, argp, + sizeof kvm_userspace_mem)) + goto out; + + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); + if (r) + goto out; + break; + } + case KVM_GET_DIRTY_LOG: { + struct kvm_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof log)) + goto out; + r = kvm_vm_ioctl_get_dirty_log(kvm, &log); + if (r) + goto out; + break; + } +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + case KVM_REGISTER_COALESCED_MMIO: { + struct kvm_coalesced_mmio_zone zone; + r = -EFAULT; + if (copy_from_user(&zone, argp, sizeof zone)) + goto out; + r = -ENXIO; + r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); + if (r) + goto out; + r = 0; + break; + } + case KVM_UNREGISTER_COALESCED_MMIO: { + struct kvm_coalesced_mmio_zone zone; + r = -EFAULT; + if (copy_from_user(&zone, argp, sizeof zone)) + goto out; + r = -ENXIO; + r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); + if (r) + goto out; + r = 0; + break; + } +#endif +#ifdef KVM_CAP_DEVICE_ASSIGNMENT + case KVM_ASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } + case KVM_ASSIGN_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } +#endif + default: + r = kvm_arch_vm_ioctl(filp, ioctl, arg); + } +out: + return r; +} + +static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page[1]; + unsigned long addr; + int npages; + gfn_t gfn = vmf->pgoff; + struct kvm *kvm = vma->vm_file->private_data; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return VM_FAULT_SIGBUS; + + npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, + NULL); + if (unlikely(npages != 1)) + return VM_FAULT_SIGBUS; + + vmf->page = page[0]; + return 0; +} + +static struct vm_operations_struct kvm_vm_vm_ops = { + .fault = kvm_vm_fault, +}; + +static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vm_vm_ops; + return 0; +} + +static const struct file_operations kvm_vm_fops = { + .release = kvm_vm_release, + .unlocked_ioctl = kvm_vm_ioctl, + .compat_ioctl = kvm_vm_ioctl, + .mmap = kvm_vm_mmap, +}; + +static int kvm_dev_ioctl_create_vm(void) +{ + int fd; + struct kvm *kvm; + + kvm = kvm_create_vm(); + if (IS_ERR(kvm)) + return PTR_ERR(kvm); + fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0); + if (fd < 0) + kvm_put_kvm(kvm); + + return fd; +} + +static long kvm_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + long r = -EINVAL; + + switch (ioctl) { + case KVM_GET_API_VERSION: + r = -EINVAL; + if (arg) + goto out; + r = KVM_API_VERSION; + break; + case KVM_CREATE_VM: + r = -EINVAL; + if (arg) + goto out; + r = kvm_dev_ioctl_create_vm(); + break; + case KVM_CHECK_EXTENSION: + r = kvm_dev_ioctl_check_extension(arg); + break; + case KVM_GET_VCPU_MMAP_SIZE: + r = -EINVAL; + if (arg) + goto out; + r = PAGE_SIZE; /* struct kvm_run */ +#ifdef CONFIG_X86 + r += PAGE_SIZE; /* pio data page */ +#endif +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET + r += PAGE_SIZE; /* coalesced mmio ring page */ +#endif + break; + case KVM_TRACE_ENABLE: + case KVM_TRACE_PAUSE: + case KVM_TRACE_DISABLE: + r = kvm_trace_ioctl(ioctl, arg); + break; + default: + return kvm_arch_dev_ioctl(filp, ioctl, arg); + } +out: + return r; +} + +static struct file_operations kvm_chardev_ops = { + .unlocked_ioctl = kvm_dev_ioctl, + .compat_ioctl = kvm_dev_ioctl, +}; + +static struct miscdevice kvm_dev = { + KVM_MINOR, + "kvm", + &kvm_chardev_ops, +}; + +static void hardware_enable(void *junk) +{ + int cpu = raw_smp_processor_id(); + + if (cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_set(cpu, cpus_hardware_enabled); + kvm_arch_hardware_enable(NULL); +} + +static void hardware_disable(void *junk) +{ + int cpu = raw_smp_processor_id(); + + if (!cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_clear(cpu, cpus_hardware_enabled); + kvm_arch_hardware_disable(NULL); +} + +static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, + void *v) +{ + int cpu = (long)v; + + val &= ~CPU_TASKS_FROZEN; + switch (val) { + case CPU_DYING: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); + hardware_disable(NULL); + break; + case CPU_UP_CANCELED: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); + smp_call_function_single(cpu, hardware_disable, NULL, 1); + break; + case CPU_ONLINE: + printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", + cpu); + smp_call_function_single(cpu, hardware_enable, NULL, 1); + break; + } + return NOTIFY_OK; +} + + +asmlinkage void kvm_handle_fault_on_reboot(void) +{ + if (kvm_rebooting) + /* spin while reset goes on */ + while (true) + ; + /* Fault while not rebooting. We want the trace. */ + BUG(); +} +EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); + +static int kvm_reboot(struct notifier_block *notifier, unsigned long val, + void *v) +{ + if (val == SYS_RESTART) { + /* + * Some (well, at least mine) BIOSes hang on reboot if + * in vmx root mode. + */ + printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + kvm_rebooting = true; + on_each_cpu(hardware_disable, NULL, 1); + } + return NOTIFY_OK; +} + +static struct notifier_block kvm_reboot_notifier = { + .notifier_call = kvm_reboot, + .priority = 0, +}; + +void kvm_io_bus_init(struct kvm_io_bus *bus) +{ + memset(bus, 0, sizeof(*bus)); +} + +void kvm_io_bus_destroy(struct kvm_io_bus *bus) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; + + kvm_iodevice_destructor(pos); + } +} + +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, + gpa_t addr, int len, int is_write) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; + + if (pos->in_range(pos, addr, len, is_write)) + return pos; + } + + return NULL; +} + +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) +{ + BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); + + bus->devs[bus->dev_count++] = dev; +} + +static struct notifier_block kvm_cpu_notifier = { + .notifier_call = kvm_cpu_hotplug, + .priority = 20, /* must be > scheduler priority */ +}; + +static int vm_stat_get(void *_offset, u64 *val) +{ + unsigned offset = (long)_offset; + struct kvm *kvm; + + *val = 0; + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + *val += *(u32 *)((void *)kvm + offset); + spin_unlock(&kvm_lock); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); + +static int vcpu_stat_get(void *_offset, u64 *val) +{ + unsigned offset = (long)_offset; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; + + *val = 0; + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (vcpu) + *val += *(u32 *)((void *)vcpu + offset); + } + spin_unlock(&kvm_lock); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); + +static struct file_operations *stat_fops[] = { + [KVM_STAT_VCPU] = &vcpu_stat_fops, + [KVM_STAT_VM] = &vm_stat_fops, +}; + +static void kvm_init_debug(void) +{ + struct kvm_stats_debugfs_item *p; + + kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); + for (p = debugfs_entries; p->name; ++p) + p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, + (void *)(long)p->offset, + stat_fops[p->kind]); +} + +static void kvm_exit_debug(void) +{ + struct kvm_stats_debugfs_item *p; + + for (p = debugfs_entries; p->name; ++p) + debugfs_remove(p->dentry); + debugfs_remove(kvm_debugfs_dir); +} + +static int kvm_suspend(struct sys_device *dev, pm_message_t state) +{ + hardware_disable(NULL); + return 0; +} + +static int kvm_resume(struct sys_device *dev) +{ + hardware_enable(NULL); + return 0; +} + +static struct sysdev_class kvm_sysdev_class = { + .name = "kvm", + .suspend = kvm_suspend, + .resume = kvm_resume, +}; + +static struct sys_device kvm_sysdev = { + .id = 0, + .cls = &kvm_sysdev_class, +}; + +struct page *bad_page; +pfn_t bad_pfn; + +static inline +struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) +{ + return container_of(pn, struct kvm_vcpu, preempt_notifier); +} + +static void kvm_sched_in(struct preempt_notifier *pn, int cpu) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_arch_vcpu_load(vcpu, cpu); +} + +static void kvm_sched_out(struct preempt_notifier *pn, + struct task_struct *next) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_arch_vcpu_put(vcpu); +} + +int kvm_init(void *opaque, unsigned int vcpu_size, + struct module *module) +{ + int r; + int cpu; + + kvm_init_debug(); + + r = kvm_arch_init(opaque); + if (r) + goto out_fail; + + bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (bad_page == NULL) { + r = -ENOMEM; + goto out; + } + + bad_pfn = page_to_pfn(bad_page); + + r = kvm_arch_hardware_setup(); + if (r < 0) + goto out_free_0; + + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, + kvm_arch_check_processor_compat, + &r, 1); + if (r < 0) + goto out_free_1; + } + + on_each_cpu(hardware_enable, NULL, 1); + r = register_cpu_notifier(&kvm_cpu_notifier); + if (r) + goto out_free_2; + register_reboot_notifier(&kvm_reboot_notifier); + + r = sysdev_class_register(&kvm_sysdev_class); + if (r) + goto out_free_3; + + r = sysdev_register(&kvm_sysdev); + if (r) + goto out_free_4; + + /* A kmem cache lets us meet the alignment requirements of fx_save. */ + kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, + __alignof__(struct kvm_vcpu), + 0, NULL); + if (!kvm_vcpu_cache) { + r = -ENOMEM; + goto out_free_5; + } + + kvm_chardev_ops.owner = module; + + r = misc_register(&kvm_dev); + if (r) { + printk(KERN_ERR "kvm: misc device register failed\n"); + goto out_free; + } + + kvm_preempt_ops.sched_in = kvm_sched_in; + kvm_preempt_ops.sched_out = kvm_sched_out; + + return 0; + +out_free: + kmem_cache_destroy(kvm_vcpu_cache); +out_free_5: + sysdev_unregister(&kvm_sysdev); +out_free_4: + sysdev_class_unregister(&kvm_sysdev_class); +out_free_3: + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); +out_free_2: + on_each_cpu(hardware_disable, NULL, 1); +out_free_1: + kvm_arch_hardware_unsetup(); +out_free_0: + __free_page(bad_page); +out: + kvm_arch_exit(); + kvm_exit_debug(); +out_fail: + return r; +} +EXPORT_SYMBOL_GPL(kvm_init); + +void kvm_exit(void) +{ + kvm_trace_cleanup(); + misc_deregister(&kvm_dev); + kmem_cache_destroy(kvm_vcpu_cache); + sysdev_unregister(&kvm_sysdev); + sysdev_class_unregister(&kvm_sysdev_class); + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); + on_each_cpu(hardware_disable, NULL, 1); + kvm_arch_hardware_unsetup(); + kvm_arch_exit(); + kvm_exit_debug(); + __free_page(bad_page); +} +EXPORT_SYMBOL_GPL(kvm_exit); diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c new file mode 100644 index 0000000..41dcc84 --- /dev/null +++ b/virt/kvm/kvm_trace.c @@ -0,0 +1,284 @@ +/* + * kvm trace + * + * It is designed to allow debugging traces of kvm to be generated + * on UP / SMP machines. Each trace entry can be timestamped so that + * it's possible to reconstruct a chronological record of trace events. + * The implementation refers to blktrace kernel support. + * + * Copyright (c) 2008 Intel Corporation + * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> + * + * Authors: Feng(Eric) Liu, eric.e.liu@intel.com + * + * Date: Feb 2008 + */ + +#include <linux/module.h> +#include <linux/relay.h> +#include <linux/debugfs.h> +#include <linux/ktime.h> + +#include <linux/kvm_host.h> + +#define KVM_TRACE_STATE_RUNNING (1 << 0) +#define KVM_TRACE_STATE_PAUSE (1 << 1) +#define KVM_TRACE_STATE_CLEARUP (1 << 2) + +struct kvm_trace { + int trace_state; + struct rchan *rchan; + struct dentry *lost_file; + atomic_t lost_records; +}; +static struct kvm_trace *kvm_trace; + +struct kvm_trace_probe { + const char *name; + const char *format; + u32 timestamp_in; + marker_probe_func *probe_func; +}; + +static inline int calc_rec_size(int timestamp, int extra) +{ + int rec_size = KVM_TRC_HEAD_SIZE; + + rec_size += extra; + return timestamp ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size; +} + +static void kvm_add_trace(void *probe_private, void *call_data, + const char *format, va_list *args) +{ + struct kvm_trace_probe *p = probe_private; + struct kvm_trace *kt = kvm_trace; + struct kvm_trace_rec rec; + struct kvm_vcpu *vcpu; + int i, size; + u32 extra; + + if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING)) + return; + + rec.rec_val = TRACE_REC_EVENT_ID(va_arg(*args, u32)); + vcpu = va_arg(*args, struct kvm_vcpu *); + rec.pid = current->tgid; + rec.vcpu_id = vcpu->vcpu_id; + + extra = va_arg(*args, u32); + WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX)); + extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX); + + rec.rec_val |= TRACE_REC_TCS(p->timestamp_in) + | TRACE_REC_NUM_DATA_ARGS(extra); + + if (p->timestamp_in) { + rec.u.timestamp.timestamp = ktime_to_ns(ktime_get()); + + for (i = 0; i < extra; i++) + rec.u.timestamp.extra_u32[i] = va_arg(*args, u32); + } else { + for (i = 0; i < extra; i++) + rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32); + } + + size = calc_rec_size(p->timestamp_in, extra * sizeof(u32)); + relay_write(kt->rchan, &rec, size); +} + +static struct kvm_trace_probe kvm_trace_probes[] = { + { "kvm_trace_entryexit", "%u %p %u %u %u %u %u %u", 1, kvm_add_trace }, + { "kvm_trace_handler", "%u %p %u %u %u %u %u %u", 0, kvm_add_trace }, +}; + +static int lost_records_get(void *data, u64 *val) +{ + struct kvm_trace *kt = data; + + *val = atomic_read(&kt->lost_records); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(kvm_trace_lost_ops, lost_records_get, NULL, "%llu\n"); + +/* + * The relay channel is used in "no-overwrite" mode, it keeps trace of how + * many times we encountered a full subbuffer, to tell user space app the + * lost records there were. + */ +static int kvm_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + struct kvm_trace *kt; + + if (!relay_buf_full(buf)) { + if (!prev_subbuf) { + /* + * executed only once when the channel is opened + * save metadata as first record + */ + subbuf_start_reserve(buf, sizeof(u32)); + *(u32 *)subbuf = 0x12345678; + } + + return 1; + } + + kt = buf->chan->private_data; + atomic_inc(&kt->lost_records); + + return 0; +} + +static struct dentry *kvm_create_buf_file_callack(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static int kvm_remove_buf_file_callback(struct dentry *dentry) +{ + debugfs_remove(dentry); + return 0; +} + +static struct rchan_callbacks kvm_relay_callbacks = { + .subbuf_start = kvm_subbuf_start_callback, + .create_buf_file = kvm_create_buf_file_callack, + .remove_buf_file = kvm_remove_buf_file_callback, +}; + +static int do_kvm_trace_enable(struct kvm_user_trace_setup *kuts) +{ + struct kvm_trace *kt; + int i, r = -ENOMEM; + + if (!kuts->buf_size || !kuts->buf_nr) + return -EINVAL; + + kt = kzalloc(sizeof(*kt), GFP_KERNEL); + if (!kt) + goto err; + + r = -EIO; + atomic_set(&kt->lost_records, 0); + kt->lost_file = debugfs_create_file("lost_records", 0444, kvm_debugfs_dir, + kt, &kvm_trace_lost_ops); + if (!kt->lost_file) + goto err; + + kt->rchan = relay_open("trace", kvm_debugfs_dir, kuts->buf_size, + kuts->buf_nr, &kvm_relay_callbacks, kt); + if (!kt->rchan) + goto err; + + kvm_trace = kt; + + for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) { + struct kvm_trace_probe *p = &kvm_trace_probes[i]; + + r = marker_probe_register(p->name, p->format, p->probe_func, p); + if (r) + printk(KERN_INFO "Unable to register probe %s\n", + p->name); + } + + kvm_trace->trace_state = KVM_TRACE_STATE_RUNNING; + + return 0; +err: + if (kt) { + if (kt->lost_file) + debugfs_remove(kt->lost_file); + if (kt->rchan) + relay_close(kt->rchan); + kfree(kt); + } + return r; +} + +static int kvm_trace_enable(char __user *arg) +{ + struct kvm_user_trace_setup kuts; + int ret; + + ret = copy_from_user(&kuts, arg, sizeof(kuts)); + if (ret) + return -EFAULT; + + ret = do_kvm_trace_enable(&kuts); + if (ret) + return ret; + + return 0; +} + +static int kvm_trace_pause(void) +{ + struct kvm_trace *kt = kvm_trace; + int r = -EINVAL; + + if (kt == NULL) + return r; + + if (kt->trace_state == KVM_TRACE_STATE_RUNNING) { + kt->trace_state = KVM_TRACE_STATE_PAUSE; + relay_flush(kt->rchan); + r = 0; + } + + return r; +} + +void kvm_trace_cleanup(void) +{ + struct kvm_trace *kt = kvm_trace; + int i; + + if (kt == NULL) + return; + + if (kt->trace_state == KVM_TRACE_STATE_RUNNING || + kt->trace_state == KVM_TRACE_STATE_PAUSE) { + + kt->trace_state = KVM_TRACE_STATE_CLEARUP; + + for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) { + struct kvm_trace_probe *p = &kvm_trace_probes[i]; + marker_probe_unregister(p->name, p->probe_func, p); + } + + relay_close(kt->rchan); + debugfs_remove(kt->lost_file); + kfree(kt); + } +} + +int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + long r = -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (ioctl) { + case KVM_TRACE_ENABLE: + r = kvm_trace_enable(argp); + break; + case KVM_TRACE_PAUSE: + r = kvm_trace_pause(); + break; + case KVM_TRACE_DISABLE: + r = 0; + kvm_trace_cleanup(); + break; + } + + return r; +} diff --git a/virt/kvm/vtd.c b/virt/kvm/vtd.c new file mode 100644 index 0000000..a770874 --- /dev/null +++ b/virt/kvm/vtd.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Copyright IBM Corporation, 2008 + * Author: Allen M. Kay <allen.m.kay@intel.com> + * Author: Weidong Han <weidong.han@intel.com> + * Author: Ben-Ami Yassour <benami@il.ibm.com> + */ + +#include <linux/list.h> +#include <linux/kvm_host.h> +#include <linux/pci.h> +#include <linux/dmar.h> +#include <linux/intel-iommu.h> + +static int kvm_iommu_unmap_memslots(struct kvm *kvm); +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages); + +int kvm_iommu_map_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + gfn_t gfn = base_gfn; + pfn_t pfn; + int i, r = 0; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + for (i = 0; i < npages; i++) { + /* check if already mapped */ + pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, + gfn_to_gpa(gfn)); + if (pfn) + continue; + + pfn = gfn_to_pfn(kvm, gfn); + r = intel_iommu_page_mapping(domain, + gfn_to_gpa(gfn), + pfn_to_hpa(pfn), + PAGE_SIZE, + DMA_PTE_READ | + DMA_PTE_WRITE); + if (r) { + printk(KERN_ERR "kvm_iommu_map_pages:" + "iommu failed to map pfn=%lx\n", pfn); + goto unmap_pages; + } + gfn++; + } + return 0; + +unmap_pages: + kvm_iommu_put_pages(kvm, base_gfn, i); + return r; +} + +static int kvm_iommu_map_memslots(struct kvm *kvm) +{ + int i, r; + + down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { + r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + if (r) + break; + } + up_read(&kvm->slots_lock); + return r; +} + +int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + struct pci_dev *pdev = NULL; + int r; + + if (!intel_iommu_found()) { + printk(KERN_ERR "%s: intel iommu not found\n", __func__); + return -ENODEV; + } + + printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n", + assigned_dev->host_busnr, + PCI_SLOT(assigned_dev->host_devfn), + PCI_FUNC(assigned_dev->host_devfn)); + + pdev = assigned_dev->dev; + + if (pdev == NULL) { + if (kvm->arch.intel_iommu_domain) { + intel_iommu_domain_exit(kvm->arch.intel_iommu_domain); + kvm->arch.intel_iommu_domain = NULL; + } + return -ENODEV; + } + + kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev); + if (!kvm->arch.intel_iommu_domain) + return -ENODEV; + + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + + intel_iommu_detach_dev(kvm->arch.intel_iommu_domain, + pdev->bus->number, pdev->devfn); + + r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain, + pdev); + if (r) { + printk(KERN_ERR "Domain context map for %s failed", + pci_name(pdev)); + goto out_unmap; + } + return 0; + +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; +} + +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + gfn_t gfn = base_gfn; + pfn_t pfn; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + int i; + + for (i = 0; i < npages; i++) { + pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, + gfn_to_gpa(gfn)); + kvm_release_pfn_clean(pfn); + gfn++; + } +} + +static int kvm_iommu_unmap_memslots(struct kvm *kvm) +{ + int i; + down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { + kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + } + up_read(&kvm->slots_lock); + + return 0; +} + +int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + struct kvm_assigned_dev_kernel *entry; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) { + printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n", + entry->host_busnr, + PCI_SLOT(entry->host_devfn), + PCI_FUNC(entry->host_devfn)); + + /* detach kvm dmar domain */ + intel_iommu_detach_dev(domain, entry->host_busnr, + entry->host_devfn); + } + kvm_iommu_unmap_memslots(kvm); + intel_iommu_domain_exit(domain); + return 0; +} |