diff options
Diffstat (limited to 'arch')
184 files changed, 7783 insertions, 2768 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 215e460..e5eb133 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -41,6 +41,17 @@ config KPROBES for kernel debugging, non-intrusive instrumentation and testing. If in doubt, say "N". +config OPTPROBES + bool "Kprobes jump optimization support (EXPERIMENTAL)" + default y + depends on KPROBES + depends on !PREEMPT + depends on HAVE_OPTPROBES + select KALLSYMS_ALL + help + This option will allow kprobes to optimize breakpoint to + a jump for reducing its overhead. + config HAVE_EFFICIENT_UNALIGNED_ACCESS bool help @@ -83,6 +94,8 @@ config HAVE_KPROBES config HAVE_KRETPROBES bool +config HAVE_OPTPROBES + bool # # An arch should select this if it provides all these things: # diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 62619f2..53c213f 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -361,7 +361,7 @@ osf_procfs_mount(char *dirname, struct procfs_args __user *args, int flags) SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path, int, flag, void __user *, data) { - int retval = -EINVAL; + int retval; char *name; name = getname(path); @@ -379,6 +379,7 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path, retval = osf_procfs_mount(name, data, flag); break; default: + retval = -EINVAL; printk("osf_mount(%ld, %x)\n", typenr, flag); } putname(name); diff --git a/arch/arm/include/asm/hardware/iop3xx-adma.h b/arch/arm/include/asm/hardware/iop3xx-adma.h index 1a8c727..9b28f12 100644 --- a/arch/arm/include/asm/hardware/iop3xx-adma.h +++ b/arch/arm/include/asm/hardware/iop3xx-adma.h @@ -366,8 +366,7 @@ static inline int iop_chan_xor_slot_count(size_t len, int src_cnt, slot_cnt += *slots_per_op; } - if (len) - slot_cnt += *slots_per_op; + slot_cnt += *slots_per_op; return slot_cnt; } @@ -389,8 +388,7 @@ static inline int iop_chan_zero_sum_slot_count(size_t len, int src_cnt, slot_cnt += *slots_per_op; } - if (len) - slot_cnt += *slots_per_op; + slot_cnt += *slots_per_op; return slot_cnt; } @@ -737,10 +735,8 @@ iop_desc_set_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len) i += slots_per_op; } while (len > IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT); - if (len) { - iter = iop_hw_desc_slot_idx(hw_desc, i); - iter->byte_count = len; - } + iter = iop_hw_desc_slot_idx(hw_desc, i); + iter->byte_count = len; } } diff --git a/arch/arm/mach-u300/include/mach/coh901318.h b/arch/arm/mach-u300/include/mach/coh901318.h index f4cfee9..b8155b4 100644 --- a/arch/arm/mach-u300/include/mach/coh901318.h +++ b/arch/arm/mach-u300/include/mach/coh901318.h @@ -53,7 +53,7 @@ struct coh901318_params { * struct coh_dma_channel - dma channel base * @name: ascii name of dma channel * @number: channel id number - * @desc_nbr_max: number of preallocated descriptortors + * @desc_nbr_max: number of preallocated descriptors * @priority_high: prio of channel, 0 low otherwise high. * @param: configuration parameters * @dev_addr: physical address of periphal connected to channel diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 7adac38..059eac6 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -20,6 +20,12 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM bool +config GENERIC_TIME + def_bool y + +config ARCH_USES_GETTIMEOFFSET + def_bool y + config GENERIC_IOMAP bool default y diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c index fd529a0..b70fb34 100644 --- a/arch/cris/arch-v32/drivers/cryptocop.c +++ b/arch/cris/arch-v32/drivers/cryptocop.c @@ -628,9 +628,9 @@ static int create_output_descriptors(struct cryptocop_operation *operation, int cdesc->dma_descr->buf = (char*)virt_to_phys(operation->tfrm_op.indata[*iniov_ix].iov_base + *iniov_offset); cdesc->dma_descr->after = cdesc->dma_descr->buf + dlength; + assert(desc_len >= dlength); desc_len -= dlength; *iniov_offset += dlength; - assert(desc_len >= 0); if (*iniov_offset >= operation->tfrm_op.indata[*iniov_ix].iov_len) { *iniov_offset = 0; ++(*iniov_ix); diff --git a/arch/cris/arch-v32/mach-fs/arbiter.c b/arch/cris/arch-v32/mach-fs/arbiter.c index 84d31bd..82ef293 100644 --- a/arch/cris/arch-v32/mach-fs/arbiter.c +++ b/arch/cris/arch-v32/mach-fs/arbiter.c @@ -332,7 +332,7 @@ int crisv32_arbiter_unwatch(int id) if (id == 0) intr_mask.bp0 = regk_marb_no; else if (id == 1) - intr_mask.bp2 = regk_marb_no; + intr_mask.bp1 = regk_marb_no; else if (id == 2) intr_mask.bp2 = regk_marb_no; else if (id == 3) diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c index 074fe7d..a05dd31 100644 --- a/arch/cris/kernel/time.c +++ b/arch/cris/kernel/time.c @@ -42,75 +42,11 @@ unsigned long loops_per_usec; extern unsigned long do_slow_gettimeoffset(void); static unsigned long (*do_gettimeoffset)(void) = do_slow_gettimeoffset; -/* - * This version of gettimeofday has near microsecond resolution. - * - * Note: Division is quite slow on CRIS and do_gettimeofday is called - * rather often. Maybe we should do some kind of approximation here - * (a naive approximation would be to divide by 1024). - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long flags; - signed long usec, sec; - local_irq_save(flags); - usec = do_gettimeoffset(); - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0) && usec > tickadj) - usec = tickadj; - - sec = xtime.tv_sec; - usec += xtime.tv_nsec / 1000; - local_irq_restore(flags); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) +u32 arch_gettimeoffset(void) { - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= do_gettimeoffset() * NSEC_PER_USEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; + return do_gettimeoffset() * 1000; } -EXPORT_SYMBOL(do_settimeofday); - - /* * BUG: This routine does not handle hour overflow properly; it just * sets the minutes. Usually you'll only notice that after reboot! diff --git a/arch/frv/include/asm/pci.h b/arch/frv/include/asm/pci.h index 492b5c4..8c7260a 100644 --- a/arch/frv/include/asm/pci.h +++ b/arch/frv/include/asm/pci.h @@ -68,41 +68,4 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev, #define PCIBIOS_MIN_IO 0x100 #define PCIBIOS_MIN_MEM 0x00010000 -/* Make physical memory consistent for a single - * streaming mode DMA translation after a transfer. - * - * If you perform a pci_map_single() but wish to interrogate the - * buffer using the cpu, yet do not wish to teardown the PCI dma - * mapping, you must call this function before doing so. At the - * next point you give the PCI dma address back to the card, the - * device again owns the buffer. - */ -static inline void pci_dma_sync_single(struct pci_dev *hwdev, - dma_addr_t dma_handle, - size_t size, int direction) -{ - BUG_ON(direction == PCI_DMA_NONE); - - frv_cache_wback_inv((unsigned long)bus_to_virt(dma_handle), - (unsigned long)bus_to_virt(dma_handle) + size); -} - -/* Make physical memory consistent for a set of streaming - * mode DMA translations after a transfer. - * - * The same as pci_dma_sync_single but for a scatter-gather list, - * same rules and usage. - */ -static inline void pci_dma_sync_sg(struct pci_dev *hwdev, - struct scatterlist *sg, - int nelems, int direction) -{ - int i; - BUG_ON(direction == PCI_DMA_NONE); - - for (i = 0; i < nelems; i++) - frv_cache_wback_inv(sg_dma_address(&sg[i]), - sg_dma_address(&sg[i])+sg_dma_len(&sg[i])); -} - #endif /* _ASM_FRV_PCI_H */ diff --git a/arch/ia64/include/asm/elf.h b/arch/ia64/include/asm/elf.h index 4c41656..b5298eb 100644 --- a/arch/ia64/include/asm/elf.h +++ b/arch/ia64/include/asm/elf.h @@ -219,54 +219,6 @@ do { \ NEW_AUX_ENT(AT_SYSINFO_EHDR, (unsigned long) GATE_EHDR); \ } while (0) - -/* - * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out - * extra segments containing the gate DSO contents. Dumping its - * contents makes post-mortem fully interpretable later without matching up - * the same kernel and hardware config to see what PC values meant. - * Dumping its extra ELF program headers includes all the other information - * a debugger needs to easily find how the gate DSO was being used. - */ -#define ELF_CORE_EXTRA_PHDRS (GATE_EHDR->e_phnum) -#define ELF_CORE_WRITE_EXTRA_PHDRS \ -do { \ - const struct elf_phdr *const gate_phdrs = \ - (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); \ - int i; \ - Elf64_Off ofs = 0; \ - for (i = 0; i < GATE_EHDR->e_phnum; ++i) { \ - struct elf_phdr phdr = gate_phdrs[i]; \ - if (phdr.p_type == PT_LOAD) { \ - phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \ - phdr.p_filesz = phdr.p_memsz; \ - if (ofs == 0) { \ - ofs = phdr.p_offset = offset; \ - offset += phdr.p_filesz; \ - } \ - else \ - phdr.p_offset = ofs; \ - } \ - else \ - phdr.p_offset += ofs; \ - phdr.p_paddr = 0; /* match other core phdrs */ \ - DUMP_WRITE(&phdr, sizeof(phdr)); \ - } \ -} while (0) -#define ELF_CORE_WRITE_EXTRA_DATA \ -do { \ - const struct elf_phdr *const gate_phdrs = \ - (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); \ - int i; \ - for (i = 0; i < GATE_EHDR->e_phnum; ++i) { \ - if (gate_phdrs[i].p_type == PT_LOAD) { \ - DUMP_WRITE((void *) gate_phdrs[i].p_vaddr, \ - PAGE_ALIGN(gate_phdrs[i].p_memsz)); \ - break; \ - } \ - } \ -} while (0) - /* * format for entries in the Global Offset Table */ diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 4138282..db10b1e 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -45,6 +45,8 @@ endif obj-$(CONFIG_DMAR) += pci-dma.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o + # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 diff --git a/arch/ia64/kernel/elfcore.c b/arch/ia64/kernel/elfcore.c new file mode 100644 index 0000000..bac1639 --- /dev/null +++ b/arch/ia64/kernel/elfcore.c @@ -0,0 +1,80 @@ +#include <linux/elf.h> +#include <linux/coredump.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/elf.h> + + +Elf64_Half elf_core_extra_phdrs(void) +{ + return GATE_EHDR->e_phnum; +} + +int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + Elf64_Off ofs = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + struct elf_phdr phdr = gate_phdrs[i]; + + if (phdr.p_type == PT_LOAD) { + phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); + phdr.p_filesz = phdr.p_memsz; + if (ofs == 0) { + ofs = phdr.p_offset = offset; + offset += phdr.p_filesz; + } else { + phdr.p_offset = ofs; + } + } else { + phdr.p_offset += ofs; + } + phdr.p_paddr = 0; /* match other core phdrs */ + *size += sizeof(phdr); + if (*size > limit || !dump_write(file, &phdr, sizeof(phdr))) + return 0; + } + return 1; +} + +int elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + void *addr = (void *)gate_phdrs[i].p_vaddr; + size_t memsz = PAGE_ALIGN(gate_phdrs[i].p_memsz); + + *size += memsz; + if (*size > limit || !dump_write(file, addr, memsz)) + return 0; + break; + } + } + return 1; +} + +size_t elf_core_extra_data_size(void) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + size_t size = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + size += PAGE_ALIGN(gate_phdrs[i].p_memsz); + break; + } + } + return size; +} diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index b81e46b..703062c 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2315,6 +2315,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t DPRINT(("Cannot allocate vma\n")); goto error_kmem; } + INIT_LIST_HEAD(&vma->anon_vma_chain); /* * partially initialize the vma for the sampling buffer diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 01c7579..fa4d1e5 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -26,6 +26,7 @@ config KVM select ANON_INODES select HAVE_KVM_IRQCHIP select KVM_APIC_ARCHITECTURE + select KVM_MMIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 5fdeec5..26e0e08 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -241,10 +241,10 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; mmio: if (p->dir) - r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr, + r = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, p->addr, p->size, &p->data); else - r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr, + r = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, p->addr, p->size, &p->data); if (r) printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr); @@ -636,12 +636,9 @@ static void kvm_vcpu_post_transition(struct kvm_vcpu *vcpu) static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { union context *host_ctx, *guest_ctx; - int r; + int r, idx; - /* - * down_read() may sleep and return with interrupts enabled - */ - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); again: if (signal_pending(current)) { @@ -663,7 +660,7 @@ again: if (r < 0) goto vcpu_run_fail; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); kvm_guest_enter(); /* @@ -687,7 +684,7 @@ again: kvm_guest_exit(); preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_handle_exit(kvm_run, vcpu); @@ -697,10 +694,10 @@ again: } out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); if (r > 0) { kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); goto again; } @@ -971,7 +968,7 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = kvm_setup_default_irq_routing(kvm); if (r) { - kfree(kvm->arch.vioapic); + kvm_ioapic_destroy(kvm); goto out; } break; @@ -1377,12 +1374,14 @@ static void free_kvm(struct kvm *kvm) static void kvm_release_vm_pages(struct kvm *kvm) { + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i, j; unsigned long base_gfn; - for (i = 0; i < kvm->nmemslots; i++) { - memslot = &kvm->memslots[i]; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) { + memslot = &slots->memslots[i]; base_gfn = memslot->base_gfn; for (j = 0; j < memslot->npages; j++) { @@ -1405,6 +1404,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vioapic); kvm_release_vm_pages(kvm); kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); free_kvm(kvm); } @@ -1576,15 +1576,15 @@ out: return r; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { unsigned long i; unsigned long pfn; - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + int npages = memslot->npages; unsigned long base_gfn = memslot->base_gfn; if (base_gfn + npages > (KVM_MAX_MEM_SIZE >> PAGE_SHIFT)) @@ -1608,6 +1608,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm, return 0; } +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + return; +} + void kvm_arch_flush_shadow(struct kvm *kvm) { kvm_flush_remote_tlbs(kvm); @@ -1802,7 +1810,7 @@ static int kvm_ia64_sync_dirty_log(struct kvm *kvm, if (log->slot >= KVM_MEMORY_SLOTS) goto out; - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; r = -ENOENT; if (!memslot->dirty_bitmap) goto out; @@ -1827,6 +1835,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; int is_dirty = 0; + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->arch.dirty_log_lock); r = kvm_ia64_sync_dirty_log(kvm, log); @@ -1840,12 +1849,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { kvm_flush_remote_tlbs(kvm); - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; memset(memslot->dirty_bitmap, 0, n); } r = 0; out: + mutex_unlock(&kvm->slots_lock); spin_unlock(&kvm->arch.dirty_log_lock); return r; } diff --git a/arch/ia64/kvm/kvm_fw.c b/arch/ia64/kvm/kvm_fw.c index e4b8231..cb548ee 100644 --- a/arch/ia64/kvm/kvm_fw.c +++ b/arch/ia64/kvm/kvm_fw.c @@ -75,7 +75,7 @@ static void set_pal_result(struct kvm_vcpu *vcpu, struct exit_ctl_data *p; p = kvm_get_exit_data(vcpu); - if (p && p->exit_reason == EXIT_REASON_PAL_CALL) { + if (p->exit_reason == EXIT_REASON_PAL_CALL) { p->u.pal_data.ret = result; return ; } @@ -87,7 +87,7 @@ static void set_sal_result(struct kvm_vcpu *vcpu, struct exit_ctl_data *p; p = kvm_get_exit_data(vcpu); - if (p && p->exit_reason == EXIT_REASON_SAL_CALL) { + if (p->exit_reason == EXIT_REASON_SAL_CALL) { p->u.sal_data.ret = result; return ; } @@ -322,7 +322,7 @@ static u64 kvm_get_pal_call_index(struct kvm_vcpu *vcpu) struct exit_ctl_data *p; p = kvm_get_exit_data(vcpu); - if (p && (p->exit_reason == EXIT_REASON_PAL_CALL)) + if (p->exit_reason == EXIT_REASON_PAL_CALL) index = p->u.pal_data.gr28; return index; @@ -646,18 +646,16 @@ static void kvm_get_sal_call_data(struct kvm_vcpu *vcpu, u64 *in0, u64 *in1, p = kvm_get_exit_data(vcpu); - if (p) { - if (p->exit_reason == EXIT_REASON_SAL_CALL) { - *in0 = p->u.sal_data.in0; - *in1 = p->u.sal_data.in1; - *in2 = p->u.sal_data.in2; - *in3 = p->u.sal_data.in3; - *in4 = p->u.sal_data.in4; - *in5 = p->u.sal_data.in5; - *in6 = p->u.sal_data.in6; - *in7 = p->u.sal_data.in7; - return ; - } + if (p->exit_reason == EXIT_REASON_SAL_CALL) { + *in0 = p->u.sal_data.in0; + *in1 = p->u.sal_data.in1; + *in2 = p->u.sal_data.in2; + *in3 = p->u.sal_data.in3; + *in4 = p->u.sal_data.in4; + *in5 = p->u.sal_data.in5; + *in6 = p->u.sal_data.in6; + *in7 = p->u.sal_data.in7; + return ; } *in0 = 0; } diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c index 9bf55af..fb8f9f5 100644 --- a/arch/ia64/kvm/mmio.c +++ b/arch/ia64/kvm/mmio.c @@ -316,8 +316,8 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma) return; } else { inst_type = -1; - panic_vm(vcpu, "Unsupported MMIO access instruction! \ - Bunld[0]=0x%lx, Bundle[1]=0x%lx\n", + panic_vm(vcpu, "Unsupported MMIO access instruction! " + "Bunld[0]=0x%lx, Bundle[1]=0x%lx\n", bundle.i64[0], bundle.i64[1]); } diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c index dce75b70..958815c 100644 --- a/arch/ia64/kvm/vcpu.c +++ b/arch/ia64/kvm/vcpu.c @@ -1639,8 +1639,8 @@ void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val) * Otherwise panic */ if (val & (IA64_PSR_PK | IA64_PSR_IS | IA64_PSR_VM)) - panic_vm(vcpu, "Only support guests with vpsr.pk =0 \ - & vpsr.is=0\n"); + panic_vm(vcpu, "Only support guests with vpsr.pk =0 " + "& vpsr.is=0\n"); /* * For those IA64_PSR bits: id/da/dd/ss/ed/ia diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index ca3335e..ed41759 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -117,6 +117,7 @@ ia64_init_addr_space (void) */ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; @@ -135,6 +136,7 @@ ia64_init_addr_space (void) if (!(current->personality & MMAP_PAGE_ZERO)) { vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); diff --git a/arch/parisc/Kconfig.debug b/arch/parisc/Kconfig.debug index bc989e5..7305ac8 100644 --- a/arch/parisc/Kconfig.debug +++ b/arch/parisc/Kconfig.debug @@ -12,4 +12,18 @@ config DEBUG_RODATA portion of the kernel code won't be covered by a TLB anymore. If in doubt, say "N". +config DEBUG_STRICT_USER_COPY_CHECKS + bool "Strict copy size checks" + depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING + ---help--- + Enabling this option turns a certain set of sanity checks for user + copy operations into compile time failures. + + The copy_from_user() etc checks are there to help test if there + are sufficient security checks on the length argument of + the copy operation, by having gcc prove that the argument is + within bounds. + + If unsure, or if you run an older (pre 4.4) gcc, say N. + endmenu diff --git a/arch/parisc/include/asm/param.h b/arch/parisc/include/asm/param.h index 32e03d8..965d454 100644 --- a/arch/parisc/include/asm/param.h +++ b/arch/parisc/include/asm/param.h @@ -1,22 +1 @@ -#ifndef _ASMPARISC_PARAM_H -#define _ASMPARISC_PARAM_H - -#ifdef __KERNEL__ -#define HZ CONFIG_HZ -#define USER_HZ 100 /* some user API use "ticks" */ -#define CLOCKS_PER_SEC (USER_HZ) /* like times() */ -#endif - -#ifndef HZ -#define HZ 100 -#endif - -#define EXEC_PAGESIZE 4096 - -#ifndef NOGROUP -#define NOGROUP (-1) -#endif - -#define MAXHOSTNAMELEN 64 /* max length of hostname */ - -#endif +#include <asm-generic/param.h> diff --git a/arch/parisc/include/asm/system.h b/arch/parisc/include/asm/system.h index d91357b..4653c77 100644 --- a/arch/parisc/include/asm/system.h +++ b/arch/parisc/include/asm/system.h @@ -160,7 +160,7 @@ static inline void set_eiem(unsigned long val) ldcd). */ #define __PA_LDCW_ALIGNMENT 4 -#define __ldcw_align(a) ((volatile unsigned int *)a) +#define __ldcw_align(a) (&(a)->slock) #define __LDCW "ldcw,co" #endif /*!CONFIG_PA20*/ diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index 7cf799d..ff4cf9d 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -7,6 +7,7 @@ #include <asm/page.h> #include <asm/system.h> #include <asm/cache.h> +#include <asm/errno.h> #include <asm-generic/uaccess-unaligned.h> #define VERIFY_READ 0 @@ -234,13 +235,35 @@ extern long lstrnlen_user(const char __user *,long); unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len); #define __copy_to_user copy_to_user -unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len); -#define __copy_from_user copy_from_user +unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len); unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len); #define __copy_in_user copy_in_user #define __copy_to_user_inatomic __copy_to_user #define __copy_from_user_inatomic __copy_from_user +extern void copy_from_user_overflow(void) +#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS + __compiletime_error("copy_from_user() buffer size is not provably correct") +#else + __compiletime_warning("copy_from_user() buffer size is not provably correct") +#endif +; + +static inline unsigned long __must_check copy_from_user(void *to, + const void __user *from, + unsigned long n) +{ + int sz = __compiletime_object_size(to); + int ret = -EFAULT; + + if (likely(sz == -1 || !__builtin_constant_p(n) || sz >= n)) + ret = __copy_from_user(to, from, n); + else + copy_from_user_overflow(); + + return ret; +} + struct pt_regs; int fixup_exception(struct pt_regs *regs); diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index cda1583..1ce7d28 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -811,8 +811,10 @@ #define __NR_pwritev (__NR_Linux + 316) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 317) #define __NR_perf_event_open (__NR_Linux + 318) +#define __NR_recvmmsg (__NR_Linux + 319) +#define __NR_accept4 (__NR_Linux + 320) -#define __NR_Linux_syscalls (__NR_perf_event_open + 1) +#define __NR_Linux_syscalls (__NR_accept4 + 1) #define __IGNORE_select /* newselect */ diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 1054baa..d054f3d 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -171,14 +171,14 @@ parisc_cache_init(void) cache_info.ic_conf.cc_cst, cache_info.ic_conf.cc_hv); - printk("D-TLB conf: sh %d page %d cst %d aid %d pad1 %d \n", + printk("D-TLB conf: sh %d page %d cst %d aid %d pad1 %d\n", cache_info.dt_conf.tc_sh, cache_info.dt_conf.tc_page, cache_info.dt_conf.tc_cst, cache_info.dt_conf.tc_aid, cache_info.dt_conf.tc_pad1); - printk("I-TLB conf: sh %d page %d cst %d aid %d pad1 %d \n", + printk("I-TLB conf: sh %d page %d cst %d aid %d pad1 %d\n", cache_info.it_conf.tc_sh, cache_info.it_conf.tc_page, cache_info.it_conf.tc_cst, diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index 01c4fcf..de5f6da 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -417,6 +417,8 @@ ENTRY_COMP(pwritev) ENTRY_COMP(rt_tgsigqueueinfo) ENTRY_SAME(perf_event_open) + ENTRY_COMP(recvmmsg) + ENTRY_SAME(accept4) /* 320 */ /* Nothing yet */ diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index a79c6f9..05511cc 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -250,9 +250,21 @@ static int __init rtc_init(void) } module_init(rtc_init); -void __init time_init(void) +void read_persistent_clock(struct timespec *ts) { static struct pdc_tod tod_data; + if (pdc_tod_read(&tod_data) == 0) { + ts->tv_sec = tod_data.tod_sec; + ts->tv_nsec = tod_data.tod_usec * 1000; + } else { + printk(KERN_ERR "Error reading tod clock\n"); + ts->tv_sec = 0; + ts->tv_nsec = 0; + } +} + +void __init time_init(void) +{ unsigned long current_cr16_khz; clocktick = (100 * PAGE0->mem_10msec) / HZ; @@ -264,19 +276,4 @@ void __init time_init(void) clocksource_cr16.mult = clocksource_khz2mult(current_cr16_khz, clocksource_cr16.shift); clocksource_register(&clocksource_cr16); - - if (pdc_tod_read(&tod_data) == 0) { - unsigned long flags; - - write_seqlock_irqsave(&xtime_lock, flags); - xtime.tv_sec = tod_data.tod_sec; - xtime.tv_nsec = tod_data.tod_usec * 1000; - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - write_sequnlock_irqrestore(&xtime_lock, flags); - } else { - printk(KERN_ERR "Error reading tod clock\n"); - xtime.tv_sec = 0; - xtime.tv_nsec = 0; - } } diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index e6f4b7a..92d977b 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/signal.h> +#include <linux/ratelimit.h> #include <asm/uaccess.h> /* #define DEBUG_UNALIGNED 1 */ @@ -446,8 +447,7 @@ static int emulate_std(struct pt_regs *regs, int frreg, int flop) void handle_unaligned(struct pt_regs *regs) { - static unsigned long unaligned_count = 0; - static unsigned long last_time = 0; + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5); unsigned long newbase = R1(regs->iir)?regs->gr[R1(regs->iir)]:0; int modify = 0; int ret = ERR_NOTHANDLED; @@ -460,14 +460,8 @@ void handle_unaligned(struct pt_regs *regs) goto force_sigbus; } - if (unaligned_count > 5 && - time_after(jiffies, last_time + 5 * HZ)) { - unaligned_count = 0; - last_time = jiffies; - } - - if (!(current->thread.flags & PARISC_UAC_NOPRINT) - && ++unaligned_count < 5) { + if (!(current->thread.flags & PARISC_UAC_NOPRINT) && + __ratelimit(&ratelimit)) { char buf[256]; sprintf(buf, "%s(%d): unaligned access to 0x" RFMT " at ip=0x" RFMT "\n", current->comm, task_pid_nr(current), regs->ior, regs->iaoq[0]); diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index abf41f4..1dbca5c 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -475,7 +475,8 @@ unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len) return pa_memcpy((void __force *)dst, src, len); } -unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len) +EXPORT_SYMBOL(__copy_from_user); +unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len) { mtsp(get_user_space(), 1); mtsp(get_kernel_space(), 2); diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index af2abe7..aadf2dd 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -97,4 +97,10 @@ #define RESUME_HOST RESUME_FLAG_HOST #define RESUME_HOST_NV (RESUME_FLAG_HOST|RESUME_FLAG_NV) +#define KVM_GUEST_MODE_NONE 0 +#define KVM_GUEST_MODE_GUEST 1 +#define KVM_GUEST_MODE_SKIP 2 + +#define KVM_INST_FETCH_FAILED -1 + #endif /* __POWERPC_KVM_ASM_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 74b7369..db7db0a 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -22,7 +22,7 @@ #include <linux/types.h> #include <linux/kvm_host.h> -#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s_64_asm.h> struct kvmppc_slb { u64 esid; @@ -33,7 +33,8 @@ struct kvmppc_slb { bool Ks; bool Kp; bool nx; - bool large; + bool large; /* PTEs are 16MB */ + bool tb; /* 1TB segment */ bool class; }; @@ -69,6 +70,7 @@ struct kvmppc_sid_map { struct kvmppc_vcpu_book3s { struct kvm_vcpu vcpu; + struct kvmppc_book3s_shadow_vcpu shadow_vcpu; struct kvmppc_sid_map sid_map[SID_MAP_NUM]; struct kvmppc_slb slb[64]; struct { @@ -89,6 +91,7 @@ struct kvmppc_vcpu_book3s { u64 vsid_next; u64 vsid_max; int context_id; + ulong prog_flags; /* flags to inject when giving a 700 trap */ }; #define CONTEXT_HOST 0 @@ -119,6 +122,10 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, extern u32 kvmppc_trampoline_lowmem; extern u32 kvmppc_trampoline_enter; +extern void kvmppc_rmcall(ulong srr0, ulong srr1); +extern void kvmppc_load_up_fpu(void); +extern void kvmppc_load_up_altivec(void); +extern void kvmppc_load_up_vsx(void); static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/include/asm/kvm_book3s_64_asm.h b/arch/powerpc/include/asm/kvm_book3s_64_asm.h index 2e06ee8..183461b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_64_asm.h @@ -20,6 +20,8 @@ #ifndef __ASM_KVM_BOOK3S_ASM_H__ #define __ASM_KVM_BOOK3S_ASM_H__ +#ifdef __ASSEMBLY__ + #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #include <asm/kvm_asm.h> @@ -55,4 +57,20 @@ kvmppc_resume_\intno: #endif /* CONFIG_KVM_BOOK3S_64_HANDLER */ +#else /*__ASSEMBLY__ */ + +struct kvmppc_book3s_shadow_vcpu { + ulong gpr[14]; + u32 cr; + u32 xer; + ulong host_r1; + ulong host_r2; + ulong handler; + ulong scratch0; + ulong scratch1; + ulong vmhandler; +}; + +#endif /*__ASSEMBLY__ */ + #endif /* __ASM_KVM_BOOK3S_ASM_H__ */ diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h index 9d497ce..7fea26f 100644 --- a/arch/powerpc/include/asm/kvm_e500.h +++ b/arch/powerpc/include/asm/kvm_e500.h @@ -52,9 +52,12 @@ struct kvmppc_vcpu_e500 { u32 mas5; u32 mas6; u32 mas7; + u32 l1csr0; u32 l1csr1; u32 hid0; u32 hid1; + u32 tlb0cfg; + u32 tlb1cfg; struct kvm_vcpu vcpu; }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1201f62..5e5bae7 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -167,23 +167,40 @@ struct kvm_vcpu_arch { ulong trampoline_lowmem; ulong trampoline_enter; ulong highmem_handler; + ulong rmcall; ulong host_paca_phys; struct kvmppc_mmu mmu; #endif - u64 fpr[32]; ulong gpr[32]; + u64 fpr[32]; + u32 fpscr; + +#ifdef CONFIG_ALTIVEC + vector128 vr[32]; + vector128 vscr; +#endif + +#ifdef CONFIG_VSX + u64 vsr[32]; +#endif + ulong pc; - u32 cr; ulong ctr; ulong lr; + +#ifdef CONFIG_BOOKE ulong xer; + u32 cr; +#endif ulong msr; #ifdef CONFIG_PPC64 ulong shadow_msr; + ulong shadow_srr1; ulong hflags; + ulong guest_owned_ext; #endif u32 mmucr; ulong sprg0; @@ -242,6 +259,8 @@ struct kvm_vcpu_arch { #endif ulong fault_dear; ulong fault_esr; + ulong queued_dear; + ulong queued_esr; gpa_t paddr_accessed; u8 io_gpr; /* GPR used as IO source/target */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 269ee46..e264282 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -28,6 +28,9 @@ #include <linux/types.h> #include <linux/kvm_types.h> #include <linux/kvm_host.h> +#ifdef CONFIG_PPC_BOOK3S +#include <asm/kvm_book3s.h> +#endif enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -80,8 +83,9 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); -extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu); +extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); @@ -95,4 +99,81 @@ extern void kvmppc_booke_exit(void); extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); +#ifdef CONFIG_PPC_BOOK3S + +/* We assume we're always acting on the current vcpu */ + +static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) +{ + if ( num < 14 ) { + get_paca()->shadow_vcpu.gpr[num] = val; + to_book3s(vcpu)->shadow_vcpu.gpr[num] = val; + } else + vcpu->arch.gpr[num] = val; +} + +static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) +{ + if ( num < 14 ) + return get_paca()->shadow_vcpu.gpr[num]; + else + return vcpu->arch.gpr[num]; +} + +static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) +{ + get_paca()->shadow_vcpu.cr = val; + to_book3s(vcpu)->shadow_vcpu.cr = val; +} + +static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) +{ + return get_paca()->shadow_vcpu.cr; +} + +static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) +{ + get_paca()->shadow_vcpu.xer = val; + to_book3s(vcpu)->shadow_vcpu.xer = val; +} + +static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) +{ + return get_paca()->shadow_vcpu.xer; +} + +#else + +static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) +{ + vcpu->arch.gpr[num] = val; +} + +static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) +{ + return vcpu->arch.gpr[num]; +} + +static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) +{ + vcpu->arch.cr = val; +} + +static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr; +} + +static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) +{ + vcpu->arch.xer = val; +} + +static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.xer; +} + +#endif + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 5e9b4ef..d8a6931 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -19,6 +19,9 @@ #include <asm/mmu.h> #include <asm/page.h> #include <asm/exception-64e.h> +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +#include <asm/kvm_book3s_64_asm.h> +#endif register struct paca_struct *local_paca asm("r13"); @@ -135,6 +138,8 @@ struct paca_struct { u64 esid; u64 vsid; } kvm_slb[64]; /* guest SLB */ + /* We use this to store guest state in */ + struct kvmppc_book3s_shadow_vcpu shadow_vcpu; u8 kvm_slb_max; /* highest used guest slb entry */ u8 kvm_in_guest; /* are we inside the guest? */ #endif diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index bc8dd53..5572e86 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -426,6 +426,10 @@ #define SRR1_WAKEMT 0x00280000 /* mtctrl */ #define SRR1_WAKEDEC 0x00180000 /* Decrementer interrupt */ #define SRR1_WAKETHERM 0x00100000 /* Thermal management interrupt */ +#define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */ +#define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */ +#define SRR1_PROGTRAP 0x00020000 /* Trap */ +#define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */ #define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */ #define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index a6c2b63..957ceb7 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -194,6 +194,30 @@ int main(void) DEFINE(PACA_KVM_IN_GUEST, offsetof(struct paca_struct, kvm_in_guest)); DEFINE(PACA_KVM_SLB, offsetof(struct paca_struct, kvm_slb)); DEFINE(PACA_KVM_SLB_MAX, offsetof(struct paca_struct, kvm_slb_max)); + DEFINE(PACA_KVM_CR, offsetof(struct paca_struct, shadow_vcpu.cr)); + DEFINE(PACA_KVM_XER, offsetof(struct paca_struct, shadow_vcpu.xer)); + DEFINE(PACA_KVM_R0, offsetof(struct paca_struct, shadow_vcpu.gpr[0])); + DEFINE(PACA_KVM_R1, offsetof(struct paca_struct, shadow_vcpu.gpr[1])); + DEFINE(PACA_KVM_R2, offsetof(struct paca_struct, shadow_vcpu.gpr[2])); + DEFINE(PACA_KVM_R3, offsetof(struct paca_struct, shadow_vcpu.gpr[3])); + DEFINE(PACA_KVM_R4, offsetof(struct paca_struct, shadow_vcpu.gpr[4])); + DEFINE(PACA_KVM_R5, offsetof(struct paca_struct, shadow_vcpu.gpr[5])); + DEFINE(PACA_KVM_R6, offsetof(struct paca_struct, shadow_vcpu.gpr[6])); + DEFINE(PACA_KVM_R7, offsetof(struct paca_struct, shadow_vcpu.gpr[7])); + DEFINE(PACA_KVM_R8, offsetof(struct paca_struct, shadow_vcpu.gpr[8])); + DEFINE(PACA_KVM_R9, offsetof(struct paca_struct, shadow_vcpu.gpr[9])); + DEFINE(PACA_KVM_R10, offsetof(struct paca_struct, shadow_vcpu.gpr[10])); + DEFINE(PACA_KVM_R11, offsetof(struct paca_struct, shadow_vcpu.gpr[11])); + DEFINE(PACA_KVM_R12, offsetof(struct paca_struct, shadow_vcpu.gpr[12])); + DEFINE(PACA_KVM_R13, offsetof(struct paca_struct, shadow_vcpu.gpr[13])); + DEFINE(PACA_KVM_HOST_R1, offsetof(struct paca_struct, shadow_vcpu.host_r1)); + DEFINE(PACA_KVM_HOST_R2, offsetof(struct paca_struct, shadow_vcpu.host_r2)); + DEFINE(PACA_KVM_VMHANDLER, offsetof(struct paca_struct, + shadow_vcpu.vmhandler)); + DEFINE(PACA_KVM_SCRATCH0, offsetof(struct paca_struct, + shadow_vcpu.scratch0)); + DEFINE(PACA_KVM_SCRATCH1, offsetof(struct paca_struct, + shadow_vcpu.scratch1)); #endif #endif /* CONFIG_PPC64 */ @@ -389,8 +413,6 @@ int main(void) DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); - DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); - DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr)); DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc)); DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr)); @@ -411,11 +433,16 @@ int main(void) DEFINE(VCPU_HOST_R2, offsetof(struct kvm_vcpu, arch.host_r2)); DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr)); DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); + DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem)); DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter)); DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler)); + DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall)); DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); -#endif +#else + DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); + DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); +#endif /* CONFIG_PPC64 */ #endif #ifdef CONFIG_44x DEFINE(PGD_T_LOG2, PGD_T_LOG2); diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 42545145..ab3e392 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -107,6 +107,7 @@ EXPORT_SYMBOL(giveup_altivec); #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_VSX EXPORT_SYMBOL(giveup_vsx); +EXPORT_SYMBOL_GPL(__giveup_vsx); #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE EXPORT_SYMBOL(giveup_spe); diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 61af58f..65ea083 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -65,13 +65,14 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, */ switch (dcrn) { case DCRN_CPR0_CONFIG_ADDR: - vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); break; case DCRN_CPR0_CONFIG_DATA: local_irq_disable(); mtdcr(DCRN_CPR0_CONFIG_ADDR, vcpu->arch.cpr0_cfgaddr); - vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA); + kvmppc_set_gpr(vcpu, rt, + mfdcr(DCRN_CPR0_CONFIG_DATA)); local_irq_enable(); break; default: @@ -93,11 +94,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, /* emulate some access in kernel */ switch (dcrn) { case DCRN_CPR0_CONFIG_ADDR: - vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs]; + vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); break; default: run->dcr.dcrn = dcrn; - run->dcr.data = vcpu->arch.gpr[rs]; + run->dcr.data = kvmppc_get_gpr(vcpu, rs); run->dcr.is_write = 1; vcpu->arch.dcr_needed = 1; kvmppc_account_exit(vcpu, DCR_EXITS); @@ -146,13 +147,13 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) switch (sprn) { case SPRN_PID: - kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break; + kvmppc_set_pid(vcpu, kvmppc_get_gpr(vcpu, rs)); break; case SPRN_MMUCR: - vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break; + vcpu->arch.mmucr = kvmppc_get_gpr(vcpu, rs); break; case SPRN_CCR0: - vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.ccr0 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_CCR1: - vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.ccr1 = kvmppc_get_gpr(vcpu, rs); break; default: emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); } @@ -167,13 +168,13 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_PID: - vcpu->arch.gpr[rt] = vcpu->arch.pid; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.pid); break; case SPRN_MMUCR: - vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucr); break; case SPRN_CCR0: - vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr0); break; case SPRN_CCR1: - vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr1); break; default: emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); } diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index ff3cb63..2570fcc 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -439,7 +439,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) struct kvmppc_44x_tlbe *tlbe; unsigned int gtlb_index; - gtlb_index = vcpu->arch.gpr[ra]; + gtlb_index = kvmppc_get_gpr(vcpu, ra); if (gtlb_index > KVM44x_GUEST_TLB_SIZE) { printk("%s: index %d\n", __func__, gtlb_index); kvmppc_dump_vcpu(vcpu); @@ -455,15 +455,15 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) switch (ws) { case PPC44x_TLB_PAGEID: tlbe->tid = get_mmucr_stid(vcpu); - tlbe->word0 = vcpu->arch.gpr[rs]; + tlbe->word0 = kvmppc_get_gpr(vcpu, rs); break; case PPC44x_TLB_XLAT: - tlbe->word1 = vcpu->arch.gpr[rs]; + tlbe->word1 = kvmppc_get_gpr(vcpu, rs); break; case PPC44x_TLB_ATTRIB: - tlbe->word2 = vcpu->arch.gpr[rs]; + tlbe->word2 = kvmppc_get_gpr(vcpu, rs); break; default: @@ -500,18 +500,20 @@ int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc) unsigned int as = get_mmucr_sts(vcpu); unsigned int pid = get_mmucr_stid(vcpu); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); gtlb_index = kvmppc_44x_tlb_index(vcpu, ea, pid, as); if (rc) { + u32 cr = kvmppc_get_cr(vcpu); + if (gtlb_index < 0) - vcpu->arch.cr &= ~0x20000000; + kvmppc_set_cr(vcpu, cr & ~0x20000000); else - vcpu->arch.cr |= 0x20000000; + kvmppc_set_cr(vcpu, cr | 0x20000000); } - vcpu->arch.gpr[rt] = gtlb_index; + kvmppc_set_gpr(vcpu, rt, gtlb_index); kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); return EMULATE_DONE; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index fe037fd..60624cc 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM bool select PREEMPT_NOTIFIERS select ANON_INODES + select KVM_MMIO config KVM_BOOK3S_64_HANDLER bool diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 3e294bd..9a271f0 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -33,12 +33,9 @@ /* #define EXIT_DEBUG */ /* #define EXIT_DEBUG_SIMPLE */ +/* #define DEBUG_EXT */ -/* Without AGGRESSIVE_DEC we only fire off a DEC interrupt when DEC turns 0. - * When set, we retrigger a DEC interrupt after that if DEC <= 0. - * PPC32 Linux runs faster without AGGRESSIVE_DEC, PPC64 Linux requires it. */ - -/* #define AGGRESSIVE_DEC */ +static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); struct kvm_stats_debugfs_item debugfs_entries[] = { { "exits", VCPU_STAT(sum_exits) }, @@ -72,16 +69,24 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { memcpy(get_paca()->kvm_slb, to_book3s(vcpu)->slb_shadow, sizeof(get_paca()->kvm_slb)); + memcpy(&get_paca()->shadow_vcpu, &to_book3s(vcpu)->shadow_vcpu, + sizeof(get_paca()->shadow_vcpu)); get_paca()->kvm_slb_max = to_book3s(vcpu)->slb_shadow_max; } void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { memcpy(to_book3s(vcpu)->slb_shadow, get_paca()->kvm_slb, sizeof(get_paca()->kvm_slb)); + memcpy(&to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, + sizeof(get_paca()->shadow_vcpu)); to_book3s(vcpu)->slb_shadow_max = get_paca()->kvm_slb_max; + + kvmppc_giveup_ext(vcpu, MSR_FP); + kvmppc_giveup_ext(vcpu, MSR_VEC); + kvmppc_giveup_ext(vcpu, MSR_VSX); } -#if defined(AGGRESSIVE_DEC) || defined(EXIT_DEBUG) +#if defined(EXIT_DEBUG) static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu) { u64 jd = mftb() - vcpu->arch.dec_jiffies; @@ -89,6 +94,23 @@ static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu) } #endif +static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.shadow_msr = vcpu->arch.msr; + /* Guest MSR values */ + vcpu->arch.shadow_msr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | + MSR_BE | MSR_DE; + /* Process MSR values */ + vcpu->arch.shadow_msr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | + MSR_EE; + /* External providers the guest reserved */ + vcpu->arch.shadow_msr |= (vcpu->arch.msr & vcpu->arch.guest_owned_ext); + /* 64-bit Process MSR values */ +#ifdef CONFIG_PPC_BOOK3S_64 + vcpu->arch.shadow_msr |= MSR_ISF | MSR_HV; +#endif +} + void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) { ulong old_msr = vcpu->arch.msr; @@ -96,12 +118,10 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) #ifdef EXIT_DEBUG printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); #endif + msr &= to_book3s(vcpu)->msr_mask; vcpu->arch.msr = msr; - vcpu->arch.shadow_msr = msr | MSR_USER32; - vcpu->arch.shadow_msr &= ( MSR_VEC | MSR_VSX | MSR_FP | MSR_FE0 | - MSR_USER64 | MSR_SE | MSR_BE | MSR_DE | - MSR_FE1); + kvmppc_recalc_shadow_msr(vcpu); if (msr & (MSR_WE|MSR_POW)) { if (!vcpu->arch.pending_exceptions) { @@ -125,11 +145,10 @@ void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) vcpu->arch.mmu.reset_msr(vcpu); } -void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) +static int kvmppc_book3s_vec2irqprio(unsigned int vec) { unsigned int prio; - vcpu->stat.queue_intr++; switch (vec) { case 0x100: prio = BOOK3S_IRQPRIO_SYSTEM_RESET; break; case 0x200: prio = BOOK3S_IRQPRIO_MACHINE_CHECK; break; @@ -149,15 +168,31 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) default: prio = BOOK3S_IRQPRIO_MAX; break; } - set_bit(prio, &vcpu->arch.pending_exceptions); + return prio; +} + +static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, + unsigned int vec) +{ + clear_bit(kvmppc_book3s_vec2irqprio(vec), + &vcpu->arch.pending_exceptions); +} + +void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) +{ + vcpu->stat.queue_intr++; + + set_bit(kvmppc_book3s_vec2irqprio(vec), + &vcpu->arch.pending_exceptions); #ifdef EXIT_DEBUG printk(KERN_INFO "Queueing interrupt %x\n", vec); #endif } -void kvmppc_core_queue_program(struct kvm_vcpu *vcpu) +void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) { + to_book3s(vcpu)->prog_flags = flags; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM); } @@ -171,6 +206,11 @@ int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) return test_bit(BOOK3S_INTERRUPT_DECREMENTER >> 7, &vcpu->arch.pending_exceptions); } +void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) +{ + kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); +} + void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { @@ -181,6 +221,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) { int deliver = 1; int vec = 0; + ulong flags = 0ULL; switch (priority) { case BOOK3S_IRQPRIO_DECREMENTER: @@ -214,6 +255,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) break; case BOOK3S_IRQPRIO_PROGRAM: vec = BOOK3S_INTERRUPT_PROGRAM; + flags = to_book3s(vcpu)->prog_flags; break; case BOOK3S_IRQPRIO_VSX: vec = BOOK3S_INTERRUPT_VSX; @@ -244,7 +286,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) #endif if (deliver) - kvmppc_inject_interrupt(vcpu, vec, 0ULL); + kvmppc_inject_interrupt(vcpu, vec, flags); return deliver; } @@ -254,21 +296,15 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned int priority; - /* XXX be more clever here - no need to mftb() on every entry */ - /* Issue DEC again if it's still active */ -#ifdef AGGRESSIVE_DEC - if (vcpu->arch.msr & MSR_EE) - if (kvmppc_get_dec(vcpu) & 0x80000000) - kvmppc_core_queue_dec(vcpu); -#endif - #ifdef EXIT_DEBUG if (vcpu->arch.pending_exceptions) printk(KERN_EMERG "KVM: Check pending: %lx\n", vcpu->arch.pending_exceptions); #endif priority = __ffs(*pending); while (priority <= (sizeof(unsigned int) * 8)) { - if (kvmppc_book3s_irqprio_deliver(vcpu, priority)) { + if (kvmppc_book3s_irqprio_deliver(vcpu, priority) && + (priority != BOOK3S_IRQPRIO_DECREMENTER)) { + /* DEC interrupts get cleared by mtdec */ clear_bit(priority, &vcpu->arch.pending_exceptions); break; } @@ -503,14 +539,14 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Page not found in guest PTE entries */ vcpu->arch.dear = vcpu->arch.fault_dear; to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr; - vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL); + vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EPERM) { /* Storage protection */ vcpu->arch.dear = vcpu->arch.fault_dear; to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE; to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT; - vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL); + vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EINVAL) { /* Page not found in guest SLB */ @@ -532,13 +568,122 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, r = kvmppc_emulate_mmio(run, vcpu); if ( r == RESUME_HOST_NV ) r = RESUME_HOST; - if ( r == RESUME_GUEST_NV ) - r = RESUME_GUEST; } return r; } +static inline int get_fpr_index(int i) +{ +#ifdef CONFIG_VSX + i *= 2; +#endif + return i; +} + +/* Give up external provider (FPU, Altivec, VSX) */ +static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) +{ + struct thread_struct *t = ¤t->thread; + u64 *vcpu_fpr = vcpu->arch.fpr; + u64 *vcpu_vsx = vcpu->arch.vsr; + u64 *thread_fpr = (u64*)t->fpr; + int i; + + if (!(vcpu->arch.guest_owned_ext & msr)) + return; + +#ifdef DEBUG_EXT + printk(KERN_INFO "Giving up ext 0x%lx\n", msr); +#endif + + switch (msr) { + case MSR_FP: + giveup_fpu(current); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) + vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; + + vcpu->arch.fpscr = t->fpscr.val; + break; + case MSR_VEC: +#ifdef CONFIG_ALTIVEC + giveup_altivec(current); + memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); + vcpu->arch.vscr = t->vscr; +#endif + break; + case MSR_VSX: +#ifdef CONFIG_VSX + __giveup_vsx(current); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) + vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; +#endif + break; + default: + BUG(); + } + + vcpu->arch.guest_owned_ext &= ~msr; + current->thread.regs->msr &= ~msr; + kvmppc_recalc_shadow_msr(vcpu); +} + +/* Handle external providers (FPU, Altivec, VSX) */ +static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, + ulong msr) +{ + struct thread_struct *t = ¤t->thread; + u64 *vcpu_fpr = vcpu->arch.fpr; + u64 *vcpu_vsx = vcpu->arch.vsr; + u64 *thread_fpr = (u64*)t->fpr; + int i; + + if (!(vcpu->arch.msr & msr)) { + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + return RESUME_GUEST; + } + +#ifdef DEBUG_EXT + printk(KERN_INFO "Loading up ext 0x%lx\n", msr); +#endif + + current->thread.regs->msr |= msr; + + switch (msr) { + case MSR_FP: + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) + thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; + + t->fpscr.val = vcpu->arch.fpscr; + t->fpexc_mode = 0; + kvmppc_load_up_fpu(); + break; + case MSR_VEC: +#ifdef CONFIG_ALTIVEC + memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); + t->vscr = vcpu->arch.vscr; + t->vrsave = -1; + kvmppc_load_up_altivec(); +#endif + break; + case MSR_VSX: +#ifdef CONFIG_VSX + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) + thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; + kvmppc_load_up_vsx(); +#endif + break; + default: + BUG(); + } + + vcpu->arch.guest_owned_ext |= msr; + + kvmppc_recalc_shadow_msr(vcpu); + + return RESUME_GUEST; +} + int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { @@ -563,7 +708,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_INST_STORAGE: vcpu->stat.pf_instruc++; /* only care about PTEG not found errors, but leave NX alone */ - if (vcpu->arch.shadow_msr & 0x40000000) { + if (vcpu->arch.shadow_srr1 & 0x40000000) { r = kvmppc_handle_pagefault(run, vcpu, vcpu->arch.pc, exit_nr); vcpu->stat.sp_instruc++; } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && @@ -575,7 +720,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, */ kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL); } else { - vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x58000000); + vcpu->arch.msr |= vcpu->arch.shadow_srr1 & 0x58000000; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL); r = RESUME_GUEST; @@ -621,6 +766,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_PROGRAM: { enum emulation_result er; + ulong flags; + + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; if (vcpu->arch.msr & MSR_PR) { #ifdef EXIT_DEBUG @@ -628,7 +776,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, #endif if ((vcpu->arch.last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) { - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + kvmppc_core_queue_program(vcpu, flags); r = RESUME_GUEST; break; } @@ -638,12 +786,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, er = kvmppc_emulate_instruction(run, vcpu); switch (er) { case EMULATE_DONE: - r = RESUME_GUEST; + r = RESUME_GUEST_NV; break; case EMULATE_FAIL: printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", __func__, vcpu->arch.pc, vcpu->arch.last_inst); - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + kvmppc_core_queue_program(vcpu, flags); r = RESUME_GUEST; break; default: @@ -653,23 +801,30 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } case BOOK3S_INTERRUPT_SYSCALL: #ifdef EXIT_DEBUG - printk(KERN_INFO "Syscall Nr %d\n", (int)vcpu->arch.gpr[0]); + printk(KERN_INFO "Syscall Nr %d\n", (int)kvmppc_get_gpr(vcpu, 0)); #endif vcpu->stat.syscall_exits++; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; break; - case BOOK3S_INTERRUPT_MACHINE_CHECK: case BOOK3S_INTERRUPT_FP_UNAVAIL: - case BOOK3S_INTERRUPT_TRACE: + r = kvmppc_handle_ext(vcpu, exit_nr, MSR_FP); + break; case BOOK3S_INTERRUPT_ALTIVEC: + r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VEC); + break; case BOOK3S_INTERRUPT_VSX: + r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VSX); + break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + case BOOK3S_INTERRUPT_TRACE: kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; break; default: /* Ugh - bork here! What did we get? */ - printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", exit_nr, vcpu->arch.pc, vcpu->arch.shadow_msr); + printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", + exit_nr, vcpu->arch.pc, vcpu->arch.shadow_srr1); r = RESUME_HOST; BUG(); break; @@ -712,10 +867,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; regs->pc = vcpu->arch.pc; - regs->cr = vcpu->arch.cr; + regs->cr = kvmppc_get_cr(vcpu); regs->ctr = vcpu->arch.ctr; regs->lr = vcpu->arch.lr; - regs->xer = vcpu->arch.xer; + regs->xer = kvmppc_get_xer(vcpu); regs->msr = vcpu->arch.msr; regs->srr0 = vcpu->arch.srr0; regs->srr1 = vcpu->arch.srr1; @@ -729,7 +884,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg7 = vcpu->arch.sprg6; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) - regs->gpr[i] = vcpu->arch.gpr[i]; + regs->gpr[i] = kvmppc_get_gpr(vcpu, i); return 0; } @@ -739,10 +894,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; vcpu->arch.pc = regs->pc; - vcpu->arch.cr = regs->cr; + kvmppc_set_cr(vcpu, regs->cr); vcpu->arch.ctr = regs->ctr; vcpu->arch.lr = regs->lr; - vcpu->arch.xer = regs->xer; + kvmppc_set_xer(vcpu, regs->xer); kvmppc_set_msr(vcpu, regs->msr); vcpu->arch.srr0 = regs->srr0; vcpu->arch.srr1 = regs->srr1; @@ -754,8 +909,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.sprg6 = regs->sprg5; vcpu->arch.sprg7 = regs->sprg6; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++) - vcpu->arch.gpr[i] = regs->gpr[i]; + for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) + kvmppc_set_gpr(vcpu, i, regs->gpr[i]); return 0; } @@ -850,7 +1005,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, int is_dirty = 0; int r, n; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = kvm_get_dirty_log(kvm, log, &is_dirty); if (r) @@ -858,7 +1013,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; ga = memslot->base_gfn << PAGE_SHIFT; ga_end = ga + (memslot->npages << PAGE_SHIFT); @@ -872,7 +1027,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, r = 0; out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -910,6 +1065,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem; vcpu->arch.trampoline_enter = kvmppc_trampoline_enter; vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem; + vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall; vcpu->arch.shadow_msr = MSR_USER64; @@ -943,6 +1099,10 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; + struct thread_struct ext_bkp; + bool save_vec = current->thread.used_vr; + bool save_vsx = current->thread.used_vsr; + ulong ext_msr; /* No need to go into the guest when all we do is going out */ if (signal_pending(current)) { @@ -950,6 +1110,35 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return -EINTR; } + /* Save FPU state in stack */ + if (current->thread.regs->msr & MSR_FP) + giveup_fpu(current); + memcpy(ext_bkp.fpr, current->thread.fpr, sizeof(current->thread.fpr)); + ext_bkp.fpscr = current->thread.fpscr; + ext_bkp.fpexc_mode = current->thread.fpexc_mode; + +#ifdef CONFIG_ALTIVEC + /* Save Altivec state in stack */ + if (save_vec) { + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); + memcpy(ext_bkp.vr, current->thread.vr, sizeof(ext_bkp.vr)); + ext_bkp.vscr = current->thread.vscr; + ext_bkp.vrsave = current->thread.vrsave; + } + ext_bkp.used_vr = current->thread.used_vr; +#endif + +#ifdef CONFIG_VSX + /* Save VSX state in stack */ + if (save_vsx && (current->thread.regs->msr & MSR_VSX)) + __giveup_vsx(current); + ext_bkp.used_vsr = current->thread.used_vsr; +#endif + + /* Remember the MSR with disabled extensions */ + ext_msr = current->thread.regs->msr; + /* XXX we get called with irq disabled - change that! */ local_irq_enable(); @@ -957,6 +1146,32 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) local_irq_disable(); + current->thread.regs->msr = ext_msr; + + /* Make sure we save the guest FPU/Altivec/VSX state */ + kvmppc_giveup_ext(vcpu, MSR_FP); + kvmppc_giveup_ext(vcpu, MSR_VEC); + kvmppc_giveup_ext(vcpu, MSR_VSX); + + /* Restore FPU state from stack */ + memcpy(current->thread.fpr, ext_bkp.fpr, sizeof(ext_bkp.fpr)); + current->thread.fpscr = ext_bkp.fpscr; + current->thread.fpexc_mode = ext_bkp.fpexc_mode; + +#ifdef CONFIG_ALTIVEC + /* Restore Altivec state from stack */ + if (save_vec && current->thread.used_vr) { + memcpy(current->thread.vr, ext_bkp.vr, sizeof(ext_bkp.vr)); + current->thread.vscr = ext_bkp.vscr; + current->thread.vrsave= ext_bkp.vrsave; + } + current->thread.used_vr = ext_bkp.used_vr; +#endif + +#ifdef CONFIG_VSX + current->thread.used_vsr = ext_bkp.used_vsr; +#endif + return ret; } diff --git a/arch/powerpc/kvm/book3s_64_emulate.c b/arch/powerpc/kvm/book3s_64_emulate.c index 1027eac..2b0ee7e 100644 --- a/arch/powerpc/kvm/book3s_64_emulate.c +++ b/arch/powerpc/kvm/book3s_64_emulate.c @@ -65,11 +65,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case 31: switch (get_xop(inst)) { case OP_31_XOP_MFMSR: - vcpu->arch.gpr[get_rt(inst)] = vcpu->arch.msr; + kvmppc_set_gpr(vcpu, get_rt(inst), vcpu->arch.msr); break; case OP_31_XOP_MTMSRD: { - ulong rs = vcpu->arch.gpr[get_rs(inst)]; + ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst)); if (inst & 0x10000) { vcpu->arch.msr &= ~(MSR_RI | MSR_EE); vcpu->arch.msr |= rs & (MSR_RI | MSR_EE); @@ -78,30 +78,30 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } case OP_31_XOP_MTMSR: - kvmppc_set_msr(vcpu, vcpu->arch.gpr[get_rs(inst)]); + kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, get_rs(inst))); break; case OP_31_XOP_MFSRIN: { int srnum; - srnum = (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf; + srnum = (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf; if (vcpu->arch.mmu.mfsrin) { u32 sr; sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); - vcpu->arch.gpr[get_rt(inst)] = sr; + kvmppc_set_gpr(vcpu, get_rt(inst), sr); } break; } case OP_31_XOP_MTSRIN: vcpu->arch.mmu.mtsrin(vcpu, - (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf, - vcpu->arch.gpr[get_rs(inst)]); + (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf, + kvmppc_get_gpr(vcpu, get_rs(inst))); break; case OP_31_XOP_TLBIE: case OP_31_XOP_TLBIEL: { bool large = (inst & 0x00200000) ? true : false; - ulong addr = vcpu->arch.gpr[get_rb(inst)]; + ulong addr = kvmppc_get_gpr(vcpu, get_rb(inst)); vcpu->arch.mmu.tlbie(vcpu, addr, large); break; } @@ -111,14 +111,16 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!vcpu->arch.mmu.slbmte) return EMULATE_FAIL; - vcpu->arch.mmu.slbmte(vcpu, vcpu->arch.gpr[get_rs(inst)], - vcpu->arch.gpr[get_rb(inst)]); + vcpu->arch.mmu.slbmte(vcpu, + kvmppc_get_gpr(vcpu, get_rs(inst)), + kvmppc_get_gpr(vcpu, get_rb(inst))); break; case OP_31_XOP_SLBIE: if (!vcpu->arch.mmu.slbie) return EMULATE_FAIL; - vcpu->arch.mmu.slbie(vcpu, vcpu->arch.gpr[get_rb(inst)]); + vcpu->arch.mmu.slbie(vcpu, + kvmppc_get_gpr(vcpu, get_rb(inst))); break; case OP_31_XOP_SLBIA: if (!vcpu->arch.mmu.slbia) @@ -132,9 +134,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, } else { ulong t, rb; - rb = vcpu->arch.gpr[get_rb(inst)]; + rb = kvmppc_get_gpr(vcpu, get_rb(inst)); t = vcpu->arch.mmu.slbmfee(vcpu, rb); - vcpu->arch.gpr[get_rt(inst)] = t; + kvmppc_set_gpr(vcpu, get_rt(inst), t); } break; case OP_31_XOP_SLBMFEV: @@ -143,20 +145,20 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, } else { ulong t, rb; - rb = vcpu->arch.gpr[get_rb(inst)]; + rb = kvmppc_get_gpr(vcpu, get_rb(inst)); t = vcpu->arch.mmu.slbmfev(vcpu, rb); - vcpu->arch.gpr[get_rt(inst)] = t; + kvmppc_set_gpr(vcpu, get_rt(inst), t); } break; case OP_31_XOP_DCBZ: { - ulong rb = vcpu->arch.gpr[get_rb(inst)]; + ulong rb = kvmppc_get_gpr(vcpu, get_rb(inst)); ulong ra = 0; ulong addr; u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; if (get_ra(inst)) - ra = vcpu->arch.gpr[get_ra(inst)]; + ra = kvmppc_get_gpr(vcpu, get_ra(inst)); addr = (ra + rb) & ~31ULL; if (!(vcpu->arch.msr & MSR_SF)) @@ -233,43 +235,44 @@ static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val) int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) { int emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_SDR1: - to_book3s(vcpu)->sdr1 = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->sdr1 = spr_val; break; case SPRN_DSISR: - to_book3s(vcpu)->dsisr = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->dsisr = spr_val; break; case SPRN_DAR: - vcpu->arch.dear = vcpu->arch.gpr[rs]; + vcpu->arch.dear = spr_val; break; case SPRN_HIOR: - to_book3s(vcpu)->hior = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hior = spr_val; break; case SPRN_IBAT0U ... SPRN_IBAT3L: case SPRN_IBAT4U ... SPRN_IBAT7L: case SPRN_DBAT0U ... SPRN_DBAT3L: case SPRN_DBAT4U ... SPRN_DBAT7L: - kvmppc_write_bat(vcpu, sprn, (u32)vcpu->arch.gpr[rs]); + kvmppc_write_bat(vcpu, sprn, (u32)spr_val); /* BAT writes happen so rarely that we're ok to flush * everything here */ kvmppc_mmu_pte_flush(vcpu, 0, 0); break; case SPRN_HID0: - to_book3s(vcpu)->hid[0] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[0] = spr_val; break; case SPRN_HID1: - to_book3s(vcpu)->hid[1] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[1] = spr_val; break; case SPRN_HID2: - to_book3s(vcpu)->hid[2] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[2] = spr_val; break; case SPRN_HID4: - to_book3s(vcpu)->hid[4] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[4] = spr_val; break; case SPRN_HID5: - to_book3s(vcpu)->hid[5] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[5] = spr_val; /* guest HID5 set can change is_dcbz32 */ if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV)) @@ -299,38 +302,38 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_SDR1: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->sdr1; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); break; case SPRN_DSISR: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->dsisr; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->dsisr); break; case SPRN_DAR: - vcpu->arch.gpr[rt] = vcpu->arch.dear; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break; case SPRN_HIOR: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hior; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior); break; case SPRN_HID0: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[0]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[0]); break; case SPRN_HID1: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[1]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[1]); break; case SPRN_HID2: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[2]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[2]); break; case SPRN_HID4: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[4]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[4]); break; case SPRN_HID5: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[5]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]); break; case SPRN_THRM1: case SPRN_THRM2: case SPRN_THRM3: case SPRN_CTRLF: case SPRN_CTRLT: - vcpu->arch.gpr[rt] = 0; + kvmppc_set_gpr(vcpu, rt, 0); break; default: printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); diff --git a/arch/powerpc/kvm/book3s_64_exports.c b/arch/powerpc/kvm/book3s_64_exports.c index 5b2db38..1dd5a1d 100644 --- a/arch/powerpc/kvm/book3s_64_exports.c +++ b/arch/powerpc/kvm/book3s_64_exports.c @@ -22,3 +22,11 @@ EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter); EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem); +EXPORT_SYMBOL_GPL(kvmppc_rmcall); +EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); +#ifdef CONFIG_ALTIVEC +EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); +#endif +#ifdef CONFIG_VSX +EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx); +#endif diff --git a/arch/powerpc/kvm/book3s_64_interrupts.S b/arch/powerpc/kvm/book3s_64_interrupts.S index 7b55d80..c1584d0 100644 --- a/arch/powerpc/kvm/book3s_64_interrupts.S +++ b/arch/powerpc/kvm/book3s_64_interrupts.S @@ -28,11 +28,6 @@ #define ULONG_SIZE 8 #define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE)) -.macro mfpaca tmp_reg, src_reg, offset, vcpu_reg - ld \tmp_reg, (PACA_EXMC+\offset)(r13) - std \tmp_reg, VCPU_GPR(\src_reg)(\vcpu_reg) -.endm - .macro DISABLE_INTERRUPTS mfmsr r0 rldicl r0,r0,48,1 @@ -40,6 +35,26 @@ mtmsrd r0,1 .endm +#define VCPU_LOAD_NVGPRS(vcpu) \ + ld r14, VCPU_GPR(r14)(vcpu); \ + ld r15, VCPU_GPR(r15)(vcpu); \ + ld r16, VCPU_GPR(r16)(vcpu); \ + ld r17, VCPU_GPR(r17)(vcpu); \ + ld r18, VCPU_GPR(r18)(vcpu); \ + ld r19, VCPU_GPR(r19)(vcpu); \ + ld r20, VCPU_GPR(r20)(vcpu); \ + ld r21, VCPU_GPR(r21)(vcpu); \ + ld r22, VCPU_GPR(r22)(vcpu); \ + ld r23, VCPU_GPR(r23)(vcpu); \ + ld r24, VCPU_GPR(r24)(vcpu); \ + ld r25, VCPU_GPR(r25)(vcpu); \ + ld r26, VCPU_GPR(r26)(vcpu); \ + ld r27, VCPU_GPR(r27)(vcpu); \ + ld r28, VCPU_GPR(r28)(vcpu); \ + ld r29, VCPU_GPR(r29)(vcpu); \ + ld r30, VCPU_GPR(r30)(vcpu); \ + ld r31, VCPU_GPR(r31)(vcpu); \ + /***************************************************************************** * * * Guest entry / exit code that is in kernel module memory (highmem) * @@ -67,61 +82,32 @@ kvm_start_entry: SAVE_NVGPRS(r1) /* Save LR */ - mflr r14 - std r14, _LINK(r1) - -/* XXX optimize non-volatile loading away */ -kvm_start_lightweight: + std r0, _LINK(r1) - DISABLE_INTERRUPTS + /* Load non-volatile guest state from the vcpu */ + VCPU_LOAD_NVGPRS(r4) /* Save R1/R2 in the PACA */ - std r1, PACAR1(r13) - std r2, (PACA_EXMC+EX_SRR0)(r13) + std r1, PACA_KVM_HOST_R1(r13) + std r2, PACA_KVM_HOST_R2(r13) + + /* XXX swap in/out on load? */ ld r3, VCPU_HIGHMEM_HANDLER(r4) - std r3, PACASAVEDMSR(r13) + std r3, PACA_KVM_VMHANDLER(r13) - /* Load non-volatile guest state from the vcpu */ - ld r14, VCPU_GPR(r14)(r4) - ld r15, VCPU_GPR(r15)(r4) - ld r16, VCPU_GPR(r16)(r4) - ld r17, VCPU_GPR(r17)(r4) - ld r18, VCPU_GPR(r18)(r4) - ld r19, VCPU_GPR(r19)(r4) - ld r20, VCPU_GPR(r20)(r4) - ld r21, VCPU_GPR(r21)(r4) - ld r22, VCPU_GPR(r22)(r4) - ld r23, VCPU_GPR(r23)(r4) - ld r24, VCPU_GPR(r24)(r4) - ld r25, VCPU_GPR(r25)(r4) - ld r26, VCPU_GPR(r26)(r4) - ld r27, VCPU_GPR(r27)(r4) - ld r28, VCPU_GPR(r28)(r4) - ld r29, VCPU_GPR(r29)(r4) - ld r30, VCPU_GPR(r30)(r4) - ld r31, VCPU_GPR(r31)(r4) +kvm_start_lightweight: ld r9, VCPU_PC(r4) /* r9 = vcpu->arch.pc */ ld r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */ - ld r3, VCPU_TRAMPOLINE_ENTER(r4) - mtsrr0 r3 - - LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)) - mtsrr1 r3 - - /* Load guest state in the respective registers */ - lwz r3, VCPU_CR(r4) /* r3 = vcpu->arch.cr */ - stw r3, (PACA_EXMC + EX_CCR)(r13) - - ld r3, VCPU_CTR(r4) /* r3 = vcpu->arch.ctr */ - mtctr r3 /* CTR = r3 */ + /* Load some guest state in the respective registers */ + ld r5, VCPU_CTR(r4) /* r5 = vcpu->arch.ctr */ + /* will be swapped in by rmcall */ ld r3, VCPU_LR(r4) /* r3 = vcpu->arch.lr */ mtlr r3 /* LR = r3 */ - ld r3, VCPU_XER(r4) /* r3 = vcpu->arch.xer */ - std r3, (PACA_EXMC + EX_R3)(r13) + DISABLE_INTERRUPTS /* Some guests may need to have dcbz set to 32 byte length. * @@ -141,36 +127,15 @@ kvm_start_lightweight: mtspr SPRN_HID5,r3 no_dcbz32_on: - /* Load guest GPRs */ - - ld r3, VCPU_GPR(r9)(r4) - std r3, (PACA_EXMC + EX_R9)(r13) - ld r3, VCPU_GPR(r10)(r4) - std r3, (PACA_EXMC + EX_R10)(r13) - ld r3, VCPU_GPR(r11)(r4) - std r3, (PACA_EXMC + EX_R11)(r13) - ld r3, VCPU_GPR(r12)(r4) - std r3, (PACA_EXMC + EX_R12)(r13) - ld r3, VCPU_GPR(r13)(r4) - std r3, (PACA_EXMC + EX_R13)(r13) - - ld r0, VCPU_GPR(r0)(r4) - ld r1, VCPU_GPR(r1)(r4) - ld r2, VCPU_GPR(r2)(r4) - ld r3, VCPU_GPR(r3)(r4) - ld r5, VCPU_GPR(r5)(r4) - ld r6, VCPU_GPR(r6)(r4) - ld r7, VCPU_GPR(r7)(r4) - ld r8, VCPU_GPR(r8)(r4) - ld r4, VCPU_GPR(r4)(r4) - - /* This sets the Magic value for the trampoline */ - - li r11, 1 - stb r11, PACA_KVM_IN_GUEST(r13) + + ld r6, VCPU_RMCALL(r4) + mtctr r6 + + ld r3, VCPU_TRAMPOLINE_ENTER(r4) + LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR)) /* Jump to SLB patching handlder and into our guest */ - RFI + bctr /* * This is the handler in module memory. It gets jumped at from the @@ -184,125 +149,70 @@ kvmppc_handler_highmem: /* * Register usage at this point: * - * R00 = guest R13 - * R01 = host R1 - * R02 = host R2 - * R10 = guest PC - * R11 = guest MSR - * R12 = exit handler id - * R13 = PACA - * PACA.exmc.R9 = guest R1 - * PACA.exmc.R10 = guest R10 - * PACA.exmc.R11 = guest R11 - * PACA.exmc.R12 = guest R12 - * PACA.exmc.R13 = guest R2 - * PACA.exmc.DAR = guest DAR - * PACA.exmc.DSISR = guest DSISR - * PACA.exmc.LR = guest instruction - * PACA.exmc.CCR = guest CR - * PACA.exmc.SRR0 = guest R0 + * R0 = guest last inst + * R1 = host R1 + * R2 = host R2 + * R3 = guest PC + * R4 = guest MSR + * R5 = guest DAR + * R6 = guest DSISR + * R13 = PACA + * PACA.KVM.* = guest * * */ - std r3, (PACA_EXMC+EX_R3)(r13) + /* R7 = vcpu */ + ld r7, GPR4(r1) - /* save the exit id in R3 */ - mr r3, r12 + /* Now save the guest state */ - /* R12 = vcpu */ - ld r12, GPR4(r1) + stw r0, VCPU_LAST_INST(r7) - /* Now save the guest state */ + std r3, VCPU_PC(r7) + std r4, VCPU_SHADOW_SRR1(r7) + std r5, VCPU_FAULT_DEAR(r7) + std r6, VCPU_FAULT_DSISR(r7) - std r0, VCPU_GPR(r13)(r12) - std r4, VCPU_GPR(r4)(r12) - std r5, VCPU_GPR(r5)(r12) - std r6, VCPU_GPR(r6)(r12) - std r7, VCPU_GPR(r7)(r12) - std r8, VCPU_GPR(r8)(r12) - std r9, VCPU_GPR(r9)(r12) - - /* get registers from PACA */ - mfpaca r5, r0, EX_SRR0, r12 - mfpaca r5, r3, EX_R3, r12 - mfpaca r5, r1, EX_R9, r12 - mfpaca r5, r10, EX_R10, r12 - mfpaca r5, r11, EX_R11, r12 - mfpaca r5, r12, EX_R12, r12 - mfpaca r5, r2, EX_R13, r12 - - lwz r5, (PACA_EXMC+EX_LR)(r13) - stw r5, VCPU_LAST_INST(r12) - - lwz r5, (PACA_EXMC+EX_CCR)(r13) - stw r5, VCPU_CR(r12) - - ld r5, VCPU_HFLAGS(r12) + ld r5, VCPU_HFLAGS(r7) rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */ beq no_dcbz32_off + li r4, 0 mfspr r5,SPRN_HID5 - rldimi r5,r5,6,56 + rldimi r5,r4,6,56 mtspr SPRN_HID5,r5 no_dcbz32_off: - /* XXX maybe skip on lightweight? */ - std r14, VCPU_GPR(r14)(r12) - std r15, VCPU_GPR(r15)(r12) - std r16, VCPU_GPR(r16)(r12) - std r17, VCPU_GPR(r17)(r12) - std r18, VCPU_GPR(r18)(r12) - std r19, VCPU_GPR(r19)(r12) - std r20, VCPU_GPR(r20)(r12) - std r21, VCPU_GPR(r21)(r12) - std r22, VCPU_GPR(r22)(r12) - std r23, VCPU_GPR(r23)(r12) - std r24, VCPU_GPR(r24)(r12) - std r25, VCPU_GPR(r25)(r12) - std r26, VCPU_GPR(r26)(r12) - std r27, VCPU_GPR(r27)(r12) - std r28, VCPU_GPR(r28)(r12) - std r29, VCPU_GPR(r29)(r12) - std r30, VCPU_GPR(r30)(r12) - std r31, VCPU_GPR(r31)(r12) - - /* Restore non-volatile host registers (r14 - r31) */ - REST_NVGPRS(r1) - - /* Save guest PC (R10) */ - std r10, VCPU_PC(r12) - - /* Save guest msr (R11) */ - std r11, VCPU_SHADOW_MSR(r12) - - /* Save guest CTR (in R12) */ + std r14, VCPU_GPR(r14)(r7) + std r15, VCPU_GPR(r15)(r7) + std r16, VCPU_GPR(r16)(r7) + std r17, VCPU_GPR(r17)(r7) + std r18, VCPU_GPR(r18)(r7) + std r19, VCPU_GPR(r19)(r7) + std r20, VCPU_GPR(r20)(r7) + std r21, VCPU_GPR(r21)(r7) + std r22, VCPU_GPR(r22)(r7) + std r23, VCPU_GPR(r23)(r7) + std r24, VCPU_GPR(r24)(r7) + std r25, VCPU_GPR(r25)(r7) + std r26, VCPU_GPR(r26)(r7) + std r27, VCPU_GPR(r27)(r7) + std r28, VCPU_GPR(r28)(r7) + std r29, VCPU_GPR(r29)(r7) + std r30, VCPU_GPR(r30)(r7) + std r31, VCPU_GPR(r31)(r7) + + /* Save guest CTR */ mfctr r5 - std r5, VCPU_CTR(r12) + std r5, VCPU_CTR(r7) /* Save guest LR */ mflr r5 - std r5, VCPU_LR(r12) - - /* Save guest XER */ - mfxer r5 - std r5, VCPU_XER(r12) - - /* Save guest DAR */ - ld r5, (PACA_EXMC+EX_DAR)(r13) - std r5, VCPU_FAULT_DEAR(r12) - - /* Save guest DSISR */ - lwz r5, (PACA_EXMC+EX_DSISR)(r13) - std r5, VCPU_FAULT_DSISR(r12) + std r5, VCPU_LR(r7) /* Restore host msr -> SRR1 */ - ld r7, VCPU_HOST_MSR(r12) - mtsrr1 r7 - - /* Restore host IP -> SRR0 */ - ld r6, VCPU_HOST_RETIP(r12) - mtsrr0 r6 + ld r6, VCPU_HOST_MSR(r7) /* * For some interrupts, we need to call the real Linux @@ -314,13 +224,14 @@ no_dcbz32_off: * r3 = address of interrupt handler (exit reason) */ - cmpwi r3, BOOK3S_INTERRUPT_EXTERNAL + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL beq call_linux_handler - cmpwi r3, BOOK3S_INTERRUPT_DECREMENTER + cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER beq call_linux_handler - /* Back to Interruptable Mode! (goto kvm_return_point) */ - RFI + /* Back to EE=1 */ + mtmsr r6 + b kvm_return_point call_linux_handler: @@ -333,16 +244,22 @@ call_linux_handler: * interrupt handler! * * R3 still contains the exit code, - * R6 VCPU_HOST_RETIP and - * R7 VCPU_HOST_MSR + * R5 VCPU_HOST_RETIP and + * R6 VCPU_HOST_MSR */ - mtlr r3 + /* Restore host IP -> SRR0 */ + ld r5, VCPU_HOST_RETIP(r7) + + /* XXX Better move to a safe function? + * What if we get an HTAB flush in between mtsrr0 and mtsrr1? */ - ld r5, VCPU_TRAMPOLINE_LOWMEM(r12) - mtsrr0 r5 - LOAD_REG_IMMEDIATE(r5, MSR_KERNEL & ~(MSR_IR | MSR_DR)) - mtsrr1 r5 + mtlr r12 + + ld r4, VCPU_TRAMPOLINE_LOWMEM(r7) + mtsrr0 r4 + LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)) + mtsrr1 r3 RFI @@ -351,42 +268,51 @@ kvm_return_point: /* Jump back to lightweight entry if we're supposed to */ /* go back into the guest */ - mr r5, r3 + + /* Pass the exit number as 3rd argument to kvmppc_handle_exit */ + mr r5, r12 + /* Restore r3 (kvm_run) and r4 (vcpu) */ REST_2GPRS(3, r1) bl KVMPPC_HANDLE_EXIT -#if 0 /* XXX get lightweight exits back */ + /* If RESUME_GUEST, get back in the loop */ cmpwi r3, RESUME_GUEST - bne kvm_exit_heavyweight + beq kvm_loop_lightweight - /* put VCPU and KVM_RUN back into place and roll again! */ - REST_2GPRS(3, r1) - b kvm_start_lightweight + cmpwi r3, RESUME_GUEST_NV + beq kvm_loop_heavyweight -kvm_exit_heavyweight: - /* Restore non-volatile host registers */ - ld r14, _LINK(r1) - mtlr r14 - REST_NVGPRS(r1) +kvm_exit_loop: - addi r1, r1, SWITCH_FRAME_SIZE -#else ld r4, _LINK(r1) mtlr r4 - cmpwi r3, RESUME_GUEST - bne kvm_exit_heavyweight + /* Restore non-volatile host registers (r14 - r31) */ + REST_NVGPRS(r1) + + addi r1, r1, SWITCH_FRAME_SIZE + blr + +kvm_loop_heavyweight: + + ld r4, _LINK(r1) + std r4, (16 + SWITCH_FRAME_SIZE)(r1) + /* Load vcpu and cpu_run */ REST_2GPRS(3, r1) - addi r1, r1, SWITCH_FRAME_SIZE + /* Load non-volatile guest state from the vcpu */ + VCPU_LOAD_NVGPRS(r4) - b kvm_start_entry + /* Jump back into the beginning of this function */ + b kvm_start_lightweight -kvm_exit_heavyweight: +kvm_loop_lightweight: - addi r1, r1, SWITCH_FRAME_SIZE -#endif + /* We'll need the vcpu pointer */ + REST_GPR(4, r1) + + /* Jump back into the beginning of this function */ + b kvm_start_lightweight - blr diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index e4beeb3..512dcff 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -54,7 +54,7 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( if (!vcpu_book3s->slb[i].valid) continue; - if (vcpu_book3s->slb[i].large) + if (vcpu_book3s->slb[i].tb) cmp_esid = esid_1t; if (vcpu_book3s->slb[i].esid == cmp_esid) @@ -65,9 +65,10 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( eaddr, esid, esid_1t); for (i = 0; i < vcpu_book3s->slb_nr; i++) { if (vcpu_book3s->slb[i].vsid) - dprintk(" %d: %c%c %llx %llx\n", i, + dprintk(" %d: %c%c%c %llx %llx\n", i, vcpu_book3s->slb[i].valid ? 'v' : ' ', vcpu_book3s->slb[i].large ? 'l' : ' ', + vcpu_book3s->slb[i].tb ? 't' : ' ', vcpu_book3s->slb[i].esid, vcpu_book3s->slb[i].vsid); } @@ -84,7 +85,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, if (!slb) return 0; - if (slb->large) + if (slb->tb) return (((u64)eaddr >> 12) & 0xfffffff) | (((u64)slb->vsid) << 28); @@ -309,7 +310,8 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) slbe = &vcpu_book3s->slb[slb_nr]; slbe->large = (rs & SLB_VSID_L) ? 1 : 0; - slbe->esid = slbe->large ? esid_1t : esid; + slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0; + slbe->esid = slbe->tb ? esid_1t : esid; slbe->vsid = rs >> 12; slbe->valid = (rb & SLB_ESID_V) ? 1 : 0; slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0; diff --git a/arch/powerpc/kvm/book3s_64_rmhandlers.S b/arch/powerpc/kvm/book3s_64_rmhandlers.S index fb7dd2e..c83c60a 100644 --- a/arch/powerpc/kvm/book3s_64_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_64_rmhandlers.S @@ -45,36 +45,25 @@ kvmppc_trampoline_\intno: * To distinguish, we check a magic byte in the PACA */ mfspr r13, SPRN_SPRG_PACA /* r13 = PACA */ - std r12, (PACA_EXMC + EX_R12)(r13) + std r12, PACA_KVM_SCRATCH0(r13) mfcr r12 - stw r12, (PACA_EXMC + EX_CCR)(r13) + stw r12, PACA_KVM_SCRATCH1(r13) lbz r12, PACA_KVM_IN_GUEST(r13) - cmpwi r12, 0 + cmpwi r12, KVM_GUEST_MODE_NONE bne ..kvmppc_handler_hasmagic_\intno /* No KVM guest? Then jump back to the Linux handler! */ - lwz r12, (PACA_EXMC + EX_CCR)(r13) + lwz r12, PACA_KVM_SCRATCH1(r13) mtcr r12 - ld r12, (PACA_EXMC + EX_R12)(r13) + ld r12, PACA_KVM_SCRATCH0(r13) mfspr r13, SPRN_SPRG_SCRATCH0 /* r13 = original r13 */ b kvmppc_resume_\intno /* Get back original handler */ /* Now we know we're handling a KVM guest */ ..kvmppc_handler_hasmagic_\intno: - /* Unset guest state */ - li r12, 0 - stb r12, PACA_KVM_IN_GUEST(r13) - std r1, (PACA_EXMC+EX_R9)(r13) - std r10, (PACA_EXMC+EX_R10)(r13) - std r11, (PACA_EXMC+EX_R11)(r13) - std r2, (PACA_EXMC+EX_R13)(r13) - - mfsrr0 r10 - mfsrr1 r11 - - /* Restore R1/R2 so we can handle faults */ - ld r1, PACAR1(r13) - ld r2, (PACA_EXMC+EX_SRR0)(r13) + /* Should we just skip the faulting instruction? */ + cmpwi r12, KVM_GUEST_MODE_SKIP + beq kvmppc_handler_skip_ins /* Let's store which interrupt we're handling */ li r12, \intno @@ -102,23 +91,107 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX /* + * Bring us back to the faulting code, but skip the + * faulting instruction. + * + * This is a generic exit path from the interrupt + * trampolines above. + * + * Input Registers: + * + * R12 = free + * R13 = PACA + * PACA.KVM.SCRATCH0 = guest R12 + * PACA.KVM.SCRATCH1 = guest CR + * SPRG_SCRATCH0 = guest R13 + * + */ +kvmppc_handler_skip_ins: + + /* Patch the IP to the next instruction */ + mfsrr0 r12 + addi r12, r12, 4 + mtsrr0 r12 + + /* Clean up all state */ + lwz r12, PACA_KVM_SCRATCH1(r13) + mtcr r12 + ld r12, PACA_KVM_SCRATCH0(r13) + mfspr r13, SPRN_SPRG_SCRATCH0 + + /* And get back into the code */ + RFI + +/* * This trampoline brings us back to a real mode handler * * Input Registers: * - * R6 = SRR0 - * R7 = SRR1 + * R5 = SRR0 + * R6 = SRR1 * LR = real-mode IP * */ .global kvmppc_handler_lowmem_trampoline kvmppc_handler_lowmem_trampoline: - mtsrr0 r6 - mtsrr1 r7 + mtsrr0 r5 + mtsrr1 r6 blr kvmppc_handler_lowmem_trampoline_end: +/* + * Call a function in real mode + * + * Input Registers: + * + * R3 = function + * R4 = MSR + * R5 = CTR + * + */ +_GLOBAL(kvmppc_rmcall) + mtmsr r4 /* Disable relocation, so mtsrr + doesn't get interrupted */ + mtctr r5 + mtsrr0 r3 + mtsrr1 r4 + RFI + +/* + * Activate current's external feature (FPU/Altivec/VSX) + */ +#define define_load_up(what) \ + \ +_GLOBAL(kvmppc_load_up_ ## what); \ + subi r1, r1, INT_FRAME_SIZE; \ + mflr r3; \ + std r3, _LINK(r1); \ + mfmsr r4; \ + std r31, GPR3(r1); \ + mr r31, r4; \ + li r5, MSR_DR; \ + oris r5, r5, MSR_EE@h; \ + andc r4, r4, r5; \ + mtmsr r4; \ + \ + bl .load_up_ ## what; \ + \ + mtmsr r31; \ + ld r3, _LINK(r1); \ + ld r31, GPR3(r1); \ + addi r1, r1, INT_FRAME_SIZE; \ + mtlr r3; \ + blr + +define_load_up(fpu) +#ifdef CONFIG_ALTIVEC +define_load_up(altivec) +#endif +#ifdef CONFIG_VSX +define_load_up(vsx) +#endif + .global kvmppc_trampoline_lowmem kvmppc_trampoline_lowmem: .long kvmppc_handler_lowmem_trampoline - _stext diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index ecd237a..35b76272 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -31,7 +31,7 @@ #define REBOLT_SLB_ENTRY(num) \ ld r10, SHADOW_SLB_ESID(num)(r11); \ cmpdi r10, 0; \ - beq slb_exit_skip_1; \ + beq slb_exit_skip_ ## num; \ oris r10, r10, SLB_ESID_V@h; \ ld r9, SHADOW_SLB_VSID(num)(r11); \ slbmte r9, r10; \ @@ -51,23 +51,21 @@ kvmppc_handler_trampoline_enter: * * MSR = ~IR|DR * R13 = PACA + * R1 = host R1 + * R2 = host R2 * R9 = guest IP * R10 = guest MSR - * R11 = free - * R12 = free - * PACA[PACA_EXMC + EX_R9] = guest R9 - * PACA[PACA_EXMC + EX_R10] = guest R10 - * PACA[PACA_EXMC + EX_R11] = guest R11 - * PACA[PACA_EXMC + EX_R12] = guest R12 - * PACA[PACA_EXMC + EX_R13] = guest R13 - * PACA[PACA_EXMC + EX_CCR] = guest CR - * PACA[PACA_EXMC + EX_R3] = guest XER + * all other GPRS = free + * PACA[KVM_CR] = guest CR + * PACA[KVM_XER] = guest XER */ mtsrr0 r9 mtsrr1 r10 - mtspr SPRN_SPRG_SCRATCH0, r0 + /* Activate guest mode, so faults get handled by KVM */ + li r11, KVM_GUEST_MODE_GUEST + stb r11, PACA_KVM_IN_GUEST(r13) /* Remove LPAR shadow entries */ @@ -131,20 +129,27 @@ slb_do_enter: /* Enter guest */ - mfspr r0, SPRN_SPRG_SCRATCH0 - - ld r9, (PACA_EXMC+EX_R9)(r13) - ld r10, (PACA_EXMC+EX_R10)(r13) - ld r12, (PACA_EXMC+EX_R12)(r13) - - lwz r11, (PACA_EXMC+EX_CCR)(r13) + ld r0, (PACA_KVM_R0)(r13) + ld r1, (PACA_KVM_R1)(r13) + ld r2, (PACA_KVM_R2)(r13) + ld r3, (PACA_KVM_R3)(r13) + ld r4, (PACA_KVM_R4)(r13) + ld r5, (PACA_KVM_R5)(r13) + ld r6, (PACA_KVM_R6)(r13) + ld r7, (PACA_KVM_R7)(r13) + ld r8, (PACA_KVM_R8)(r13) + ld r9, (PACA_KVM_R9)(r13) + ld r10, (PACA_KVM_R10)(r13) + ld r12, (PACA_KVM_R12)(r13) + + lwz r11, (PACA_KVM_CR)(r13) mtcr r11 - ld r11, (PACA_EXMC+EX_R3)(r13) + ld r11, (PACA_KVM_XER)(r13) mtxer r11 - ld r11, (PACA_EXMC+EX_R11)(r13) - ld r13, (PACA_EXMC+EX_R13)(r13) + ld r11, (PACA_KVM_R11)(r13) + ld r13, (PACA_KVM_R13)(r13) RFI kvmppc_handler_trampoline_enter_end: @@ -162,28 +167,54 @@ kvmppc_handler_trampoline_exit: /* Register usage at this point: * - * SPRG_SCRATCH0 = guest R13 - * R01 = host R1 - * R02 = host R2 - * R10 = guest PC - * R11 = guest MSR - * R12 = exit handler id - * R13 = PACA - * PACA.exmc.CCR = guest CR - * PACA.exmc.R9 = guest R1 - * PACA.exmc.R10 = guest R10 - * PACA.exmc.R11 = guest R11 - * PACA.exmc.R12 = guest R12 - * PACA.exmc.R13 = guest R2 + * SPRG_SCRATCH0 = guest R13 + * R12 = exit handler id + * R13 = PACA + * PACA.KVM.SCRATCH0 = guest R12 + * PACA.KVM.SCRATCH1 = guest CR * */ /* Save registers */ - std r0, (PACA_EXMC+EX_SRR0)(r13) - std r9, (PACA_EXMC+EX_R3)(r13) - std r10, (PACA_EXMC+EX_LR)(r13) - std r11, (PACA_EXMC+EX_DAR)(r13) + std r0, PACA_KVM_R0(r13) + std r1, PACA_KVM_R1(r13) + std r2, PACA_KVM_R2(r13) + std r3, PACA_KVM_R3(r13) + std r4, PACA_KVM_R4(r13) + std r5, PACA_KVM_R5(r13) + std r6, PACA_KVM_R6(r13) + std r7, PACA_KVM_R7(r13) + std r8, PACA_KVM_R8(r13) + std r9, PACA_KVM_R9(r13) + std r10, PACA_KVM_R10(r13) + std r11, PACA_KVM_R11(r13) + + /* Restore R1/R2 so we can handle faults */ + ld r1, PACA_KVM_HOST_R1(r13) + ld r2, PACA_KVM_HOST_R2(r13) + + /* Save guest PC and MSR in GPRs */ + mfsrr0 r3 + mfsrr1 r4 + + /* Get scratch'ed off registers */ + mfspr r9, SPRN_SPRG_SCRATCH0 + std r9, PACA_KVM_R13(r13) + + ld r8, PACA_KVM_SCRATCH0(r13) + std r8, PACA_KVM_R12(r13) + + lwz r7, PACA_KVM_SCRATCH1(r13) + stw r7, PACA_KVM_CR(r13) + + /* Save more register state */ + + mfxer r6 + stw r6, PACA_KVM_XER(r13) + + mfdar r5 + mfdsisr r6 /* * In order for us to easily get the last instruction, @@ -202,17 +233,28 @@ kvmppc_handler_trampoline_exit: ld_last_inst: /* Save off the guest instruction we're at */ + + /* Set guest mode to 'jump over instruction' so if lwz faults + * we'll just continue at the next IP. */ + li r9, KVM_GUEST_MODE_SKIP + stb r9, PACA_KVM_IN_GUEST(r13) + /* 1) enable paging for data */ mfmsr r9 ori r11, r9, MSR_DR /* Enable paging for data */ mtmsr r11 /* 2) fetch the instruction */ - lwz r0, 0(r10) + li r0, KVM_INST_FETCH_FAILED /* In case lwz faults */ + lwz r0, 0(r3) /* 3) disable paging again */ mtmsr r9 no_ld_last_inst: + /* Unset guest mode */ + li r9, KVM_GUEST_MODE_NONE + stb r9, PACA_KVM_IN_GUEST(r13) + /* Restore bolted entries from the shadow and fix it along the way */ /* We don't store anything in entry 0, so we don't need to take care of it */ @@ -233,29 +275,27 @@ no_ld_last_inst: slb_do_exit: - /* Restore registers */ - - ld r11, (PACA_EXMC+EX_DAR)(r13) - ld r10, (PACA_EXMC+EX_LR)(r13) - ld r9, (PACA_EXMC+EX_R3)(r13) - - /* Save last inst */ - stw r0, (PACA_EXMC+EX_LR)(r13) - - /* Save DAR and DSISR before going to paged mode */ - mfdar r0 - std r0, (PACA_EXMC+EX_DAR)(r13) - mfdsisr r0 - stw r0, (PACA_EXMC+EX_DSISR)(r13) + /* Register usage at this point: + * + * R0 = guest last inst + * R1 = host R1 + * R2 = host R2 + * R3 = guest PC + * R4 = guest MSR + * R5 = guest DAR + * R6 = guest DSISR + * R12 = exit handler id + * R13 = PACA + * PACA.KVM.* = guest * + * + */ /* RFI into the highmem handler */ - mfmsr r0 - ori r0, r0, MSR_IR|MSR_DR|MSR_RI /* Enable paging */ - mtsrr1 r0 - ld r0, PACASAVEDMSR(r13) /* Highmem handler address */ - mtsrr0 r0 - - mfspr r0, SPRN_SPRG_SCRATCH0 + mfmsr r7 + ori r7, r7, MSR_IR|MSR_DR|MSR_RI /* Enable paging */ + mtsrr1 r7 + ld r8, PACA_KVM_VMHANDLER(r13) /* Highmem handler address */ + mtsrr0 r8 RFI kvmppc_handler_trampoline_exit_end: diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 06f5a9e..4d686cc 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -69,10 +69,10 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < 32; i += 4) { printk("gpr%02d: %08lx %08lx %08lx %08lx\n", i, - vcpu->arch.gpr[i], - vcpu->arch.gpr[i+1], - vcpu->arch.gpr[i+2], - vcpu->arch.gpr[i+3]); + kvmppc_get_gpr(vcpu, i), + kvmppc_get_gpr(vcpu, i+1), + kvmppc_get_gpr(vcpu, i+2), + kvmppc_get_gpr(vcpu, i+3)); } } @@ -82,8 +82,32 @@ static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, set_bit(priority, &vcpu->arch.pending_exceptions); } -void kvmppc_core_queue_program(struct kvm_vcpu *vcpu) +static void kvmppc_core_queue_dtlb_miss(struct kvm_vcpu *vcpu, + ulong dear_flags, ulong esr_flags) { + vcpu->arch.queued_dear = dear_flags; + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS); +} + +static void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, + ulong dear_flags, ulong esr_flags) +{ + vcpu->arch.queued_dear = dear_flags; + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE); +} + +static void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, + ulong esr_flags) +{ + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE); +} + +void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags) +{ + vcpu->arch.queued_esr = esr_flags; kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM); } @@ -97,6 +121,11 @@ int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) return test_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); } +void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); +} + void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { @@ -109,14 +138,19 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, { int allowed = 0; ulong msr_mask; + bool update_esr = false, update_dear = false; switch (priority) { - case BOOKE_IRQPRIO_PROGRAM: case BOOKE_IRQPRIO_DTLB_MISS: - case BOOKE_IRQPRIO_ITLB_MISS: - case BOOKE_IRQPRIO_SYSCALL: case BOOKE_IRQPRIO_DATA_STORAGE: + update_dear = true; + /* fall through */ case BOOKE_IRQPRIO_INST_STORAGE: + case BOOKE_IRQPRIO_PROGRAM: + update_esr = true; + /* fall through */ + case BOOKE_IRQPRIO_ITLB_MISS: + case BOOKE_IRQPRIO_SYSCALL: case BOOKE_IRQPRIO_FP_UNAVAIL: case BOOKE_IRQPRIO_SPE_UNAVAIL: case BOOKE_IRQPRIO_SPE_FP_DATA: @@ -151,6 +185,10 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, vcpu->arch.srr0 = vcpu->arch.pc; vcpu->arch.srr1 = vcpu->arch.msr; vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; + if (update_esr == true) + vcpu->arch.esr = vcpu->arch.queued_esr; + if (update_dear == true) + vcpu->arch.dear = vcpu->arch.queued_dear; kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask); clear_bit(priority, &vcpu->arch.pending_exceptions); @@ -223,8 +261,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, if (vcpu->arch.msr & MSR_PR) { /* Program traps generated by user-level software must be handled * by the guest kernel. */ - vcpu->arch.esr = vcpu->arch.fault_esr; - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM); + kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr); r = RESUME_GUEST; kvmppc_account_exit(vcpu, USR_PR_INST); break; @@ -280,16 +317,14 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOKE_INTERRUPT_DATA_STORAGE: - vcpu->arch.dear = vcpu->arch.fault_dear; - vcpu->arch.esr = vcpu->arch.fault_esr; - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE); + kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear, + vcpu->arch.fault_esr); kvmppc_account_exit(vcpu, DSI_EXITS); r = RESUME_GUEST; break; case BOOKE_INTERRUPT_INST_STORAGE: - vcpu->arch.esr = vcpu->arch.fault_esr; - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE); + kvmppc_core_queue_inst_storage(vcpu, vcpu->arch.fault_esr); kvmppc_account_exit(vcpu, ISI_EXITS); r = RESUME_GUEST; break; @@ -310,9 +345,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr); if (gtlb_index < 0) { /* The guest didn't have a mapping for it. */ - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS); - vcpu->arch.dear = vcpu->arch.fault_dear; - vcpu->arch.esr = vcpu->arch.fault_esr; + kvmppc_core_queue_dtlb_miss(vcpu, + vcpu->arch.fault_dear, + vcpu->arch.fault_esr); kvmppc_mmu_dtlb_miss(vcpu); kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS); r = RESUME_GUEST; @@ -426,7 +461,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { vcpu->arch.pc = 0; vcpu->arch.msr = 0; - vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */ + kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ vcpu->arch.shadow_pid = 1; @@ -444,10 +479,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; regs->pc = vcpu->arch.pc; - regs->cr = vcpu->arch.cr; + regs->cr = kvmppc_get_cr(vcpu); regs->ctr = vcpu->arch.ctr; regs->lr = vcpu->arch.lr; - regs->xer = vcpu->arch.xer; + regs->xer = kvmppc_get_xer(vcpu); regs->msr = vcpu->arch.msr; regs->srr0 = vcpu->arch.srr0; regs->srr1 = vcpu->arch.srr1; @@ -461,7 +496,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg7 = vcpu->arch.sprg6; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) - regs->gpr[i] = vcpu->arch.gpr[i]; + regs->gpr[i] = kvmppc_get_gpr(vcpu, i); return 0; } @@ -471,10 +506,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; vcpu->arch.pc = regs->pc; - vcpu->arch.cr = regs->cr; + kvmppc_set_cr(vcpu, regs->cr); vcpu->arch.ctr = regs->ctr; vcpu->arch.lr = regs->lr; - vcpu->arch.xer = regs->xer; + kvmppc_set_xer(vcpu, regs->xer); kvmppc_set_msr(vcpu, regs->msr); vcpu->arch.srr0 = regs->srr0; vcpu->arch.srr1 = regs->srr1; @@ -486,8 +521,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.sprg6 = regs->sprg5; vcpu->arch.sprg7 = regs->sprg6; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++) - vcpu->arch.gpr[i] = regs->gpr[i]; + for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) + kvmppc_set_gpr(vcpu, i, regs->gpr[i]); return 0; } diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index aebc65e..cbc790e 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -62,20 +62,20 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case OP_31_XOP_MFMSR: rt = get_rt(inst); - vcpu->arch.gpr[rt] = vcpu->arch.msr; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.msr); kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS); break; case OP_31_XOP_MTMSR: rs = get_rs(inst); kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS); - kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]); + kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs)); break; case OP_31_XOP_WRTEE: rs = get_rs(inst); vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) - | (vcpu->arch.gpr[rs] & MSR_EE); + | (kvmppc_get_gpr(vcpu, rs) & MSR_EE); kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); break; @@ -101,22 +101,23 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) { int emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_DEAR: - vcpu->arch.dear = vcpu->arch.gpr[rs]; break; + vcpu->arch.dear = spr_val; break; case SPRN_ESR: - vcpu->arch.esr = vcpu->arch.gpr[rs]; break; + vcpu->arch.esr = spr_val; break; case SPRN_DBCR0: - vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.dbcr0 = spr_val; break; case SPRN_DBCR1: - vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.dbcr1 = spr_val; break; case SPRN_DBSR: - vcpu->arch.dbsr &= ~vcpu->arch.gpr[rs]; break; + vcpu->arch.dbsr &= ~spr_val; break; case SPRN_TSR: - vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break; + vcpu->arch.tsr &= ~spr_val; break; case SPRN_TCR: - vcpu->arch.tcr = vcpu->arch.gpr[rs]; + vcpu->arch.tcr = spr_val; kvmppc_emulate_dec(vcpu); break; @@ -124,64 +125,64 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) * loaded into the real SPRGs when resuming the * guest. */ case SPRN_SPRG4: - vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg4 = spr_val; break; case SPRN_SPRG5: - vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg5 = spr_val; break; case SPRN_SPRG6: - vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg6 = spr_val; break; case SPRN_SPRG7: - vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg7 = spr_val; break; case SPRN_IVPR: - vcpu->arch.ivpr = vcpu->arch.gpr[rs]; + vcpu->arch.ivpr = spr_val; break; case SPRN_IVOR0: - vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val; break; case SPRN_IVOR1: - vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = spr_val; break; case SPRN_IVOR2: - vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val; break; case SPRN_IVOR3: - vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val; break; case SPRN_IVOR4: - vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = spr_val; break; case SPRN_IVOR5: - vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = spr_val; break; case SPRN_IVOR6: - vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = spr_val; break; case SPRN_IVOR7: - vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = spr_val; break; case SPRN_IVOR8: - vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val; break; case SPRN_IVOR9: - vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val; break; case SPRN_IVOR10: - vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = spr_val; break; case SPRN_IVOR11: - vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = spr_val; break; case SPRN_IVOR12: - vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = spr_val; break; case SPRN_IVOR13: - vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = spr_val; break; case SPRN_IVOR14: - vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = spr_val; break; case SPRN_IVOR15: - vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val; break; default: @@ -197,65 +198,65 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_IVPR: - vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break; case SPRN_DEAR: - vcpu->arch.gpr[rt] = vcpu->arch.dear; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break; case SPRN_ESR: - vcpu->arch.gpr[rt] = vcpu->arch.esr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break; case SPRN_DBCR0: - vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break; case SPRN_DBCR1: - vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break; case SPRN_DBSR: - vcpu->arch.gpr[rt] = vcpu->arch.dbsr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break; case SPRN_IVOR0: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]); break; case SPRN_IVOR1: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]); break; case SPRN_IVOR2: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]); break; case SPRN_IVOR3: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]); break; case SPRN_IVOR4: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]); break; case SPRN_IVOR5: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]); break; case SPRN_IVOR6: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]); break; case SPRN_IVOR7: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]); break; case SPRN_IVOR8: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]); break; case SPRN_IVOR9: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]); break; case SPRN_IVOR10: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]); break; case SPRN_IVOR11: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]); break; case SPRN_IVOR12: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]); break; case SPRN_IVOR13: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]); break; case SPRN_IVOR14: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]); break; case SPRN_IVOR15: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]); break; default: diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 64949ee..efa1198 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -60,6 +60,12 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) kvmppc_e500_tlb_setup(vcpu_e500); + /* Registers init */ + vcpu->arch.pvr = mfspr(SPRN_PVR); + + /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ + vcpu->vcpu_id = 0; + return 0; } diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index be95b8d..8e3edfb 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -74,54 +74,59 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_PID: vcpu_e500->pid[0] = vcpu->arch.shadow_pid = - vcpu->arch.pid = vcpu->arch.gpr[rs]; + vcpu->arch.pid = spr_val; break; case SPRN_PID1: - vcpu_e500->pid[1] = vcpu->arch.gpr[rs]; break; + vcpu_e500->pid[1] = spr_val; break; case SPRN_PID2: - vcpu_e500->pid[2] = vcpu->arch.gpr[rs]; break; + vcpu_e500->pid[2] = spr_val; break; case SPRN_MAS0: - vcpu_e500->mas0 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas0 = spr_val; break; case SPRN_MAS1: - vcpu_e500->mas1 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas1 = spr_val; break; case SPRN_MAS2: - vcpu_e500->mas2 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas2 = spr_val; break; case SPRN_MAS3: - vcpu_e500->mas3 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas3 = spr_val; break; case SPRN_MAS4: - vcpu_e500->mas4 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas4 = spr_val; break; case SPRN_MAS6: - vcpu_e500->mas6 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas6 = spr_val; break; case SPRN_MAS7: - vcpu_e500->mas7 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas7 = spr_val; break; + case SPRN_L1CSR0: + vcpu_e500->l1csr0 = spr_val; + vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC); + break; case SPRN_L1CSR1: - vcpu_e500->l1csr1 = vcpu->arch.gpr[rs]; break; + vcpu_e500->l1csr1 = spr_val; break; case SPRN_HID0: - vcpu_e500->hid0 = vcpu->arch.gpr[rs]; break; + vcpu_e500->hid0 = spr_val; break; case SPRN_HID1: - vcpu_e500->hid1 = vcpu->arch.gpr[rs]; break; + vcpu_e500->hid1 = spr_val; break; case SPRN_MMUCSR0: emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500, - vcpu->arch.gpr[rs]); + spr_val); break; /* extra exceptions */ case SPRN_IVOR32: - vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = spr_val; break; case SPRN_IVOR33: - vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = spr_val; break; case SPRN_IVOR34: - vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = spr_val; break; case SPRN_IVOR35: - vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val; break; default: @@ -138,63 +143,57 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_PID: - vcpu->arch.gpr[rt] = vcpu_e500->pid[0]; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[0]); break; case SPRN_PID1: - vcpu->arch.gpr[rt] = vcpu_e500->pid[1]; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[1]); break; case SPRN_PID2: - vcpu->arch.gpr[rt] = vcpu_e500->pid[2]; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break; case SPRN_MAS0: - vcpu->arch.gpr[rt] = vcpu_e500->mas0; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break; case SPRN_MAS1: - vcpu->arch.gpr[rt] = vcpu_e500->mas1; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break; case SPRN_MAS2: - vcpu->arch.gpr[rt] = vcpu_e500->mas2; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break; case SPRN_MAS3: - vcpu->arch.gpr[rt] = vcpu_e500->mas3; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break; case SPRN_MAS4: - vcpu->arch.gpr[rt] = vcpu_e500->mas4; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break; case SPRN_MAS6: - vcpu->arch.gpr[rt] = vcpu_e500->mas6; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break; case SPRN_MAS7: - vcpu->arch.gpr[rt] = vcpu_e500->mas7; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break; case SPRN_TLB0CFG: - vcpu->arch.gpr[rt] = mfspr(SPRN_TLB0CFG); - vcpu->arch.gpr[rt] &= ~0xfffUL; - vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[0]; - break; - + kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break; case SPRN_TLB1CFG: - vcpu->arch.gpr[rt] = mfspr(SPRN_TLB1CFG); - vcpu->arch.gpr[rt] &= ~0xfffUL; - vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[1]; - break; - + kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb1cfg); break; + case SPRN_L1CSR0: + kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr0); break; case SPRN_L1CSR1: - vcpu->arch.gpr[rt] = vcpu_e500->l1csr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr1); break; case SPRN_HID0: - vcpu->arch.gpr[rt] = vcpu_e500->hid0; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid0); break; case SPRN_HID1: - vcpu->arch.gpr[rt] = vcpu_e500->hid1; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid1); break; case SPRN_MMUCSR0: - vcpu->arch.gpr[rt] = 0; break; + kvmppc_set_gpr(vcpu, rt, 0); break; case SPRN_MMUCFG: - vcpu->arch.gpr[rt] = mfspr(SPRN_MMUCFG); break; + kvmppc_set_gpr(vcpu, rt, mfspr(SPRN_MMUCFG)); break; /* extra exceptions */ case SPRN_IVOR32: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]); break; case SPRN_IVOR33: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]); break; case SPRN_IVOR34: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]); break; case SPRN_IVOR35: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]); break; default: emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index fb1e1dc..0d772e6 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -417,7 +417,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) int esel, tlbsel; gva_t ea; - ea = ((ra) ? vcpu->arch.gpr[ra] : 0) + vcpu->arch.gpr[rb]; + ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb); ia = (ea >> 2) & 0x1; @@ -470,7 +470,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) struct tlbe *gtlbe = NULL; gva_t ea; - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); for (tlbsel = 0; tlbsel < 2; tlbsel++) { esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); @@ -728,6 +728,12 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) if (vcpu_e500->shadow_pages[1] == NULL) goto err_out_page0; + /* Init TLB configuration register */ + vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; + vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0]; + vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; + vcpu_e500->tlb1cfg |= vcpu_e500->guest_tlb_size[1]; + return 0; err_out_page0: diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 4a9ac66..cb72a65 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -83,6 +83,9 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) pr_debug("mtDEC: %x\n", vcpu->arch.dec); #ifdef CONFIG_PPC64 + /* mtdec lowers the interrupt line when positive. */ + kvmppc_core_dequeue_dec(vcpu); + /* POWER4+ triggers a dec interrupt if the value is < 0 */ if (vcpu->arch.dec & 0x80000000) { hrtimer_try_to_cancel(&vcpu->arch.dec_timer); @@ -140,14 +143,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) pr_debug(KERN_INFO "Emulating opcode %d / %d\n", get_op(inst), get_xop(inst)); + /* Try again next time */ + if (inst == KVM_INST_FETCH_FAILED) + return EMULATE_DONE; + switch (get_op(inst)) { case OP_TRAP: #ifdef CONFIG_PPC64 case OP_TRAP_64: + kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); #else - vcpu->arch.esr |= ESR_PTR; + kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR); #endif - kvmppc_core_queue_program(vcpu); advance = 0; break; @@ -167,14 +174,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_31_XOP_STWX: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 4, 1); break; case OP_31_XOP_STBX: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 1, 1); break; @@ -183,14 +190,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rb = get_rb(inst); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 1, 1); - vcpu->arch.gpr[rs] = ea; + kvmppc_set_gpr(vcpu, rs, ea); break; case OP_31_XOP_LHZX: @@ -203,12 +210,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rb = get_rb(inst); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); - vcpu->arch.gpr[ra] = ea; + kvmppc_set_gpr(vcpu, ra, ea); break; case OP_31_XOP_MFSPR: @@ -217,47 +224,49 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) switch (sprn) { case SPRN_SRR0: - vcpu->arch.gpr[rt] = vcpu->arch.srr0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr0); break; case SPRN_SRR1: - vcpu->arch.gpr[rt] = vcpu->arch.srr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr1); break; case SPRN_PVR: - vcpu->arch.gpr[rt] = vcpu->arch.pvr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break; case SPRN_PIR: - vcpu->arch.gpr[rt] = vcpu->vcpu_id; break; + kvmppc_set_gpr(vcpu, rt, vcpu->vcpu_id); break; case SPRN_MSSSR0: - vcpu->arch.gpr[rt] = 0; break; + kvmppc_set_gpr(vcpu, rt, 0); break; /* Note: mftb and TBRL/TBWL are user-accessible, so * the guest can always access the real TB anyways. * In fact, we probably will never see these traps. */ case SPRN_TBWL: - vcpu->arch.gpr[rt] = get_tb() >> 32; break; + kvmppc_set_gpr(vcpu, rt, get_tb() >> 32); break; case SPRN_TBWU: - vcpu->arch.gpr[rt] = get_tb(); break; + kvmppc_set_gpr(vcpu, rt, get_tb()); break; case SPRN_SPRG0: - vcpu->arch.gpr[rt] = vcpu->arch.sprg0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg0); break; case SPRN_SPRG1: - vcpu->arch.gpr[rt] = vcpu->arch.sprg1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg1); break; case SPRN_SPRG2: - vcpu->arch.gpr[rt] = vcpu->arch.sprg2; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg2); break; case SPRN_SPRG3: - vcpu->arch.gpr[rt] = vcpu->arch.sprg3; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg3); break; /* Note: SPRG4-7 are user-readable, so we don't get * a trap. */ case SPRN_DEC: { u64 jd = get_tb() - vcpu->arch.dec_jiffies; - vcpu->arch.gpr[rt] = vcpu->arch.dec - jd; - pr_debug(KERN_INFO "mfDEC: %x - %llx = %lx\n", vcpu->arch.dec, jd, vcpu->arch.gpr[rt]); + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dec - jd); + pr_debug(KERN_INFO "mfDEC: %x - %llx = %lx\n", + vcpu->arch.dec, jd, + kvmppc_get_gpr(vcpu, rt)); break; } default: emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt); if (emulated == EMULATE_FAIL) { printk("mfspr: unknown spr %x\n", sprn); - vcpu->arch.gpr[rt] = 0; + kvmppc_set_gpr(vcpu, rt, 0); } break; } @@ -269,7 +278,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rb = get_rb(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 2, 1); break; @@ -278,14 +287,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rb = get_rb(inst); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 2, 1); - vcpu->arch.gpr[ra] = ea; + kvmppc_set_gpr(vcpu, ra, ea); break; case OP_31_XOP_MTSPR: @@ -293,9 +302,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rs = get_rs(inst); switch (sprn) { case SPRN_SRR0: - vcpu->arch.srr0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.srr0 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SRR1: - vcpu->arch.srr1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.srr1 = kvmppc_get_gpr(vcpu, rs); break; /* XXX We need to context-switch the timebase for * watchdog and FIT. */ @@ -305,18 +314,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case SPRN_MSSSR0: break; case SPRN_DEC: - vcpu->arch.dec = vcpu->arch.gpr[rs]; + vcpu->arch.dec = kvmppc_get_gpr(vcpu, rs); kvmppc_emulate_dec(vcpu); break; case SPRN_SPRG0: - vcpu->arch.sprg0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg0 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SPRG1: - vcpu->arch.sprg1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg1 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SPRG2: - vcpu->arch.sprg2 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg2 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SPRG3: - vcpu->arch.sprg3 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg3 = kvmppc_get_gpr(vcpu, rs); break; default: emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs); @@ -348,7 +357,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rb = get_rb(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 4, 0); break; @@ -363,7 +372,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rb = get_rb(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 2, 0); break; @@ -382,7 +391,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_LBZ: @@ -394,35 +403,39 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_STW: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 4, 1); break; case OP_STWU: ra = get_ra(inst); rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 4, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_STB: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_STBU: ra = get_ra(inst); rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 1, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_LHZ: @@ -434,21 +447,23 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_STH: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_STHU: ra = get_ra(inst); rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 2, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; default: @@ -461,6 +476,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) advance = 0; printk(KERN_ERR "Couldn't emulate instruction 0x%08x " "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst)); + kvmppc_core_queue_program(vcpu, 0); } } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index f06cf93..51aedd7 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -137,6 +137,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) { kvmppc_free_vcpus(kvm); kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); kfree(kvm); } @@ -165,14 +166,24 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc) { return 0; } +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + return; +} + + void kvm_arch_flush_shadow(struct kvm *kvm) { } @@ -260,34 +271,35 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { - ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr]; - *gpr = run->dcr.data; + kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, run->dcr.data); } static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { - ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr]; + ulong gpr; - if (run->mmio.len > sizeof(*gpr)) { + if (run->mmio.len > sizeof(gpr)) { printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len); return; } if (vcpu->arch.mmio_is_bigendian) { switch (run->mmio.len) { - case 4: *gpr = *(u32 *)run->mmio.data; break; - case 2: *gpr = *(u16 *)run->mmio.data; break; - case 1: *gpr = *(u8 *)run->mmio.data; break; + case 4: gpr = *(u32 *)run->mmio.data; break; + case 2: gpr = *(u16 *)run->mmio.data; break; + case 1: gpr = *(u8 *)run->mmio.data; break; } } else { /* Convert BE data from userland back to LE. */ switch (run->mmio.len) { - case 4: *gpr = ld_le32((u32 *)run->mmio.data); break; - case 2: *gpr = ld_le16((u16 *)run->mmio.data); break; - case 1: *gpr = *(u8 *)run->mmio.data; break; + case 4: gpr = ld_le32((u32 *)run->mmio.data); break; + case 2: gpr = ld_le16((u16 *)run->mmio.data); break; + case 1: gpr = *(u8 *)run->mmio.data; break; } } + + kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); } int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b037d95..64c0022 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -451,7 +451,7 @@ static int __cpuinit numa_setup_cpu(unsigned long lcpu) nid = of_node_to_nid_single(cpu); if (nid < 0 || !node_online(nid)) - nid = any_online_node(NODE_MASK_ALL); + nid = first_online_node; out: map_cpu_to_node(lcpu, nid); @@ -1114,7 +1114,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr) int nid, found = 0; if (!numa_enabled || (min_common_depth < 0)) - return any_online_node(NODE_MASK_ALL); + return first_online_node; memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (memory) { @@ -1125,7 +1125,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr) } if (nid < 0 || !node_online(nid)) - nid = any_online_node(NODE_MASK_ALL); + nid = first_online_node; if (NODE_DATA(nid)->node_spanned_pages) return nid; diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 341aff2..cd128b0 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -288,46 +288,30 @@ static int hypfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = HYPFS_MAGIC; sb->s_op = &hypfs_s_ops; - if (hypfs_parse_options(data, sb)) { - rc = -EINVAL; - goto err_alloc; - } + if (hypfs_parse_options(data, sb)) + return -EINVAL; root_inode = hypfs_make_inode(sb, S_IFDIR | 0755); - if (!root_inode) { - rc = -ENOMEM; - goto err_alloc; - } + if (!root_inode) + return -ENOMEM; root_inode->i_op = &simple_dir_inode_operations; root_inode->i_fop = &simple_dir_operations; - root_dentry = d_alloc_root(root_inode); + sb->s_root = root_dentry = d_alloc_root(root_inode); if (!root_dentry) { iput(root_inode); - rc = -ENOMEM; - goto err_alloc; + return -ENOMEM; } if (MACHINE_IS_VM) rc = hypfs_vm_create_files(sb, root_dentry); else rc = hypfs_diag_create_files(sb, root_dentry); if (rc) - goto err_tree; + return rc; sbi->update_file = hypfs_create_update_file(sb, root_dentry); - if (IS_ERR(sbi->update_file)) { - rc = PTR_ERR(sbi->update_file); - goto err_tree; - } + if (IS_ERR(sbi->update_file)) + return PTR_ERR(sbi->update_file); hypfs_update_update(sb); - sb->s_root = root_dentry; pr_info("Hypervisor filesystem mounted\n"); return 0; - -err_tree: - hypfs_delete_tree(root_dentry); - d_genocide(root_dentry); - dput(root_dentry); -err_alloc: - kfree(sbi); - return rc; } static int hypfs_get_super(struct file_system_type *fst, int flags, @@ -340,12 +324,12 @@ static void hypfs_kill_super(struct super_block *sb) { struct hypfs_sb_info *sb_info = sb->s_fs_info; - if (sb->s_root) { + if (sb->s_root) hypfs_delete_tree(sb->s_root); + if (sb_info->update_file) hypfs_remove(sb_info->update_file); - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; - } + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; kill_litter_super(sb); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3fa0a10..4929286 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -242,6 +242,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_free_physmem(kvm); free_page((unsigned long)(kvm->arch.sca)); debug_unregister(kvm->arch.dbf); + cleanup_srcu_struct(&kvm->srcu); kfree(kvm); } @@ -690,14 +691,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } /* Section: memory related */ -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc) { - int i; - struct kvm_vcpu *vcpu; - /* A few sanity checks. We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a page boundary in userland and which has to end at a page boundary. @@ -720,14 +719,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (!user_alloc) return -EINVAL; + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + int i; + struct kvm_vcpu *vcpu; + /* request update of sie control block for all available vcpus */ kvm_for_each_vcpu(i, vcpu, kvm) { if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) continue; kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP); } - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 06cce82..60f09ab 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -67,10 +67,14 @@ static inline long kvm_s390_vcpu_get_memsize(struct kvm_vcpu *vcpu) static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu) { + int idx; struct kvm_memory_slot *mem; + struct kvm_memslots *memslots; - down_read(&vcpu->kvm->slots_lock); - mem = &vcpu->kvm->memslots[0]; + idx = srcu_read_lock(&vcpu->kvm->srcu); + memslots = rcu_dereference(vcpu->kvm->memslots); + + mem = &memslots->memslots[0]; vcpu->arch.sie_block->gmsor = mem->userspace_addr; vcpu->arch.sie_block->gmslm = @@ -78,7 +82,7 @@ static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu) (mem->npages << PAGE_SHIFT) + VIRTIODESCSPACE - 1ul; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } /* implemented in priv.c */ diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c index be300aa..7da0fc9 100644 --- a/arch/sh/boards/mach-migor/setup.c +++ b/arch/sh/boards/mach-migor/setup.c @@ -419,6 +419,9 @@ static struct i2c_board_info migor_i2c_devices[] = { I2C_BOARD_INFO("migor_ts", 0x51), .irq = 38, /* IRQ6 */ }, + { + I2C_BOARD_INFO("wm8978", 0x1a), + }, }; static struct i2c_board_info migor_i2c_camera[] = { @@ -619,6 +622,19 @@ static int __init migor_devices_setup(void) platform_resource_setup_memory(&migor_ceu_device, "ceu", 4 << 20); + /* SIU: Port B */ + gpio_request(GPIO_FN_SIUBOLR, NULL); + gpio_request(GPIO_FN_SIUBOBT, NULL); + gpio_request(GPIO_FN_SIUBISLD, NULL); + gpio_request(GPIO_FN_SIUBOSLD, NULL); + gpio_request(GPIO_FN_SIUMCKB, NULL); + + /* + * The original driver sets SIUB OLR/OBT, ILR/IBT, and SIUA OLR/OBT to + * output. Need only SIUB, set to output for master mode (table 34.2) + */ + __raw_writew(__raw_readw(PORT_MSELCRA) | 1, PORT_MSELCRA); + i2c_register_board_info(0, migor_i2c_devices, ARRAY_SIZE(migor_i2c_devices)); diff --git a/arch/sh/boot/compressed/cache.c b/arch/sh/boot/compressed/cache.c index e27fc74..d0b77b6 100644 --- a/arch/sh/boot/compressed/cache.c +++ b/arch/sh/boot/compressed/cache.c @@ -5,7 +5,7 @@ int cache_control(unsigned int command) for (i = 0; i < (32 * 1024); i += 32) { (void)*p; - p += (32 / sizeof (int)); + p += (32 / sizeof(int)); } return 0; diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index da3ebec..1f4e562 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -86,8 +86,8 @@ extern void copy_from_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len); -#define flush_cache_vmap(start, end) flush_cache_all() -#define flush_cache_vunmap(start, end) flush_cache_all() +#define flush_cache_vmap(start, end) local_flush_cache_all(NULL) +#define flush_cache_vunmap(start, end) local_flush_cache_all(NULL) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/sh/include/asm/dma-register.h b/arch/sh/include/asm/dma-register.h new file mode 100644 index 0000000..51cd78f --- /dev/null +++ b/arch/sh/include/asm/dma-register.h @@ -0,0 +1,51 @@ +/* + * Common header for the legacy SH DMA driver and the new dmaengine driver + * + * extracted from arch/sh/include/asm/dma-sh.h: + * + * Copyright (C) 2000 Takashi YOSHII + * Copyright (C) 2003 Paul Mundt + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ +#ifndef DMA_REGISTER_H +#define DMA_REGISTER_H + +/* DMA register */ +#define SAR 0x00 +#define DAR 0x04 +#define TCR 0x08 +#define CHCR 0x0C +#define DMAOR 0x40 + +/* DMAOR definitions */ +#define DMAOR_AE 0x00000004 +#define DMAOR_NMIF 0x00000002 +#define DMAOR_DME 0x00000001 + +/* Definitions for the SuperH DMAC */ +#define REQ_L 0x00000000 +#define REQ_E 0x00080000 +#define RACK_H 0x00000000 +#define RACK_L 0x00040000 +#define ACK_R 0x00000000 +#define ACK_W 0x00020000 +#define ACK_H 0x00000000 +#define ACK_L 0x00010000 +#define DM_INC 0x00004000 +#define DM_DEC 0x00008000 +#define DM_FIX 0x0000c000 +#define SM_INC 0x00001000 +#define SM_DEC 0x00002000 +#define SM_FIX 0x00003000 +#define RS_IN 0x00000200 +#define RS_OUT 0x00000300 +#define TS_BLK 0x00000040 +#define TM_BUR 0x00000020 +#define CHCR_DE 0x00000001 +#define CHCR_TE 0x00000002 +#define CHCR_IE 0x00000004 + +#endif diff --git a/arch/sh/include/asm/dma-sh.h b/arch/sh/include/asm/dma-sh.h index e934a2e..f3acb8e 100644 --- a/arch/sh/include/asm/dma-sh.h +++ b/arch/sh/include/asm/dma-sh.h @@ -11,7 +11,8 @@ #ifndef __DMA_SH_H #define __DMA_SH_H -#include <asm/dma.h> +#include <asm/dma-register.h> +#include <cpu/dma-register.h> #include <cpu/dma.h> /* DMAOR contorl: The DMAOR access size is different by CPU.*/ @@ -53,34 +54,6 @@ static int dmte_irq_map[] __maybe_unused = { #endif }; -/* Definitions for the SuperH DMAC */ -#define REQ_L 0x00000000 -#define REQ_E 0x00080000 -#define RACK_H 0x00000000 -#define RACK_L 0x00040000 -#define ACK_R 0x00000000 -#define ACK_W 0x00020000 -#define ACK_H 0x00000000 -#define ACK_L 0x00010000 -#define DM_INC 0x00004000 -#define DM_DEC 0x00008000 -#define DM_FIX 0x0000c000 -#define SM_INC 0x00001000 -#define SM_DEC 0x00002000 -#define SM_FIX 0x00003000 -#define RS_IN 0x00000200 -#define RS_OUT 0x00000300 -#define TS_BLK 0x00000040 -#define TM_BUR 0x00000020 -#define CHCR_DE 0x00000001 -#define CHCR_TE 0x00000002 -#define CHCR_IE 0x00000004 - -/* DMAOR definitions */ -#define DMAOR_AE 0x00000004 -#define DMAOR_NMIF 0x00000002 -#define DMAOR_DME 0x00000001 - /* * Define the default configuration for dual address memory-memory transfer. * The 0x400 value represents auto-request, external->external. @@ -111,61 +84,4 @@ static u32 dma_base_addr[] __maybe_unused = { #endif }; -/* DMA register */ -#define SAR 0x00 -#define DAR 0x04 -#define TCR 0x08 -#define CHCR 0x0C -#define DMAOR 0x40 - -/* - * for dma engine - * - * SuperH DMA mode - */ -#define SHDMA_MIX_IRQ (1 << 1) -#define SHDMA_DMAOR1 (1 << 2) -#define SHDMA_DMAE1 (1 << 3) - -enum sh_dmae_slave_chan_id { - SHDMA_SLAVE_SCIF0_TX, - SHDMA_SLAVE_SCIF0_RX, - SHDMA_SLAVE_SCIF1_TX, - SHDMA_SLAVE_SCIF1_RX, - SHDMA_SLAVE_SCIF2_TX, - SHDMA_SLAVE_SCIF2_RX, - SHDMA_SLAVE_SCIF3_TX, - SHDMA_SLAVE_SCIF3_RX, - SHDMA_SLAVE_SCIF4_TX, - SHDMA_SLAVE_SCIF4_RX, - SHDMA_SLAVE_SCIF5_TX, - SHDMA_SLAVE_SCIF5_RX, - SHDMA_SLAVE_SIUA_TX, - SHDMA_SLAVE_SIUA_RX, - SHDMA_SLAVE_SIUB_TX, - SHDMA_SLAVE_SIUB_RX, - SHDMA_SLAVE_NUMBER, /* Must stay last */ -}; - -struct sh_dmae_slave_config { - enum sh_dmae_slave_chan_id slave_id; - dma_addr_t addr; - u32 chcr; - char mid_rid; -}; - -struct sh_dmae_pdata { - unsigned int mode; - struct sh_dmae_slave_config *config; - int config_num; -}; - -struct device; - -struct sh_dmae_slave { - enum sh_dmae_slave_chan_id slave_id; /* Set by the platform */ - struct device *dma_dev; /* Set by the platform */ - struct sh_dmae_slave_config *config; /* Set by the driver */ -}; - #endif /* __DMA_SH_H */ diff --git a/arch/sh/include/asm/dmaengine.h b/arch/sh/include/asm/dmaengine.h new file mode 100644 index 0000000..bf2f30cf --- /dev/null +++ b/arch/sh/include/asm/dmaengine.h @@ -0,0 +1,93 @@ +/* + * Header for the new SH dmaengine driver + * + * Copyright (C) 2010 Guennadi Liakhovetski <g.liakhovetski@gmx.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef ASM_DMAENGINE_H +#define ASM_DMAENGINE_H + +#include <linux/dmaengine.h> +#include <linux/list.h> + +#include <asm/dma-register.h> + +#define SH_DMAC_MAX_CHANNELS 6 + +enum sh_dmae_slave_chan_id { + SHDMA_SLAVE_SCIF0_TX, + SHDMA_SLAVE_SCIF0_RX, + SHDMA_SLAVE_SCIF1_TX, + SHDMA_SLAVE_SCIF1_RX, + SHDMA_SLAVE_SCIF2_TX, + SHDMA_SLAVE_SCIF2_RX, + SHDMA_SLAVE_SCIF3_TX, + SHDMA_SLAVE_SCIF3_RX, + SHDMA_SLAVE_SCIF4_TX, + SHDMA_SLAVE_SCIF4_RX, + SHDMA_SLAVE_SCIF5_TX, + SHDMA_SLAVE_SCIF5_RX, + SHDMA_SLAVE_SIUA_TX, + SHDMA_SLAVE_SIUA_RX, + SHDMA_SLAVE_SIUB_TX, + SHDMA_SLAVE_SIUB_RX, + SHDMA_SLAVE_NUMBER, /* Must stay last */ +}; + +struct sh_dmae_slave_config { + enum sh_dmae_slave_chan_id slave_id; + dma_addr_t addr; + u32 chcr; + char mid_rid; +}; + +struct sh_dmae_channel { + unsigned int offset; + unsigned int dmars; + unsigned int dmars_bit; +}; + +struct sh_dmae_pdata { + struct sh_dmae_slave_config *slave; + int slave_num; + struct sh_dmae_channel *channel; + int channel_num; + unsigned int ts_low_shift; + unsigned int ts_low_mask; + unsigned int ts_high_shift; + unsigned int ts_high_mask; + unsigned int *ts_shift; + int ts_shift_num; + u16 dmaor_init; +}; + +struct device; + +/* Used by slave DMA clients to request DMA to/from a specific peripheral */ +struct sh_dmae_slave { + enum sh_dmae_slave_chan_id slave_id; /* Set by the platform */ + struct device *dma_dev; /* Set by the platform */ + struct sh_dmae_slave_config *config; /* Set by the driver */ +}; + +struct sh_dmae_regs { + u32 sar; /* SAR / source address */ + u32 dar; /* DAR / destination address */ + u32 tcr; /* TCR / transfer count */ +}; + +struct sh_desc { + struct sh_dmae_regs hw; + struct list_head node; + struct dma_async_tx_descriptor async_tx; + enum dma_data_direction direction; + dma_cookie_t cookie; + size_t partial; + int chunks; + int mark; +}; + +#endif diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index 7dab7b23..f689554 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -291,21 +291,21 @@ unsigned long long poke_real_address_q(unsigned long long addr, * doesn't exist, so everything must go through page tables. */ #ifdef CONFIG_MMU -void __iomem *__ioremap_caller(unsigned long offset, unsigned long size, +void __iomem *__ioremap_caller(phys_addr_t offset, unsigned long size, pgprot_t prot, void *caller); void __iounmap(void __iomem *addr); static inline void __iomem * -__ioremap(unsigned long offset, unsigned long size, pgprot_t prot) +__ioremap(phys_addr_t offset, unsigned long size, pgprot_t prot) { return __ioremap_caller(offset, size, prot, __builtin_return_address(0)); } static inline void __iomem * -__ioremap_29bit(unsigned long offset, unsigned long size, pgprot_t prot) +__ioremap_29bit(phys_addr_t offset, unsigned long size, pgprot_t prot) { #ifdef CONFIG_29BIT - unsigned long last_addr = offset + size - 1; + phys_addr_t last_addr = offset + size - 1; /* * For P1 and P2 space this is trivial, as everything is already @@ -329,7 +329,7 @@ __ioremap_29bit(unsigned long offset, unsigned long size, pgprot_t prot) } static inline void __iomem * -__ioremap_mode(unsigned long offset, unsigned long size, pgprot_t prot) +__ioremap_mode(phys_addr_t offset, unsigned long size, pgprot_t prot) { void __iomem *ret; @@ -349,35 +349,32 @@ __ioremap_mode(unsigned long offset, unsigned long size, pgprot_t prot) #define __iounmap(addr) do { } while (0) #endif /* CONFIG_MMU */ -static inline void __iomem * -ioremap(unsigned long offset, unsigned long size) +static inline void __iomem *ioremap(phys_addr_t offset, unsigned long size) { return __ioremap_mode(offset, size, PAGE_KERNEL_NOCACHE); } static inline void __iomem * -ioremap_cache(unsigned long offset, unsigned long size) +ioremap_cache(phys_addr_t offset, unsigned long size) { return __ioremap_mode(offset, size, PAGE_KERNEL); } #ifdef CONFIG_HAVE_IOREMAP_PROT static inline void __iomem * -ioremap_prot(resource_size_t offset, unsigned long size, unsigned long flags) +ioremap_prot(phys_addr_t offset, unsigned long size, unsigned long flags) { return __ioremap_mode(offset, size, __pgprot(flags)); } #endif #ifdef CONFIG_IOREMAP_FIXED -extern void __iomem *ioremap_fixed(resource_size_t, unsigned long, - unsigned long, pgprot_t); +extern void __iomem *ioremap_fixed(phys_addr_t, unsigned long, pgprot_t); extern int iounmap_fixed(void __iomem *); extern void ioremap_fixed_init(void); #else static inline void __iomem * -ioremap_fixed(resource_size_t phys_addr, unsigned long offset, - unsigned long size, pgprot_t prot) +ioremap_fixed(phys_addr_t phys_addr, unsigned long size, pgprot_t prot) { BUG(); return NULL; diff --git a/arch/sh/include/asm/mmu.h b/arch/sh/include/asm/mmu.h index 15a05b6..19fe845 100644 --- a/arch/sh/include/asm/mmu.h +++ b/arch/sh/include/asm/mmu.h @@ -55,19 +55,29 @@ typedef struct { #ifdef CONFIG_PMB /* arch/sh/mm/pmb.c */ -long pmb_remap(unsigned long virt, unsigned long phys, - unsigned long size, pgprot_t prot); -void pmb_unmap(unsigned long addr); -void pmb_init(void); bool __in_29bit_mode(void); + +void pmb_init(void); +int pmb_bolt_mapping(unsigned long virt, phys_addr_t phys, + unsigned long size, pgprot_t prot); +void __iomem *pmb_remap_caller(phys_addr_t phys, unsigned long size, + pgprot_t prot, void *caller); +int pmb_unmap(void __iomem *addr); + #else -static inline long pmb_remap(unsigned long virt, unsigned long phys, - unsigned long size, pgprot_t prot) + +static inline void __iomem * +pmb_remap_caller(phys_addr_t phys, unsigned long size, + pgprot_t prot, void *caller) +{ + return NULL; +} + +static inline int pmb_unmap(void __iomem *addr) { return -EINVAL; } -#define pmb_unmap(addr) do { } while (0) #define pmb_init(addr) do { } while (0) #ifdef CONFIG_29BIT @@ -77,6 +87,13 @@ static inline long pmb_remap(unsigned long virt, unsigned long phys, #endif #endif /* CONFIG_PMB */ + +static inline void __iomem * +pmb_remap(phys_addr_t phys, unsigned long size, pgprot_t prot) +{ + return pmb_remap_caller(phys, size, prot, __builtin_return_address(0)); +} + #endif /* __ASSEMBLY__ */ #endif /* __MMU_H */ diff --git a/arch/sh/include/asm/siu.h b/arch/sh/include/asm/siu.h index 57565a3..f1b1e69 100644 --- a/arch/sh/include/asm/siu.h +++ b/arch/sh/include/asm/siu.h @@ -11,7 +11,7 @@ #ifndef ASM_SIU_H #define ASM_SIU_H -#include <asm/dma-sh.h> +#include <asm/dmaengine.h> struct device; diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index 37cdadd..88e7340 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -35,7 +35,7 @@ #define pcibus_to_node(bus) ((void)(bus), -1) #define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \ - CPU_MASK_ALL_PTR : \ + cpu_all_mask : \ cpumask_of_node(pcibus_to_node(bus))) #endif diff --git a/arch/sh/include/cpu-sh3/cpu/dma-register.h b/arch/sh/include/cpu-sh3/cpu/dma-register.h new file mode 100644 index 0000000..2349e48 --- /dev/null +++ b/arch/sh/include/cpu-sh3/cpu/dma-register.h @@ -0,0 +1,41 @@ +/* + * SH3 CPU-specific DMA definitions, used by both DMA drivers + * + * Copyright (C) 2010 Guennadi Liakhovetski <g.liakhovetski@gmx.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef CPU_DMA_REGISTER_H +#define CPU_DMA_REGISTER_H + +#define CHCR_TS_LOW_MASK 0x18 +#define CHCR_TS_LOW_SHIFT 3 +#define CHCR_TS_HIGH_MASK 0 +#define CHCR_TS_HIGH_SHIFT 0 + +#define DMAOR_INIT DMAOR_DME + +/* + * The SuperH DMAC supports a number of transmit sizes, we list them here, + * with their respective values as they appear in the CHCR registers. + */ +enum { + XMIT_SZ_8BIT, + XMIT_SZ_16BIT, + XMIT_SZ_32BIT, + XMIT_SZ_128BIT, +}; + +/* log2(size / 8) - used to calculate number of transfers */ +#define TS_SHIFT { \ + [XMIT_SZ_8BIT] = 0, \ + [XMIT_SZ_16BIT] = 1, \ + [XMIT_SZ_32BIT] = 2, \ + [XMIT_SZ_128BIT] = 4, \ +} + +#define TS_INDEX2VAL(i) (((i) & 3) << CHCR_TS_LOW_SHIFT) + +#endif diff --git a/arch/sh/include/cpu-sh3/cpu/dma.h b/arch/sh/include/cpu-sh3/cpu/dma.h index 207811a..24e28b9 100644 --- a/arch/sh/include/cpu-sh3/cpu/dma.h +++ b/arch/sh/include/cpu-sh3/cpu/dma.h @@ -20,31 +20,4 @@ #define TS_32 0x00000010 #define TS_128 0x00000018 -#define CHCR_TS_LOW_MASK 0x18 -#define CHCR_TS_LOW_SHIFT 3 -#define CHCR_TS_HIGH_MASK 0 -#define CHCR_TS_HIGH_SHIFT 0 - -#define DMAOR_INIT DMAOR_DME - -/* - * The SuperH DMAC supports a number of transmit sizes, we list them here, - * with their respective values as they appear in the CHCR registers. - */ -enum { - XMIT_SZ_8BIT, - XMIT_SZ_16BIT, - XMIT_SZ_32BIT, - XMIT_SZ_128BIT, -}; - -#define TS_SHIFT { \ - [XMIT_SZ_8BIT] = 0, \ - [XMIT_SZ_16BIT] = 1, \ - [XMIT_SZ_32BIT] = 2, \ - [XMIT_SZ_128BIT] = 4, \ -} - -#define TS_INDEX2VAL(i) (((i) & 3) << CHCR_TS_LOW_SHIFT) - #endif /* __ASM_CPU_SH3_DMA_H */ diff --git a/arch/sh/include/cpu-sh4/cpu/dma-register.h b/arch/sh/include/cpu-sh4/cpu/dma-register.h new file mode 100644 index 0000000..55f9fec --- /dev/null +++ b/arch/sh/include/cpu-sh4/cpu/dma-register.h @@ -0,0 +1,112 @@ +/* + * SH4 CPU-specific DMA definitions, used by both DMA drivers + * + * Copyright (C) 2010 Guennadi Liakhovetski <g.liakhovetski@gmx.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef CPU_DMA_REGISTER_H +#define CPU_DMA_REGISTER_H + +/* SH7751/7760/7780 DMA IRQ sources */ + +#ifdef CONFIG_CPU_SH4A + +#define DMAOR_INIT DMAOR_DME + +#if defined(CONFIG_CPU_SUBTYPE_SH7343) || \ + defined(CONFIG_CPU_SUBTYPE_SH7730) +#define CHCR_TS_LOW_MASK 0x00000018 +#define CHCR_TS_LOW_SHIFT 3 +#define CHCR_TS_HIGH_MASK 0 +#define CHCR_TS_HIGH_SHIFT 0 +#elif defined(CONFIG_CPU_SUBTYPE_SH7722) || \ + defined(CONFIG_CPU_SUBTYPE_SH7724) +#define CHCR_TS_LOW_MASK 0x00000018 +#define CHCR_TS_LOW_SHIFT 3 +#define CHCR_TS_HIGH_MASK 0x00300000 +#define CHCR_TS_HIGH_SHIFT (20 - 2) /* 2 bits for shifted low TS */ +#elif defined(CONFIG_CPU_SUBTYPE_SH7763) || \ + defined(CONFIG_CPU_SUBTYPE_SH7764) +#define CHCR_TS_LOW_MASK 0x00000018 +#define CHCR_TS_LOW_SHIFT 3 +#define CHCR_TS_HIGH_MASK 0 +#define CHCR_TS_HIGH_SHIFT 0 +#elif defined(CONFIG_CPU_SUBTYPE_SH7723) +#define CHCR_TS_LOW_MASK 0x00000018 +#define CHCR_TS_LOW_SHIFT 3 +#define CHCR_TS_HIGH_MASK 0 +#define CHCR_TS_HIGH_SHIFT 0 +#elif defined(CONFIG_CPU_SUBTYPE_SH7780) +#define CHCR_TS_LOW_MASK 0x00000018 +#define CHCR_TS_LOW_SHIFT 3 +#define CHCR_TS_HIGH_MASK 0 +#define CHCR_TS_HIGH_SHIFT 0 +#else /* SH7785 */ +#define CHCR_TS_LOW_MASK 0x00000018 +#define CHCR_TS_LOW_SHIFT 3 +#define CHCR_TS_HIGH_MASK 0 +#define CHCR_TS_HIGH_SHIFT 0 +#endif + +/* Transmit sizes and respective CHCR register values */ +enum { + XMIT_SZ_8BIT = 0, + XMIT_SZ_16BIT = 1, + XMIT_SZ_32BIT = 2, + XMIT_SZ_64BIT = 7, + XMIT_SZ_128BIT = 3, + XMIT_SZ_256BIT = 4, + XMIT_SZ_128BIT_BLK = 0xb, + XMIT_SZ_256BIT_BLK = 0xc, +}; + +/* log2(size / 8) - used to calculate number of transfers */ +#define TS_SHIFT { \ + [XMIT_SZ_8BIT] = 0, \ + [XMIT_SZ_16BIT] = 1, \ + [XMIT_SZ_32BIT] = 2, \ + [XMIT_SZ_64BIT] = 3, \ + [XMIT_SZ_128BIT] = 4, \ + [XMIT_SZ_256BIT] = 5, \ + [XMIT_SZ_128BIT_BLK] = 4, \ + [XMIT_SZ_256BIT_BLK] = 5, \ +} + +#define TS_INDEX2VAL(i) ((((i) & 3) << CHCR_TS_LOW_SHIFT) | \ + ((((i) >> 2) & 3) << CHCR_TS_HIGH_SHIFT)) + +#else /* CONFIG_CPU_SH4A */ + +#define DMAOR_INIT (0x8000 | DMAOR_DME) + +#define CHCR_TS_LOW_MASK 0x70 +#define CHCR_TS_LOW_SHIFT 4 +#define CHCR_TS_HIGH_MASK 0 +#define CHCR_TS_HIGH_SHIFT 0 + +/* Transmit sizes and respective CHCR register values */ +enum { + XMIT_SZ_8BIT = 1, + XMIT_SZ_16BIT = 2, + XMIT_SZ_32BIT = 3, + XMIT_SZ_64BIT = 0, + XMIT_SZ_256BIT = 4, +}; + +/* log2(size / 8) - used to calculate number of transfers */ +#define TS_SHIFT { \ + [XMIT_SZ_8BIT] = 0, \ + [XMIT_SZ_16BIT] = 1, \ + [XMIT_SZ_32BIT] = 2, \ + [XMIT_SZ_64BIT] = 3, \ + [XMIT_SZ_256BIT] = 5, \ +} + +#define TS_INDEX2VAL(i) (((i) & 7) << CHCR_TS_LOW_SHIFT) + +#endif /* CONFIG_CPU_SH4A */ + +#endif diff --git a/arch/sh/include/cpu-sh4/cpu/dma-sh4a.h b/arch/sh/include/cpu-sh4/cpu/dma-sh4a.h index e734ea4..9647e68 100644 --- a/arch/sh/include/cpu-sh4/cpu/dma-sh4a.h +++ b/arch/sh/include/cpu-sh4/cpu/dma-sh4a.h @@ -8,20 +8,12 @@ #define DMAE0_IRQ 78 /* DMA Error IRQ*/ #define SH_DMAC_BASE0 0xFE008020 #define SH_DMARS_BASE0 0xFE009000 -#define CHCR_TS_LOW_MASK 0x00000018 -#define CHCR_TS_LOW_SHIFT 3 -#define CHCR_TS_HIGH_MASK 0 -#define CHCR_TS_HIGH_SHIFT 0 #elif defined(CONFIG_CPU_SUBTYPE_SH7722) #define DMTE0_IRQ 48 #define DMTE4_IRQ 76 #define DMAE0_IRQ 78 /* DMA Error IRQ*/ #define SH_DMAC_BASE0 0xFE008020 #define SH_DMARS_BASE0 0xFE009000 -#define CHCR_TS_LOW_MASK 0x00000018 -#define CHCR_TS_LOW_SHIFT 3 -#define CHCR_TS_HIGH_MASK 0x00300000 -#define CHCR_TS_HIGH_SHIFT 20 #elif defined(CONFIG_CPU_SUBTYPE_SH7763) || \ defined(CONFIG_CPU_SUBTYPE_SH7764) #define DMTE0_IRQ 34 @@ -29,10 +21,6 @@ #define DMAE0_IRQ 38 #define SH_DMAC_BASE0 0xFF608020 #define SH_DMARS_BASE0 0xFF609000 -#define CHCR_TS_LOW_MASK 0x00000018 -#define CHCR_TS_LOW_SHIFT 3 -#define CHCR_TS_HIGH_MASK 0 -#define CHCR_TS_HIGH_SHIFT 0 #elif defined(CONFIG_CPU_SUBTYPE_SH7723) #define DMTE0_IRQ 48 /* DMAC0A*/ #define DMTE4_IRQ 76 /* DMAC0B */ @@ -46,10 +34,6 @@ #define SH_DMAC_BASE0 0xFE008020 #define SH_DMAC_BASE1 0xFDC08020 #define SH_DMARS_BASE0 0xFDC09000 -#define CHCR_TS_LOW_MASK 0x00000018 -#define CHCR_TS_LOW_SHIFT 3 -#define CHCR_TS_HIGH_MASK 0 -#define CHCR_TS_HIGH_SHIFT 0 #elif defined(CONFIG_CPU_SUBTYPE_SH7724) #define DMTE0_IRQ 48 /* DMAC0A*/ #define DMTE4_IRQ 76 /* DMAC0B */ @@ -64,10 +48,6 @@ #define SH_DMAC_BASE1 0xFDC08020 #define SH_DMARS_BASE0 0xFE009000 #define SH_DMARS_BASE1 0xFDC09000 -#define CHCR_TS_LOW_MASK 0x00000018 -#define CHCR_TS_LOW_SHIFT 3 -#define CHCR_TS_HIGH_MASK 0x00600000 -#define CHCR_TS_HIGH_SHIFT 21 #elif defined(CONFIG_CPU_SUBTYPE_SH7780) #define DMTE0_IRQ 34 #define DMTE4_IRQ 44 @@ -80,10 +60,6 @@ #define SH_DMAC_BASE0 0xFC808020 #define SH_DMAC_BASE1 0xFC818020 #define SH_DMARS_BASE0 0xFC809000 -#define CHCR_TS_LOW_MASK 0x00000018 -#define CHCR_TS_LOW_SHIFT 3 -#define CHCR_TS_HIGH_MASK 0 -#define CHCR_TS_HIGH_SHIFT 0 #else /* SH7785 */ #define DMTE0_IRQ 33 #define DMTE4_IRQ 37 @@ -97,10 +73,6 @@ #define SH_DMAC_BASE0 0xFC808020 #define SH_DMAC_BASE1 0xFCC08020 #define SH_DMARS_BASE0 0xFC809000 -#define CHCR_TS_LOW_MASK 0x00000018 -#define CHCR_TS_LOW_SHIFT 3 -#define CHCR_TS_HIGH_MASK 0 -#define CHCR_TS_HIGH_SHIFT 0 #endif #define REQ_HE 0x000000C0 @@ -108,38 +80,4 @@ #define REQ_LE 0x00000040 #define TM_BURST 0x00000020 -/* - * The SuperH DMAC supports a number of transmit sizes, we list them here, - * with their respective values as they appear in the CHCR registers. - * - * Defaults to a 64-bit transfer size. - */ -enum { - XMIT_SZ_8BIT = 0, - XMIT_SZ_16BIT = 1, - XMIT_SZ_32BIT = 2, - XMIT_SZ_64BIT = 7, - XMIT_SZ_128BIT = 3, - XMIT_SZ_256BIT = 4, - XMIT_SZ_128BIT_BLK = 0xb, - XMIT_SZ_256BIT_BLK = 0xc, -}; - -/* - * The DMA count is defined as the number of bytes to transfer. - */ -#define TS_SHIFT { \ - [XMIT_SZ_8BIT] = 0, \ - [XMIT_SZ_16BIT] = 1, \ - [XMIT_SZ_32BIT] = 2, \ - [XMIT_SZ_64BIT] = 3, \ - [XMIT_SZ_128BIT] = 4, \ - [XMIT_SZ_256BIT] = 5, \ - [XMIT_SZ_128BIT_BLK] = 4, \ - [XMIT_SZ_256BIT_BLK] = 5, \ -} - -#define TS_INDEX2VAL(i) ((((i) & 3) << CHCR_TS_LOW_SHIFT) | \ - ((((i) >> 2) & 3) << CHCR_TS_HIGH_SHIFT)) - #endif /* __ASM_SH_CPU_SH4_DMA_SH7780_H */ diff --git a/arch/sh/include/cpu-sh4/cpu/dma.h b/arch/sh/include/cpu-sh4/cpu/dma.h index 114a369..ca747e9 100644 --- a/arch/sh/include/cpu-sh4/cpu/dma.h +++ b/arch/sh/include/cpu-sh4/cpu/dma.h @@ -5,9 +5,8 @@ #ifdef CONFIG_CPU_SH4A -#define DMAOR_INIT (DMAOR_DME) - #include <cpu/dma-sh4a.h> + #else /* CONFIG_CPU_SH4A */ /* * SH7750/SH7751/SH7760 @@ -17,7 +16,6 @@ #define DMTE6_IRQ 46 #define DMAE0_IRQ 38 -#define DMAOR_INIT (0x8000|DMAOR_DME) #define SH_DMAC_BASE0 0xffa00000 #define SH_DMAC_BASE1 0xffa00070 /* Definitions for the SuperH DMAC */ @@ -27,40 +25,8 @@ #define TS_32 0x00000030 #define TS_64 0x00000000 -#define CHCR_TS_LOW_MASK 0x70 -#define CHCR_TS_LOW_SHIFT 4 -#define CHCR_TS_HIGH_MASK 0 -#define CHCR_TS_HIGH_SHIFT 0 - #define DMAOR_COD 0x00000008 -/* - * The SuperH DMAC supports a number of transmit sizes, we list them here, - * with their respective values as they appear in the CHCR registers. - * - * Defaults to a 64-bit transfer size. - */ -enum { - XMIT_SZ_8BIT = 1, - XMIT_SZ_16BIT = 2, - XMIT_SZ_32BIT = 3, - XMIT_SZ_64BIT = 0, - XMIT_SZ_256BIT = 4, -}; - -/* - * The DMA count is defined as the number of bytes to transfer. - */ -#define TS_SHIFT { \ - [XMIT_SZ_8BIT] = 0, \ - [XMIT_SZ_16BIT] = 1, \ - [XMIT_SZ_32BIT] = 2, \ - [XMIT_SZ_64BIT] = 3, \ - [XMIT_SZ_256BIT] = 5, \ -} - -#define TS_INDEX2VAL(i) (((i) & 7) << CHCR_TS_LOW_SHIFT) - #endif #endif /* __ASM_CPU_SH4_DMA_H */ diff --git a/arch/sh/include/mach-migor/mach/migor.h b/arch/sh/include/mach-migor/mach/migor.h index cee6cb8..42fccf9 100644 --- a/arch/sh/include/mach-migor/mach/migor.h +++ b/arch/sh/include/mach-migor/mach/migor.h @@ -1,6 +1,7 @@ #ifndef __ASM_SH_MIGOR_H #define __ASM_SH_MIGOR_H +#define PORT_MSELCRA 0xa4050180 #define PORT_MSELCRB 0xa4050182 #define BSC_CS4BCR 0xfec10010 #define BSC_CS6ABCR 0xfec1001c diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index ef3f978..fd7e363 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -7,19 +7,167 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. */ -#include <linux/platform_device.h> #include <linux/init.h> +#include <linux/mm.h> +#include <linux/platform_device.h> #include <linux/serial.h> #include <linux/serial_sci.h> -#include <linux/mm.h> +#include <linux/sh_timer.h> #include <linux/uio_driver.h> #include <linux/usb/m66592.h> -#include <linux/sh_timer.h> + #include <asm/clock.h> +#include <asm/dmaengine.h> #include <asm/mmzone.h> -#include <asm/dma-sh.h> +#include <asm/siu.h> + +#include <cpu/dma-register.h> #include <cpu/sh7722.h> +static struct sh_dmae_slave_config sh7722_dmae_slaves[] = { + { + .slave_id = SHDMA_SLAVE_SCIF0_TX, + .addr = 0xffe0000c, + .chcr = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT), + .mid_rid = 0x21, + }, { + .slave_id = SHDMA_SLAVE_SCIF0_RX, + .addr = 0xffe00014, + .chcr = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT), + .mid_rid = 0x22, + }, { + .slave_id = SHDMA_SLAVE_SCIF1_TX, + .addr = 0xffe1000c, + .chcr = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT), + .mid_rid = 0x25, + }, { + .slave_id = SHDMA_SLAVE_SCIF1_RX, + .addr = 0xffe10014, + .chcr = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT), + .mid_rid = 0x26, + }, { + .slave_id = SHDMA_SLAVE_SCIF2_TX, + .addr = 0xffe2000c, + .chcr = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT), + .mid_rid = 0x29, + }, { + .slave_id = SHDMA_SLAVE_SCIF2_RX, + .addr = 0xffe20014, + .chcr = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_8BIT), + .mid_rid = 0x2a, + }, { + .slave_id = SHDMA_SLAVE_SIUA_TX, + .addr = 0xa454c098, + .chcr = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT), + .mid_rid = 0xb1, + }, { + .slave_id = SHDMA_SLAVE_SIUA_RX, + .addr = 0xa454c090, + .chcr = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT), + .mid_rid = 0xb2, + }, { + .slave_id = SHDMA_SLAVE_SIUB_TX, + .addr = 0xa454c09c, + .chcr = DM_FIX | SM_INC | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT), + .mid_rid = 0xb5, + }, { + .slave_id = SHDMA_SLAVE_SIUB_RX, + .addr = 0xa454c094, + .chcr = DM_INC | SM_FIX | 0x800 | TS_INDEX2VAL(XMIT_SZ_32BIT), + .mid_rid = 0xb6, + }, +}; + +static struct sh_dmae_channel sh7722_dmae_channels[] = { + { + .offset = 0, + .dmars = 0, + .dmars_bit = 0, + }, { + .offset = 0x10, + .dmars = 0, + .dmars_bit = 8, + }, { + .offset = 0x20, + .dmars = 4, + .dmars_bit = 0, + }, { + .offset = 0x30, + .dmars = 4, + .dmars_bit = 8, + }, { + .offset = 0x50, + .dmars = 8, + .dmars_bit = 0, + }, { + .offset = 0x60, + .dmars = 8, + .dmars_bit = 8, + } +}; + +static unsigned int ts_shift[] = TS_SHIFT; + +static struct sh_dmae_pdata dma_platform_data = { + .slave = sh7722_dmae_slaves, + .slave_num = ARRAY_SIZE(sh7722_dmae_slaves), + .channel = sh7722_dmae_channels, + .channel_num = ARRAY_SIZE(sh7722_dmae_channels), + .ts_low_shift = CHCR_TS_LOW_SHIFT, + .ts_low_mask = CHCR_TS_LOW_MASK, + .ts_high_shift = CHCR_TS_HIGH_SHIFT, + .ts_high_mask = CHCR_TS_HIGH_MASK, + .ts_shift = ts_shift, + .ts_shift_num = ARRAY_SIZE(ts_shift), + .dmaor_init = DMAOR_INIT, +}; + +static struct resource sh7722_dmae_resources[] = { + [0] = { + /* Channel registers and DMAOR */ + .start = 0xfe008020, + .end = 0xfe00808f, + .flags = IORESOURCE_MEM, + }, + [1] = { + /* DMARSx */ + .start = 0xfe009000, + .end = 0xfe00900b, + .flags = IORESOURCE_MEM, + }, + { + /* DMA error IRQ */ + .start = 78, + .end = 78, + .flags = IORESOURCE_IRQ, + }, + { + /* IRQ for channels 0-3 */ + .start = 48, + .end = 51, + .flags = IORESOURCE_IRQ, + }, + { + /* IRQ for channels 4-5 */ + .start = 76, + .end = 77, + .flags = IORESOURCE_IRQ, + }, +}; + +struct platform_device dma_device = { + .name = "sh-dma-engine", + .id = -1, + .resource = sh7722_dmae_resources, + .num_resources = ARRAY_SIZE(sh7722_dmae_resources), + .dev = { + .platform_data = &dma_platform_data, + }, + .archdata = { + .hwblk_id = HWBLK_DMAC, + }, +}; + /* Serial */ static struct plat_sci_port scif0_platform_data = { .mapbase = 0xffe00000, @@ -388,15 +536,36 @@ static struct platform_device tmu2_device = { }, }; -static struct sh_dmae_pdata dma_platform_data = { - .mode = 0, +static struct siu_platform siu_platform_data = { + .dma_dev = &dma_device.dev, + .dma_slave_tx_a = SHDMA_SLAVE_SIUA_TX, + .dma_slave_rx_a = SHDMA_SLAVE_SIUA_RX, + .dma_slave_tx_b = SHDMA_SLAVE_SIUB_TX, + .dma_slave_rx_b = SHDMA_SLAVE_SIUB_RX, }; -static struct platform_device dma_device = { - .name = "sh-dma-engine", +static struct resource siu_resources[] = { + [0] = { + .start = 0xa4540000, + .end = 0xa454c10f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 108, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device siu_device = { + .name = "sh_siu", .id = -1, - .dev = { - .platform_data = &dma_platform_data, + .dev = { + .platform_data = &siu_platform_data, + }, + .resource = siu_resources, + .num_resources = ARRAY_SIZE(siu_resources), + .archdata = { + .hwblk_id = HWBLK_SIU, }, }; @@ -414,6 +583,7 @@ static struct platform_device *sh7722_devices[] __initdata = { &vpu_device, &veu_device, &jpu_device, + &siu_device, &dma_device, }; diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index 31e3451..e7fa2a9 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -21,22 +21,189 @@ #include <linux/sh_timer.h> #include <linux/io.h> #include <linux/notifier.h> + #include <asm/suspend.h> #include <asm/clock.h> -#include <asm/dma-sh.h> +#include <asm/dmaengine.h> #include <asm/mmzone.h> + +#include <cpu/dma-register.h> #include <cpu/sh7724.h> /* DMA */ -static struct sh_dmae_pdata dma_platform_data = { - .mode = SHDMA_DMAOR1, +static struct sh_dmae_channel sh7724_dmae0_channels[] = { + { + .offset = 0, + .dmars = 0, + .dmars_bit = 0, + }, { + .offset = 0x10, + .dmars = 0, + .dmars_bit = 8, + }, { + .offset = 0x20, + .dmars = 4, + .dmars_bit = 0, + }, { + .offset = 0x30, + .dmars = 4, + .dmars_bit = 8, + }, { + .offset = 0x50, + .dmars = 8, + .dmars_bit = 0, + }, { + .offset = 0x60, + .dmars = 8, + .dmars_bit = 8, + } +}; + +static struct sh_dmae_channel sh7724_dmae1_channels[] = { + { + .offset = 0, + .dmars = 0, + .dmars_bit = 0, + }, { + .offset = 0x10, + .dmars = 0, + .dmars_bit = 8, + }, { + .offset = 0x20, + .dmars = 4, + .dmars_bit = 0, + }, { + .offset = 0x30, + .dmars = 4, + .dmars_bit = 8, + }, { + .offset = 0x50, + .dmars = 8, + .dmars_bit = 0, + }, { + .offset = 0x60, + .dmars = 8, + .dmars_bit = 8, + } +}; + +static unsigned int ts_shift[] = TS_SHIFT; + +static struct sh_dmae_pdata dma0_platform_data = { + .channel = sh7724_dmae0_channels, + .channel_num = ARRAY_SIZE(sh7724_dmae0_channels), + .ts_low_shift = CHCR_TS_LOW_SHIFT, + .ts_low_mask = CHCR_TS_LOW_MASK, + .ts_high_shift = CHCR_TS_HIGH_SHIFT, + .ts_high_mask = CHCR_TS_HIGH_MASK, + .ts_shift = ts_shift, + .ts_shift_num = ARRAY_SIZE(ts_shift), + .dmaor_init = DMAOR_INIT, +}; + +static struct sh_dmae_pdata dma1_platform_data = { + .channel = sh7724_dmae1_channels, + .channel_num = ARRAY_SIZE(sh7724_dmae1_channels), + .ts_low_shift = CHCR_TS_LOW_SHIFT, + .ts_low_mask = CHCR_TS_LOW_MASK, + .ts_high_shift = CHCR_TS_HIGH_SHIFT, + .ts_high_mask = CHCR_TS_HIGH_MASK, + .ts_shift = ts_shift, + .ts_shift_num = ARRAY_SIZE(ts_shift), + .dmaor_init = DMAOR_INIT, +}; + +/* Resource order important! */ +static struct resource sh7724_dmae0_resources[] = { + { + /* Channel registers and DMAOR */ + .start = 0xfe008020, + .end = 0xfe00808f, + .flags = IORESOURCE_MEM, + }, + { + /* DMARSx */ + .start = 0xfe009000, + .end = 0xfe00900b, + .flags = IORESOURCE_MEM, + }, + { + /* DMA error IRQ */ + .start = 78, + .end = 78, + .flags = IORESOURCE_IRQ, + }, + { + /* IRQ for channels 0-3 */ + .start = 48, + .end = 51, + .flags = IORESOURCE_IRQ, + }, + { + /* IRQ for channels 4-5 */ + .start = 76, + .end = 77, + .flags = IORESOURCE_IRQ, + }, }; -static struct platform_device dma_device = { - .name = "sh-dma-engine", - .id = -1, - .dev = { - .platform_data = &dma_platform_data, +/* Resource order important! */ +static struct resource sh7724_dmae1_resources[] = { + { + /* Channel registers and DMAOR */ + .start = 0xfdc08020, + .end = 0xfdc0808f, + .flags = IORESOURCE_MEM, + }, + { + /* DMARSx */ + .start = 0xfdc09000, + .end = 0xfdc0900b, + .flags = IORESOURCE_MEM, + }, + { + /* DMA error IRQ */ + .start = 74, + .end = 74, + .flags = IORESOURCE_IRQ, + }, + { + /* IRQ for channels 0-3 */ + .start = 40, + .end = 43, + .flags = IORESOURCE_IRQ, + }, + { + /* IRQ for channels 4-5 */ + .start = 72, + .end = 73, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device dma0_device = { + .name = "sh-dma-engine", + .id = 0, + .resource = sh7724_dmae0_resources, + .num_resources = ARRAY_SIZE(sh7724_dmae0_resources), + .dev = { + .platform_data = &dma0_platform_data, + }, + .archdata = { + .hwblk_id = HWBLK_DMAC0, + }, +}; + +static struct platform_device dma1_device = { + .name = "sh-dma-engine", + .id = 1, + .resource = sh7724_dmae1_resources, + .num_resources = ARRAY_SIZE(sh7724_dmae1_resources), + .dev = { + .platform_data = &dma1_platform_data, + }, + .archdata = { + .hwblk_id = HWBLK_DMAC1, }, }; @@ -663,7 +830,8 @@ static struct platform_device *sh7724_devices[] __initdata = { &tmu3_device, &tmu4_device, &tmu5_device, - &dma_device, + &dma0_device, + &dma1_device, &rtc_device, &iic0_device, &iic1_device, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c index f8f2161..02e792c 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c @@ -13,7 +13,10 @@ #include <linux/io.h> #include <linux/serial_sci.h> #include <linux/sh_timer.h> -#include <asm/dma-sh.h> + +#include <asm/dmaengine.h> + +#include <cpu/dma-register.h> static struct plat_sci_port scif0_platform_data = { .mapbase = 0xffe00000, @@ -247,15 +250,131 @@ static struct platform_device rtc_device = { .resource = rtc_resources, }; -static struct sh_dmae_pdata dma_platform_data = { - .mode = (SHDMA_MIX_IRQ | SHDMA_DMAOR1), +/* DMA */ +static struct sh_dmae_channel sh7780_dmae0_channels[] = { + { + .offset = 0, + .dmars = 0, + .dmars_bit = 0, + }, { + .offset = 0x10, + .dmars = 0, + .dmars_bit = 8, + }, { + .offset = 0x20, + .dmars = 4, + .dmars_bit = 0, + }, { + .offset = 0x30, + .dmars = 4, + .dmars_bit = 8, + }, { + .offset = 0x50, + .dmars = 8, + .dmars_bit = 0, + }, { + .offset = 0x60, + .dmars = 8, + .dmars_bit = 8, + } +}; + +static struct sh_dmae_channel sh7780_dmae1_channels[] = { + { + .offset = 0, + }, { + .offset = 0x10, + }, { + .offset = 0x20, + }, { + .offset = 0x30, + }, { + .offset = 0x50, + }, { + .offset = 0x60, + } +}; + +static unsigned int ts_shift[] = TS_SHIFT; + +static struct sh_dmae_pdata dma0_platform_data = { + .channel = sh7780_dmae0_channels, + .channel_num = ARRAY_SIZE(sh7780_dmae0_channels), + .ts_low_shift = CHCR_TS_LOW_SHIFT, + .ts_low_mask = CHCR_TS_LOW_MASK, + .ts_high_shift = CHCR_TS_HIGH_SHIFT, + .ts_high_mask = CHCR_TS_HIGH_MASK, + .ts_shift = ts_shift, + .ts_shift_num = ARRAY_SIZE(ts_shift), + .dmaor_init = DMAOR_INIT, +}; + +static struct sh_dmae_pdata dma1_platform_data = { + .channel = sh7780_dmae1_channels, + .channel_num = ARRAY_SIZE(sh7780_dmae1_channels), + .ts_low_shift = CHCR_TS_LOW_SHIFT, + .ts_low_mask = CHCR_TS_LOW_MASK, + .ts_high_shift = CHCR_TS_HIGH_SHIFT, + .ts_high_mask = CHCR_TS_HIGH_MASK, + .ts_shift = ts_shift, + .ts_shift_num = ARRAY_SIZE(ts_shift), + .dmaor_init = DMAOR_INIT, }; -static struct platform_device dma_device = { +static struct resource sh7780_dmae0_resources[] = { + [0] = { + /* Channel registers and DMAOR */ + .start = 0xfc808020, + .end = 0xfc80808f, + .flags = IORESOURCE_MEM, + }, + [1] = { + /* DMARSx */ + .start = 0xfc809000, + .end = 0xfc80900b, + .flags = IORESOURCE_MEM, + }, + { + /* Real DMA error IRQ is 38, and channel IRQs are 34-37, 44-45 */ + .start = 34, + .end = 34, + .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_SHAREABLE, + }, +}; + +static struct resource sh7780_dmae1_resources[] = { + [0] = { + /* Channel registers and DMAOR */ + .start = 0xfc818020, + .end = 0xfc81808f, + .flags = IORESOURCE_MEM, + }, + /* DMAC1 has no DMARS */ + { + /* Real DMA error IRQ is 38, and channel IRQs are 46-47, 92-95 */ + .start = 46, + .end = 46, + .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_SHAREABLE, + }, +}; + +static struct platform_device dma0_device = { .name = "sh-dma-engine", - .id = -1, + .id = 0, + .resource = sh7780_dmae0_resources, + .num_resources = ARRAY_SIZE(sh7780_dmae0_resources), .dev = { - .platform_data = &dma_platform_data, + .platform_data = &dma0_platform_data, + }, +}; + +static struct platform_device dma1_device = { + .name = "sh-dma-engine", + .id = 1, + .resource = sh7780_dmae1_resources, + .num_resources = ARRAY_SIZE(sh7780_dmae1_resources), + .dev = { + .platform_data = &dma1_platform_data, }, }; @@ -269,7 +388,8 @@ static struct platform_device *sh7780_devices[] __initdata = { &tmu4_device, &tmu5_device, &rtc_device, - &dma_device, + &dma0_device, + &dma1_device, }; static int __init sh7780_devices_setup(void) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c index 23448d8..1fcd88b 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c @@ -14,9 +14,12 @@ #include <linux/io.h> #include <linux/mm.h> #include <linux/sh_timer.h> -#include <asm/dma-sh.h> + +#include <asm/dmaengine.h> #include <asm/mmzone.h> +#include <cpu/dma-register.h> + static struct plat_sci_port scif0_platform_data = { .mapbase = 0xffea0000, .flags = UPF_BOOT_AUTOCONF, @@ -295,15 +298,131 @@ static struct platform_device tmu5_device = { .num_resources = ARRAY_SIZE(tmu5_resources), }; -static struct sh_dmae_pdata dma_platform_data = { - .mode = (SHDMA_MIX_IRQ | SHDMA_DMAOR1), +/* DMA */ +static struct sh_dmae_channel sh7785_dmae0_channels[] = { + { + .offset = 0, + .dmars = 0, + .dmars_bit = 0, + }, { + .offset = 0x10, + .dmars = 0, + .dmars_bit = 8, + }, { + .offset = 0x20, + .dmars = 4, + .dmars_bit = 0, + }, { + .offset = 0x30, + .dmars = 4, + .dmars_bit = 8, + }, { + .offset = 0x50, + .dmars = 8, + .dmars_bit = 0, + }, { + .offset = 0x60, + .dmars = 8, + .dmars_bit = 8, + } +}; + +static struct sh_dmae_channel sh7785_dmae1_channels[] = { + { + .offset = 0, + }, { + .offset = 0x10, + }, { + .offset = 0x20, + }, { + .offset = 0x30, + }, { + .offset = 0x50, + }, { + .offset = 0x60, + } +}; + +static unsigned int ts_shift[] = TS_SHIFT; + +static struct sh_dmae_pdata dma0_platform_data = { + .channel = sh7785_dmae0_channels, + .channel_num = ARRAY_SIZE(sh7785_dmae0_channels), + .ts_low_shift = CHCR_TS_LOW_SHIFT, + .ts_low_mask = CHCR_TS_LOW_MASK, + .ts_high_shift = CHCR_TS_HIGH_SHIFT, + .ts_high_mask = CHCR_TS_HIGH_MASK, + .ts_shift = ts_shift, + .ts_shift_num = ARRAY_SIZE(ts_shift), + .dmaor_init = DMAOR_INIT, +}; + +static struct sh_dmae_pdata dma1_platform_data = { + .channel = sh7785_dmae1_channels, + .channel_num = ARRAY_SIZE(sh7785_dmae1_channels), + .ts_low_shift = CHCR_TS_LOW_SHIFT, + .ts_low_mask = CHCR_TS_LOW_MASK, + .ts_high_shift = CHCR_TS_HIGH_SHIFT, + .ts_high_mask = CHCR_TS_HIGH_MASK, + .ts_shift = ts_shift, + .ts_shift_num = ARRAY_SIZE(ts_shift), + .dmaor_init = DMAOR_INIT, }; -static struct platform_device dma_device = { +static struct resource sh7785_dmae0_resources[] = { + [0] = { + /* Channel registers and DMAOR */ + .start = 0xfc808020, + .end = 0xfc80808f, + .flags = IORESOURCE_MEM, + }, + [1] = { + /* DMARSx */ + .start = 0xfc809000, + .end = 0xfc80900b, + .flags = IORESOURCE_MEM, + }, + { + /* Real DMA error IRQ is 39, and channel IRQs are 33-38 */ + .start = 33, + .end = 33, + .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_SHAREABLE, + }, +}; + +static struct resource sh7785_dmae1_resources[] = { + [0] = { + /* Channel registers and DMAOR */ + .start = 0xfcc08020, + .end = 0xfcc0808f, + .flags = IORESOURCE_MEM, + }, + /* DMAC1 has no DMARS */ + { + /* Real DMA error IRQ is 58, and channel IRQs are 52-57 */ + .start = 52, + .end = 52, + .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_SHAREABLE, + }, +}; + +static struct platform_device dma0_device = { .name = "sh-dma-engine", - .id = -1, + .id = 0, + .resource = sh7785_dmae0_resources, + .num_resources = ARRAY_SIZE(sh7785_dmae0_resources), .dev = { - .platform_data = &dma_platform_data, + .platform_data = &dma0_platform_data, + }, +}; + +static struct platform_device dma1_device = { + .name = "sh-dma-engine", + .id = 1, + .resource = sh7785_dmae1_resources, + .num_resources = ARRAY_SIZE(sh7785_dmae1_resources), + .dev = { + .platform_data = &dma1_platform_data, }, }; @@ -320,7 +439,8 @@ static struct platform_device *sh7785_devices[] __initdata = { &tmu3_device, &tmu4_device, &tmu5_device, - &dma_device, + &dma0_device, + &dma1_device, }; static int __init sh7785_devices_setup(void) diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c index e2f1753..675eea7 100644 --- a/arch/sh/kernel/hw_breakpoint.c +++ b/arch/sh/kernel/hw_breakpoint.c @@ -143,26 +143,6 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } -/* - * Store a breakpoint's encoded address, length, and type. - */ -static int arch_store_info(struct perf_event *bp) -{ - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - - /* - * User-space requests will always have the address field populated - * For kernel-addresses, either the address or symbol name can be - * specified. - */ - if (info->name) - info->address = (unsigned long)kallsyms_lookup_name(info->name); - if (info->address) - return 0; - - return -EINVAL; -} - int arch_bp_generic_fields(int sh_len, int sh_type, int *gen_len, int *gen_type) { @@ -276,10 +256,12 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - ret = arch_store_info(bp); - - if (ret < 0) - return ret; + /* + * For kernel-addresses, either the address or symbol name can be + * specified. + */ + if (info->name) + info->address = (unsigned long)kallsyms_lookup_name(info->name); /* * Check that the low-order bits of the address are appropriate diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index 3459e70..8870d6b 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -443,7 +443,7 @@ void __init setup_arch(char **cmdline_p) nodes_clear(node_online_map); - /* Setup bootmem with available RAM */ + pmb_init(); lmb_init(); setup_memory(); sparse_init(); @@ -452,7 +452,6 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif paging_init(); - pmb_init(); ioremap_fixed_init(); diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index 953fa16..8a0072d 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -39,12 +39,12 @@ static int null_rtc_set_time(const time_t secs) void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time; int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time; -#ifdef CONFIG_GENERIC_CMOS_UPDATE void read_persistent_clock(struct timespec *ts) { rtc_sh_get_time(ts); } +#ifdef CONFIG_GENERIC_CMOS_UPDATE int update_persistent_clock(struct timespec now) { return rtc_sh_set_time(now.tv_sec); @@ -113,9 +113,5 @@ void __init time_init(void) hwblk_init(); clk_init(); - rtc_sh_get_time(&xtime); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - late_time_init = sh_late_time_init; } diff --git a/arch/sh/lib/libgcc.h b/arch/sh/lib/libgcc.h index 3f19d1c..05909d58 100644 --- a/arch/sh/lib/libgcc.h +++ b/arch/sh/lib/libgcc.h @@ -17,8 +17,7 @@ struct DWstruct { #error I feel sick. #endif -typedef union -{ +typedef union { struct DWstruct s; long long ll; } DWunion; diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c index c68d2d7..1ab2385 100644 --- a/arch/sh/mm/ioremap.c +++ b/arch/sh/mm/ioremap.c @@ -34,11 +34,12 @@ * caller shouldn't need to know that small detail. */ void __iomem * __init_refok -__ioremap_caller(unsigned long phys_addr, unsigned long size, +__ioremap_caller(phys_addr_t phys_addr, unsigned long size, pgprot_t pgprot, void *caller) { struct vm_struct *area; unsigned long offset, last_addr, addr, orig_addr; + void __iomem *mapped; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; @@ -46,6 +47,20 @@ __ioremap_caller(unsigned long phys_addr, unsigned long size, return NULL; /* + * If we can't yet use the regular approach, go the fixmap route. + */ + if (!mem_init_done) + return ioremap_fixed(phys_addr, size, pgprot); + + /* + * First try to remap through the PMB. + * PMB entries are all pre-faulted. + */ + mapped = pmb_remap_caller(phys_addr, size, pgprot, caller); + if (mapped && !IS_ERR(mapped)) + return mapped; + + /* * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; @@ -53,12 +68,6 @@ __ioremap_caller(unsigned long phys_addr, unsigned long size, size = PAGE_ALIGN(last_addr+1) - phys_addr; /* - * If we can't yet use the regular approach, go the fixmap route. - */ - if (!mem_init_done) - return ioremap_fixed(phys_addr, offset, size, pgprot); - - /* * Ok, go for it.. */ area = get_vm_area_caller(size, VM_IOREMAP, caller); @@ -67,33 +76,10 @@ __ioremap_caller(unsigned long phys_addr, unsigned long size, area->phys_addr = phys_addr; orig_addr = addr = (unsigned long)area->addr; -#ifdef CONFIG_PMB - /* - * First try to remap through the PMB once a valid VMA has been - * established. Smaller allocations (or the rest of the size - * remaining after a PMB mapping due to the size not being - * perfectly aligned on a PMB size boundary) are then mapped - * through the UTLB using conventional page tables. - * - * PMB entries are all pre-faulted. - */ - if (unlikely(phys_addr >= P1SEG)) { - unsigned long mapped; - - mapped = pmb_remap(addr, phys_addr, size, pgprot); - if (likely(mapped)) { - addr += mapped; - phys_addr += mapped; - size -= mapped; - } + if (ioremap_page_range(addr, addr + size, phys_addr, pgprot)) { + vunmap((void *)orig_addr); + return NULL; } -#endif - - if (likely(size)) - if (ioremap_page_range(addr, addr + size, phys_addr, pgprot)) { - vunmap((void *)orig_addr); - return NULL; - } return (void __iomem *)(offset + (char *)orig_addr); } @@ -133,23 +119,11 @@ void __iounmap(void __iomem *addr) if (iounmap_fixed(addr) == 0) return; -#ifdef CONFIG_PMB /* - * Purge any PMB entries that may have been established for this - * mapping, then proceed with conventional VMA teardown. - * - * XXX: Note that due to the way that remove_vm_area() does - * matching of the resultant VMA, we aren't able to fast-forward - * the address past the PMB space until the end of the VMA where - * the page tables reside. As such, unmap_vm_area() will be - * forced to linearly scan over the area until it finds the page - * tables where PTEs that need to be unmapped actually reside, - * which is far from optimal. Perhaps we need to use a separate - * VMA for the PMB mappings? - * -- PFM. + * If the PMB handled it, there's nothing else to do. */ - pmb_unmap(vaddr); -#endif + if (pmb_unmap(addr) == 0) + return; p = remove_vm_area((void *)(vaddr & PAGE_MASK)); if (!p) { diff --git a/arch/sh/mm/ioremap_fixed.c b/arch/sh/mm/ioremap_fixed.c index 0b78b1e..7f682e5 100644 --- a/arch/sh/mm/ioremap_fixed.c +++ b/arch/sh/mm/ioremap_fixed.c @@ -45,14 +45,21 @@ void __init ioremap_fixed_init(void) } void __init __iomem * -ioremap_fixed(resource_size_t phys_addr, unsigned long offset, - unsigned long size, pgprot_t prot) +ioremap_fixed(phys_addr_t phys_addr, unsigned long size, pgprot_t prot) { enum fixed_addresses idx0, idx; struct ioremap_map *map; unsigned int nrpages; + unsigned long offset; int i, slot; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(phys_addr + size) - phys_addr; + slot = -1; for (i = 0; i < FIX_N_IOREMAPS; i++) { map = &ioremap_maps[i]; diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c index 422e927..961b340 100644 --- a/arch/sh/mm/numa.c +++ b/arch/sh/mm/numa.c @@ -74,6 +74,9 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end) start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; + pmb_bolt_mapping((unsigned long)__va(start), start, end - start, + PAGE_KERNEL); + lmb_add(start, end - start); __add_active_range(nid, start_pfn, end_pfn); diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index 198bcff..a4662e2 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -23,7 +23,8 @@ #include <linux/err.h> #include <linux/io.h> #include <linux/spinlock.h> -#include <linux/rwlock.h> +#include <linux/vmalloc.h> +#include <asm/cacheflush.h> #include <asm/sizes.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -52,12 +53,24 @@ struct pmb_entry { struct pmb_entry *link; }; +static struct { + unsigned long size; + int flag; +} pmb_sizes[] = { + { .size = SZ_512M, .flag = PMB_SZ_512M, }, + { .size = SZ_128M, .flag = PMB_SZ_128M, }, + { .size = SZ_64M, .flag = PMB_SZ_64M, }, + { .size = SZ_16M, .flag = PMB_SZ_16M, }, +}; + static void pmb_unmap_entry(struct pmb_entry *, int depth); static DEFINE_RWLOCK(pmb_rwlock); static struct pmb_entry pmb_entry_list[NR_PMB_ENTRIES]; static DECLARE_BITMAP(pmb_map, NR_PMB_ENTRIES); +static unsigned int pmb_iomapping_enabled; + static __always_inline unsigned long mk_pmb_entry(unsigned int entry) { return (entry & PMB_E_MASK) << PMB_E_SHIFT; @@ -73,6 +86,142 @@ static __always_inline unsigned long mk_pmb_data(unsigned int entry) return mk_pmb_entry(entry) | PMB_DATA; } +static __always_inline unsigned int pmb_ppn_in_range(unsigned long ppn) +{ + return ppn >= __pa(memory_start) && ppn < __pa(memory_end); +} + +/* + * Ensure that the PMB entries match our cache configuration. + * + * When we are in 32-bit address extended mode, CCR.CB becomes + * invalid, so care must be taken to manually adjust cacheable + * translations. + */ +static __always_inline unsigned long pmb_cache_flags(void) +{ + unsigned long flags = 0; + +#if defined(CONFIG_CACHE_OFF) + flags |= PMB_WT | PMB_UB; +#elif defined(CONFIG_CACHE_WRITETHROUGH) + flags |= PMB_C | PMB_WT | PMB_UB; +#elif defined(CONFIG_CACHE_WRITEBACK) + flags |= PMB_C; +#endif + + return flags; +} + +/* + * Convert typical pgprot value to the PMB equivalent + */ +static inline unsigned long pgprot_to_pmb_flags(pgprot_t prot) +{ + unsigned long pmb_flags = 0; + u64 flags = pgprot_val(prot); + + if (flags & _PAGE_CACHABLE) + pmb_flags |= PMB_C; + if (flags & _PAGE_WT) + pmb_flags |= PMB_WT | PMB_UB; + + return pmb_flags; +} + +static inline bool pmb_can_merge(struct pmb_entry *a, struct pmb_entry *b) +{ + return (b->vpn == (a->vpn + a->size)) && + (b->ppn == (a->ppn + a->size)) && + (b->flags == a->flags); +} + +static bool pmb_mapping_exists(unsigned long vaddr, phys_addr_t phys, + unsigned long size) +{ + int i; + + read_lock(&pmb_rwlock); + + for (i = 0; i < ARRAY_SIZE(pmb_entry_list); i++) { + struct pmb_entry *pmbe, *iter; + unsigned long span; + + if (!test_bit(i, pmb_map)) + continue; + + pmbe = &pmb_entry_list[i]; + + /* + * See if VPN and PPN are bounded by an existing mapping. + */ + if ((vaddr < pmbe->vpn) || (vaddr >= (pmbe->vpn + pmbe->size))) + continue; + if ((phys < pmbe->ppn) || (phys >= (pmbe->ppn + pmbe->size))) + continue; + + /* + * Now see if we're in range of a simple mapping. + */ + if (size <= pmbe->size) { + read_unlock(&pmb_rwlock); + return true; + } + + span = pmbe->size; + + /* + * Finally for sizes that involve compound mappings, walk + * the chain. + */ + for (iter = pmbe->link; iter; iter = iter->link) + span += iter->size; + + /* + * Nothing else to do if the range requirements are met. + */ + if (size <= span) { + read_unlock(&pmb_rwlock); + return true; + } + } + + read_unlock(&pmb_rwlock); + return false; +} + +static bool pmb_size_valid(unsigned long size) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pmb_sizes); i++) + if (pmb_sizes[i].size == size) + return true; + + return false; +} + +static inline bool pmb_addr_valid(unsigned long addr, unsigned long size) +{ + return (addr >= P1SEG && (addr + size - 1) < P3SEG); +} + +static inline bool pmb_prot_valid(pgprot_t prot) +{ + return (pgprot_val(prot) & _PAGE_USER) == 0; +} + +static int pmb_size_to_flags(unsigned long size) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pmb_sizes); i++) + if (pmb_sizes[i].size == size) + return pmb_sizes[i].flag; + + return 0; +} + static int pmb_alloc_entry(void) { int pos; @@ -140,33 +289,22 @@ static void pmb_free(struct pmb_entry *pmbe) } /* - * Ensure that the PMB entries match our cache configuration. - * - * When we are in 32-bit address extended mode, CCR.CB becomes - * invalid, so care must be taken to manually adjust cacheable - * translations. + * Must be run uncached. */ -static __always_inline unsigned long pmb_cache_flags(void) +static void __set_pmb_entry(struct pmb_entry *pmbe) { - unsigned long flags = 0; + unsigned long addr, data; -#if defined(CONFIG_CACHE_WRITETHROUGH) - flags |= PMB_C | PMB_WT | PMB_UB; -#elif defined(CONFIG_CACHE_WRITEBACK) - flags |= PMB_C; -#endif + addr = mk_pmb_addr(pmbe->entry); + data = mk_pmb_data(pmbe->entry); - return flags; -} + jump_to_uncached(); -/* - * Must be run uncached. - */ -static void __set_pmb_entry(struct pmb_entry *pmbe) -{ - writel_uncached(pmbe->vpn | PMB_V, mk_pmb_addr(pmbe->entry)); - writel_uncached(pmbe->ppn | pmbe->flags | PMB_V, - mk_pmb_data(pmbe->entry)); + /* Set V-bit */ + __raw_writel(pmbe->vpn | PMB_V, addr); + __raw_writel(pmbe->ppn | pmbe->flags | PMB_V, data); + + back_to_cached(); } static void __clear_pmb_entry(struct pmb_entry *pmbe) @@ -194,144 +332,155 @@ static void set_pmb_entry(struct pmb_entry *pmbe) spin_unlock_irqrestore(&pmbe->lock, flags); } -static struct { - unsigned long size; - int flag; -} pmb_sizes[] = { - { .size = SZ_512M, .flag = PMB_SZ_512M, }, - { .size = SZ_128M, .flag = PMB_SZ_128M, }, - { .size = SZ_64M, .flag = PMB_SZ_64M, }, - { .size = SZ_16M, .flag = PMB_SZ_16M, }, -}; - -long pmb_remap(unsigned long vaddr, unsigned long phys, - unsigned long size, pgprot_t prot) +int pmb_bolt_mapping(unsigned long vaddr, phys_addr_t phys, + unsigned long size, pgprot_t prot) { struct pmb_entry *pmbp, *pmbe; - unsigned long wanted; - int pmb_flags, i; - long err; - u64 flags; + unsigned long orig_addr, orig_size; + unsigned long flags, pmb_flags; + int i, mapped; - flags = pgprot_val(prot); + if (!pmb_addr_valid(vaddr, size)) + return -EFAULT; + if (pmb_mapping_exists(vaddr, phys, size)) + return 0; - pmb_flags = PMB_WT | PMB_UB; - - /* Convert typical pgprot value to the PMB equivalent */ - if (flags & _PAGE_CACHABLE) { - pmb_flags |= PMB_C; + orig_addr = vaddr; + orig_size = size; - if ((flags & _PAGE_WT) == 0) - pmb_flags &= ~(PMB_WT | PMB_UB); - } + flush_tlb_kernel_range(vaddr, vaddr + size); + pmb_flags = pgprot_to_pmb_flags(prot); pmbp = NULL; - wanted = size; -again: - for (i = 0; i < ARRAY_SIZE(pmb_sizes); i++) { - unsigned long flags; + do { + for (i = mapped = 0; i < ARRAY_SIZE(pmb_sizes); i++) { + if (size < pmb_sizes[i].size) + continue; + + pmbe = pmb_alloc(vaddr, phys, pmb_flags | + pmb_sizes[i].flag, PMB_NO_ENTRY); + if (IS_ERR(pmbe)) { + pmb_unmap_entry(pmbp, mapped); + return PTR_ERR(pmbe); + } - if (size < pmb_sizes[i].size) - continue; + spin_lock_irqsave(&pmbe->lock, flags); - pmbe = pmb_alloc(vaddr, phys, pmb_flags | pmb_sizes[i].flag, - PMB_NO_ENTRY); - if (IS_ERR(pmbe)) { - err = PTR_ERR(pmbe); - goto out; - } + pmbe->size = pmb_sizes[i].size; - spin_lock_irqsave(&pmbe->lock, flags); + __set_pmb_entry(pmbe); - __set_pmb_entry(pmbe); + phys += pmbe->size; + vaddr += pmbe->size; + size -= pmbe->size; - phys += pmb_sizes[i].size; - vaddr += pmb_sizes[i].size; - size -= pmb_sizes[i].size; + /* + * Link adjacent entries that span multiple PMB + * entries for easier tear-down. + */ + if (likely(pmbp)) { + spin_lock(&pmbp->lock); + pmbp->link = pmbe; + spin_unlock(&pmbp->lock); + } - pmbe->size = pmb_sizes[i].size; + pmbp = pmbe; - /* - * Link adjacent entries that span multiple PMB entries - * for easier tear-down. - */ - if (likely(pmbp)) { - spin_lock(&pmbp->lock); - pmbp->link = pmbe; - spin_unlock(&pmbp->lock); + /* + * Instead of trying smaller sizes on every + * iteration (even if we succeed in allocating + * space), try using pmb_sizes[i].size again. + */ + i--; + mapped++; + + spin_unlock_irqrestore(&pmbe->lock, flags); } + } while (size >= SZ_16M); - pmbp = pmbe; + flush_cache_vmap(orig_addr, orig_addr + orig_size); - /* - * Instead of trying smaller sizes on every iteration - * (even if we succeed in allocating space), try using - * pmb_sizes[i].size again. - */ - i--; + return 0; +} - spin_unlock_irqrestore(&pmbe->lock, flags); - } +void __iomem *pmb_remap_caller(phys_addr_t phys, unsigned long size, + pgprot_t prot, void *caller) +{ + unsigned long vaddr; + phys_addr_t offset, last_addr; + phys_addr_t align_mask; + unsigned long aligned; + struct vm_struct *area; + int i, ret; - if (size >= SZ_16M) - goto again; + if (!pmb_iomapping_enabled) + return NULL; - return wanted - size; + /* + * Small mappings need to go through the TLB. + */ + if (size < SZ_16M) + return ERR_PTR(-EINVAL); + if (!pmb_prot_valid(prot)) + return ERR_PTR(-EINVAL); -out: - pmb_unmap_entry(pmbp, NR_PMB_ENTRIES); + for (i = 0; i < ARRAY_SIZE(pmb_sizes); i++) + if (size >= pmb_sizes[i].size) + break; + + last_addr = phys + size; + align_mask = ~(pmb_sizes[i].size - 1); + offset = phys & ~align_mask; + phys &= align_mask; + aligned = ALIGN(last_addr, pmb_sizes[i].size) - phys; + + /* + * XXX: This should really start from uncached_end, but this + * causes the MMU to reset, so for now we restrict it to the + * 0xb000...0xc000 range. + */ + area = __get_vm_area_caller(aligned, VM_IOREMAP, 0xb0000000, + P3SEG, caller); + if (!area) + return NULL; + + area->phys_addr = phys; + vaddr = (unsigned long)area->addr; + + ret = pmb_bolt_mapping(vaddr, phys, size, prot); + if (unlikely(ret != 0)) + return ERR_PTR(ret); - return err; + return (void __iomem *)(offset + (char *)vaddr); } -void pmb_unmap(unsigned long addr) +int pmb_unmap(void __iomem *addr) { struct pmb_entry *pmbe = NULL; - int i; + unsigned long vaddr = (unsigned long __force)addr; + int i, found = 0; read_lock(&pmb_rwlock); for (i = 0; i < ARRAY_SIZE(pmb_entry_list); i++) { if (test_bit(i, pmb_map)) { pmbe = &pmb_entry_list[i]; - if (pmbe->vpn == addr) + if (pmbe->vpn == vaddr) { + found = 1; break; + } } } read_unlock(&pmb_rwlock); - pmb_unmap_entry(pmbe, NR_PMB_ENTRIES); -} - -static bool pmb_can_merge(struct pmb_entry *a, struct pmb_entry *b) -{ - return (b->vpn == (a->vpn + a->size)) && - (b->ppn == (a->ppn + a->size)) && - (b->flags == a->flags); -} - -static bool pmb_size_valid(unsigned long size) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(pmb_sizes); i++) - if (pmb_sizes[i].size == size) - return true; - - return false; -} - -static int pmb_size_to_flags(unsigned long size) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(pmb_sizes); i++) - if (pmb_sizes[i].size == size) - return pmb_sizes[i].flag; + if (found) { + pmb_unmap_entry(pmbe, NR_PMB_ENTRIES); + return 0; + } - return 0; + return -EINVAL; } static void __pmb_unmap_entry(struct pmb_entry *pmbe, int depth) @@ -351,6 +500,8 @@ static void __pmb_unmap_entry(struct pmb_entry *pmbe, int depth) */ __clear_pmb_entry(pmbe); + flush_cache_vunmap(pmbe->vpn, pmbe->vpn + pmbe->size); + pmbe = pmblink->link; pmb_free(pmblink); @@ -369,11 +520,6 @@ static void pmb_unmap_entry(struct pmb_entry *pmbe, int depth) write_unlock_irqrestore(&pmb_rwlock, flags); } -static __always_inline unsigned int pmb_ppn_in_range(unsigned long ppn) -{ - return ppn >= __pa(memory_start) && ppn < __pa(memory_end); -} - static void __init pmb_notify(void) { int i; @@ -625,6 +771,18 @@ static void __init pmb_resize(void) } #endif +static int __init early_pmb(char *p) +{ + if (!p) + return 0; + + if (strstr(p, "iomap")) + pmb_iomapping_enabled = 1; + + return 0; +} +early_param("pmb", early_pmb); + void __init pmb_init(void) { /* Synchronize software state */ @@ -713,7 +871,7 @@ static int __init pmb_debugfs_init(void) return 0; } -postcore_initcall(pmb_debugfs_init); +subsys_initcall(pmb_debugfs_init); #ifdef CONFIG_PM static int pmb_sysdev_suspend(struct sys_device *dev, pm_message_t state) diff --git a/arch/sparc/configs/sparc32_defconfig b/arch/sparc/configs/sparc32_defconfig index 99a1f19..6a8d078 100644 --- a/arch/sparc/configs/sparc32_defconfig +++ b/arch/sparc/configs/sparc32_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.33-rc2 -# Mon Jan 11 23:20:31 2010 +# Linux kernel version: 2.6.33 +# Wed Mar 3 02:52:23 2010 # # CONFIG_64BIT is not set CONFIG_SPARC=y @@ -9,6 +9,8 @@ CONFIG_SPARC32=y # CONFIG_SPARC64 is not set CONFIG_ARCH_DEFCONFIG="arch/sparc/configs/sparc32_defconfig" CONFIG_BITS=32 +CONFIG_GENERIC_TIME=y +CONFIG_ARCH_USES_GETTIMEOFFSET=y CONFIG_AUDIT_ARCH=y CONFIG_MMU=y CONFIG_HIGHMEM=y @@ -48,11 +50,6 @@ CONFIG_RCU_FANOUT=32 # CONFIG_TREE_RCU_TRACE is not set # CONFIG_IKCONFIG is not set CONFIG_LOG_BUF_SHIFT=14 -CONFIG_GROUP_SCHED=y -CONFIG_FAIR_GROUP_SCHED=y -CONFIG_RT_GROUP_SCHED=y -CONFIG_USER_SCHED=y -# CONFIG_CGROUP_SCHED is not set # CONFIG_CGROUPS is not set CONFIG_SYSFS_DEPRECATED=y CONFIG_SYSFS_DEPRECATED_V2=y @@ -68,6 +65,7 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y CONFIG_RD_BZIP2=y CONFIG_RD_LZMA=y +CONFIG_RD_LZO=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -211,7 +209,6 @@ CONFIG_SBUSCHAR=y CONFIG_PCI=y CONFIG_PCI_SYSCALL=y # CONFIG_ARCH_SUPPORTS_MSI is not set -CONFIG_PCI_LEGACY=y # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set # CONFIG_PCI_IOV is not set @@ -232,7 +229,6 @@ CONFIG_NET=y # Networking options # CONFIG_PACKET=y -# CONFIG_PACKET_MMAP is not set CONFIG_UNIX=y CONFIG_XFRM=y CONFIG_XFRM_USER=m @@ -379,11 +375,13 @@ CONFIG_MISC_DEVICES=y # CONFIG_TIFM_CORE is not set # CONFIG_ENCLOSURE_SERVICES is not set # CONFIG_HP_ILO is not set +# CONFIG_TI_DAC7512 is not set # CONFIG_C2PORT is not set # # EEPROM support # +# CONFIG_EEPROM_AT25 is not set # CONFIG_EEPROM_93CX6 is not set # CONFIG_CB710_CORE is not set CONFIG_HAVE_IDE=y @@ -507,7 +505,9 @@ CONFIG_SUNQE=m # CONFIG_SUNGEM is not set # CONFIG_CASSINI is not set # CONFIG_NET_VENDOR_3COM is not set +# CONFIG_ENC28J60 is not set # CONFIG_ETHOC is not set +# CONFIG_GRETH is not set # CONFIG_DNET is not set # CONFIG_NET_TULIP is not set # CONFIG_HP100 is not set @@ -521,6 +521,7 @@ CONFIG_SUNQE=m # CONFIG_NET_PCI is not set # CONFIG_B44 is not set # CONFIG_KS8842 is not set +# CONFIG_KS8851 is not set # CONFIG_KS8851_MLL is not set # CONFIG_ATL2 is not set CONFIG_NETDEV_1000=y @@ -563,6 +564,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_MLX4_CORE is not set # CONFIG_TEHUTI is not set # CONFIG_BNX2X is not set +# CONFIG_QLCNIC is not set # CONFIG_QLGE is not set # CONFIG_SFC is not set # CONFIG_BE2NET is not set @@ -665,6 +667,7 @@ CONFIG_DEVKMEM=y # # Non-8250 serial port support # +# CONFIG_SERIAL_MAX3100 is not set CONFIG_SERIAL_SUNCORE=y CONFIG_SERIAL_SUNZILOG=y CONFIG_SERIAL_SUNZILOG_CONSOLE=y @@ -689,7 +692,23 @@ CONFIG_HW_RANDOM=m # CONFIG_TCG_TPM is not set CONFIG_DEVPORT=y # CONFIG_I2C is not set -# CONFIG_SPI is not set +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_BITBANG=m +CONFIG_SPI_XILINX=m +CONFIG_SPI_XILINX_PLTFM=m +# CONFIG_SPI_DESIGNWARE is not set + +# +# SPI Protocol Masters +# +# CONFIG_SPI_SPIDEV is not set +# CONFIG_SPI_TLE62X0 is not set # # PPS support @@ -706,10 +725,13 @@ CONFIG_HWMON=y # # Native drivers # +# CONFIG_SENSORS_ADCXX is not set # CONFIG_SENSORS_I5K_AMB is not set # CONFIG_SENSORS_F71805F is not set # CONFIG_SENSORS_F71882FG is not set # CONFIG_SENSORS_IT87 is not set +# CONFIG_SENSORS_LM70 is not set +# CONFIG_SENSORS_MAX1111 is not set # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_PC87427 is not set # CONFIG_SENSORS_SIS5595 is not set @@ -720,6 +742,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_VT8231 is not set # CONFIG_SENSORS_W83627HF is not set # CONFIG_SENSORS_W83627EHF is not set +# CONFIG_SENSORS_LIS3_SPI is not set # CONFIG_THERMAL is not set # CONFIG_WATCHDOG is not set CONFIG_SSB_POSSIBLE=y @@ -736,6 +759,8 @@ CONFIG_SSB_POSSIBLE=y # CONFIG_MFD_SM501 is not set # CONFIG_HTC_PASIC3 is not set # CONFIG_MFD_TMIO is not set +# CONFIG_MFD_MC13783 is not set +# CONFIG_AB4500_CORE is not set # CONFIG_REGULATOR is not set # CONFIG_MEDIA_SUPPORT is not set @@ -743,6 +768,7 @@ CONFIG_SSB_POSSIBLE=y # Graphics support # CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=16 # CONFIG_VGASTATE is not set # CONFIG_VIDEO_OUTPUT_CONTROL is not set # CONFIG_FB is not set @@ -808,6 +834,14 @@ CONFIG_RTC_INTF_DEV=y # # SPI RTC drivers # +# CONFIG_RTC_DRV_M41T94 is not set +# CONFIG_RTC_DRV_DS1305 is not set +# CONFIG_RTC_DRV_DS1390 is not set +# CONFIG_RTC_DRV_MAX6902 is not set +# CONFIG_RTC_DRV_R9701 is not set +# CONFIG_RTC_DRV_RS5C348 is not set +# CONFIG_RTC_DRV_DS3234 is not set +# CONFIG_RTC_DRV_PCF2123 is not set # # Platform RTC drivers @@ -1180,9 +1214,11 @@ CONFIG_CRC32=y CONFIG_LIBCRC32C=m CONFIG_ZLIB_INFLATE=y CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_DECOMPRESS=y CONFIG_DECOMPRESS_GZIP=y CONFIG_DECOMPRESS_BZIP2=y CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_LZO=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig index 41c5a56..56e3163 100644 --- a/arch/sparc/configs/sparc64_defconfig +++ b/arch/sparc/configs/sparc64_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.33-rc2 -# Wed Jan 20 16:31:47 2010 +# Linux kernel version: 2.6.33 +# Wed Mar 3 02:54:29 2010 # CONFIG_64BIT=y CONFIG_SPARC=y @@ -55,14 +55,10 @@ CONFIG_TREE_RCU=y # CONFIG_RCU_TRACE is not set CONFIG_RCU_FANOUT=64 # CONFIG_RCU_FANOUT_EXACT is not set +# CONFIG_RCU_FAST_NO_HZ is not set # CONFIG_TREE_RCU_TRACE is not set # CONFIG_IKCONFIG is not set CONFIG_LOG_BUF_SHIFT=18 -CONFIG_GROUP_SCHED=y -CONFIG_FAIR_GROUP_SCHED=y -CONFIG_RT_GROUP_SCHED=y -CONFIG_USER_SCHED=y -# CONFIG_CGROUP_SCHED is not set # CONFIG_CGROUPS is not set # CONFIG_SYSFS_DEPRECATED_V2 is not set CONFIG_RELAY=y @@ -77,6 +73,7 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y CONFIG_RD_BZIP2=y CONFIG_RD_LZMA=y +CONFIG_RD_LZO=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -105,7 +102,6 @@ CONFIG_PERF_USE_VMALLOC=y # Kernel Performance Events And Counters # CONFIG_PERF_EVENTS=y -CONFIG_EVENT_PROFILE=y CONFIG_PERF_COUNTERS=y # CONFIG_DEBUG_PERF_USE_VMALLOC is not set CONFIG_VM_EVENT_COUNTERS=y @@ -266,7 +262,6 @@ CONFIG_PCI_DOMAINS=y CONFIG_PCI_SYSCALL=y CONFIG_ARCH_SUPPORTS_MSI=y CONFIG_PCI_MSI=y -# CONFIG_PCI_LEGACY is not set # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set # CONFIG_PCI_IOV is not set @@ -290,7 +285,6 @@ CONFIG_NET=y # Networking options # CONFIG_PACKET=y -CONFIG_PACKET_MMAP=y CONFIG_UNIX=y CONFIG_XFRM=y CONFIG_XFRM_USER=m @@ -425,10 +419,6 @@ CONFIG_BLK_DEV=y # CONFIG_BLK_DEV_COW_COMMON is not set CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m - -# -# DRBD disabled because PROC_FS, INET or CONNECTOR not selected -# # CONFIG_BLK_DEV_DRBD is not set CONFIG_BLK_DEV_NBD=m # CONFIG_BLK_DEV_SX8 is not set @@ -677,6 +667,7 @@ CONFIG_SUNGEM=m CONFIG_SUNVNET=m # CONFIG_NET_VENDOR_3COM is not set # CONFIG_ETHOC is not set +# CONFIG_GRETH is not set # CONFIG_DNET is not set # CONFIG_NET_TULIP is not set # CONFIG_HP100 is not set @@ -691,6 +682,7 @@ CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set # CONFIG_AMD8111_ETH is not set # CONFIG_ADAPTEC_STARFIRE is not set +# CONFIG_KSZ884X_PCI is not set # CONFIG_B44 is not set # CONFIG_FORCEDETH is not set # CONFIG_E100 is not set @@ -741,6 +733,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_CHELSIO_T3 is not set # CONFIG_ENIC is not set # CONFIG_IXGBE is not set +# CONFIG_IXGBEVF is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set # CONFIG_VXGE is not set @@ -751,6 +744,7 @@ CONFIG_NIU=m # CONFIG_MLX4_CORE is not set # CONFIG_TEHUTI is not set # CONFIG_BNX2X is not set +# CONFIG_QLCNIC is not set # CONFIG_QLGE is not set # CONFIG_SFC is not set # CONFIG_BE2NET is not set @@ -1028,6 +1022,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_SMSC47M192 is not set # CONFIG_SENSORS_SMSC47B397 is not set # CONFIG_SENSORS_ADS7828 is not set +# CONFIG_SENSORS_AMC6821 is not set # CONFIG_SENSORS_THMC50 is not set # CONFIG_SENSORS_TMP401 is not set # CONFIG_SENSORS_TMP421 is not set @@ -1076,6 +1071,7 @@ CONFIG_SSB_POSSIBLE=y # Graphics support # CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=16 # CONFIG_DRM is not set # CONFIG_VGASTATE is not set # CONFIG_VIDEO_OUTPUT_CONTROL is not set @@ -1279,6 +1275,7 @@ CONFIG_SND_ALI5451=m # CONFIG_SND_YMFPCI is not set CONFIG_SND_USB=y # CONFIG_SND_USB_AUDIO is not set +# CONFIG_SND_USB_UA101 is not set # CONFIG_SND_USB_CAIAQ is not set CONFIG_SND_SPARC=y # CONFIG_SND_SUN_AMD7930 is not set @@ -1301,6 +1298,7 @@ CONFIG_USB_HIDDEV=y # # Special HID drivers # +# CONFIG_HID_3M_PCT is not set CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y @@ -1317,14 +1315,19 @@ CONFIG_HID_KENSINGTON=y CONFIG_HID_LOGITECH=y # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set +# CONFIG_LOGIG940_FF is not set CONFIG_HID_MICROSOFT=y +# CONFIG_HID_MOSART is not set CONFIG_HID_MONTEREY=y CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y CONFIG_HID_PANTHERLORD=y # CONFIG_PANTHERLORD_FF is not set CONFIG_HID_PETALYNX=y +# CONFIG_HID_QUANTA is not set CONFIG_HID_SAMSUNG=y CONFIG_HID_SONY=y +# CONFIG_HID_STANTUM is not set CONFIG_HID_SUNPLUS=y CONFIG_HID_GREENASIA=y # CONFIG_GREENASIA_FF is not set @@ -1807,6 +1810,7 @@ CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER2=y CONFIG_CRYPTO_GF128MUL=m CONFIG_CRYPTO_NULL=m +# CONFIG_CRYPTO_PCRYPT is not set CONFIG_CRYPTO_WORKQUEUE=y # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_AUTHENC=y @@ -1904,9 +1908,11 @@ CONFIG_CRC32=y CONFIG_LIBCRC32C=m CONFIG_ZLIB_INFLATE=y CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_DECOMPRESS=y CONFIG_DECOMPRESS_GZIP=y CONFIG_DECOMPRESS_BZIP2=y CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_LZO=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y diff --git a/arch/sparc/include/asm/io_32.h b/arch/sparc/include/asm/io_32.h index 679c750..2889574 100644 --- a/arch/sparc/include/asm/io_32.h +++ b/arch/sparc/include/asm/io_32.h @@ -249,10 +249,14 @@ extern void iounmap(volatile void __iomem *addr); #define ioread8(X) readb(X) #define ioread16(X) readw(X) +#define ioread16be(X) __raw_readw(X) #define ioread32(X) readl(X) +#define ioread32be(X) __raw_readl(X) #define iowrite8(val,X) writeb(val,X) #define iowrite16(val,X) writew(val,X) +#define iowrite16be(val,X) __raw_writew(val,X) #define iowrite32(val,X) writel(val,X) +#define iowrite32be(val,X) __raw_writel(val,X) static inline void ioread8_rep(void __iomem *port, void *buf, unsigned long count) { diff --git a/arch/sparc/include/asm/io_64.h b/arch/sparc/include/asm/io_64.h index 4aee21d..9517d06 100644 --- a/arch/sparc/include/asm/io_64.h +++ b/arch/sparc/include/asm/io_64.h @@ -468,10 +468,14 @@ static inline void iounmap(volatile void __iomem *addr) #define ioread8(X) readb(X) #define ioread16(X) readw(X) +#define ioread16be(X) __raw_readw(X) #define ioread32(X) readl(X) +#define ioread32be(X) __raw_readl(X) #define iowrite8(val,X) writeb(val,X) #define iowrite16(val,X) writew(val,X) +#define iowrite16be(val,X) __raw_writew(val,X) #define iowrite32(val,X) writel(val,X) +#define iowrite32be(val,X) __raw_writel(val,X) /* Create a virtual mapping cookie for an IO port range */ extern void __iomem *ioport_map(unsigned long port, unsigned int nr); diff --git a/arch/sparc/include/asm/perfctr.h b/arch/sparc/include/asm/perfctr.h index 8368730..8d8720a 100644 --- a/arch/sparc/include/asm/perfctr.h +++ b/arch/sparc/include/asm/perfctr.h @@ -10,8 +10,8 @@ * from enumeration below. The meaning of further arguments * are determined by the operation code. * - * int sys_perfctr(int opcode, unsigned long arg0, - * unsigned long arg1, unsigned long arg2) + * NOTE: This system call is no longer provided, use the perf_events + * infrastructure. * * Pointers which are passed by the user are pointers to 64-bit * integers. diff --git a/arch/sparc/include/asm/system_64.h b/arch/sparc/include/asm/system_64.h index d47a98e..d24cfe1 100644 --- a/arch/sparc/include/asm/system_64.h +++ b/arch/sparc/include/asm/system_64.h @@ -143,15 +143,7 @@ do { \ * and 2 stores in this critical code path. -DaveM */ #define switch_to(prev, next, last) \ -do { if (test_thread_flag(TIF_PERFCTR)) { \ - unsigned long __tmp; \ - read_pcr(__tmp); \ - current_thread_info()->pcr_reg = __tmp; \ - read_pic(__tmp); \ - current_thread_info()->kernel_cntd0 += (unsigned int)(__tmp);\ - current_thread_info()->kernel_cntd1 += ((__tmp) >> 32); \ - } \ - flush_tlb_pending(); \ +do { flush_tlb_pending(); \ save_and_clear_fpu(); \ /* If you are tempted to conditionalize the following */ \ /* so that ASI is only written if it changes, think again. */ \ @@ -197,11 +189,6 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \ "l1", "l2", "l3", "l4", "l5", "l6", "l7", \ "i0", "i1", "i2", "i3", "i4", "i5", \ "o0", "o1", "o2", "o3", "o4", "o5", "o7"); \ - /* If you fuck with this, update ret_from_syscall code too. */ \ - if (test_thread_flag(TIF_PERFCTR)) { \ - write_pcr(current_thread_info()->pcr_reg); \ - reset_pic(); \ - } \ } while(0) static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val) diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 39be9f2..9e2d944 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -58,11 +58,6 @@ struct thread_info { unsigned long gsr[7]; unsigned long xfsr[7]; - __u64 __user *user_cntd0; - __u64 __user *user_cntd1; - __u64 kernel_cntd0, kernel_cntd1; - __u64 pcr_reg; - struct restart_block restart_block; struct pt_regs *kern_una_regs; @@ -96,15 +91,10 @@ struct thread_info { #define TI_RWIN_SPTRS 0x000003c8 #define TI_GSR 0x00000400 #define TI_XFSR 0x00000438 -#define TI_USER_CNTD0 0x00000470 -#define TI_USER_CNTD1 0x00000478 -#define TI_KERN_CNTD0 0x00000480 -#define TI_KERN_CNTD1 0x00000488 -#define TI_PCR 0x00000490 -#define TI_RESTART_BLOCK 0x00000498 -#define TI_KUNA_REGS 0x000004c8 -#define TI_KUNA_INSN 0x000004d0 -#define TI_FPREGS 0x00000500 +#define TI_RESTART_BLOCK 0x00000470 +#define TI_KUNA_REGS 0x000004a0 +#define TI_KUNA_INSN 0x000004a8 +#define TI_FPREGS 0x000004c0 /* We embed this in the uppermost byte of thread_info->flags */ #define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */ @@ -199,7 +189,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); * * On trap return we need to test several values: * - * user: need_resched, notify_resume, sigpending, wsaved, perfctr + * user: need_resched, notify_resume, sigpending, wsaved * kernel: fpdepth * * So to check for work in the kernel case we simply load the fpdepth @@ -220,7 +210,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_PERFCTR 4 /* performance counters active */ +/* flag bit 4 is available */ #define TIF_UNALIGNED 5 /* allowed to do unaligned accesses */ /* flag bit 6 is available */ #define TIF_32BIT 7 /* 32-bit binary */ @@ -241,7 +231,6 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) -#define _TIF_PERFCTR (1<<TIF_PERFCTR) #define _TIF_UNALIGNED (1<<TIF_UNALIGNED) #define _TIF_32BIT (1<<TIF_32BIT) #define _TIF_SECCOMP (1<<TIF_SECCOMP) @@ -252,7 +241,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_USER_WORK_MASK ((0xff << TI_FLAG_WSAVED_SHIFT) | \ _TIF_DO_NOTIFY_RESUME_MASK | \ - _TIF_NEED_RESCHED | _TIF_PERFCTR) + _TIF_NEED_RESCHED) #define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING) /* diff --git a/arch/sparc/kernel/entry.h b/arch/sparc/kernel/entry.h index 4f53a23..c011b93 100644 --- a/arch/sparc/kernel/entry.h +++ b/arch/sparc/kernel/entry.h @@ -48,7 +48,6 @@ extern void __init boot_cpu_id_too_large(int cpu); extern unsigned int dcache_parity_tl1_occurred; extern unsigned int icache_parity_tl1_occurred; -extern asmlinkage void update_perfctrs(void); extern asmlinkage void sparc_breakpoint(struct pt_regs *regs); extern void timer_interrupt(int irq, struct pt_regs *regs); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index cb70476..a5cf386 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -352,12 +352,6 @@ void exit_thread(void) else t->utraps[0]--; } - - if (test_and_clear_thread_flag(TIF_PERFCTR)) { - t->user_cntd0 = t->user_cntd1 = NULL; - t->pcr_reg = 0; - write_pcr(0); - } } void flush_thread(void) @@ -371,13 +365,6 @@ void flush_thread(void) set_thread_wsaved(0); - /* Turn off performance counters if on. */ - if (test_and_clear_thread_flag(TIF_PERFCTR)) { - t->user_cntd0 = t->user_cntd1 = NULL; - t->pcr_reg = 0; - write_pcr(0); - } - /* Clear FPU register state. */ t->fpsaved[0] = 0; @@ -591,16 +578,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, t->kregs->u_regs[UREG_FP] = ((unsigned long) child_sf) - STACK_BIAS; - /* Special case, if we are spawning a kernel thread from - * a userspace task (usermode helper, NFS or similar), we - * must disable performance counters in the child because - * the address space and protection realm are changing. - */ - if (t->flags & _TIF_PERFCTR) { - t->user_cntd0 = t->user_cntd1 = NULL; - t->pcr_reg = 0; - t->flags &= ~_TIF_PERFCTR; - } t->flags |= ((long)ASI_P << TI_FLAG_CURRENT_DS_SHIFT); t->kregs->u_regs[UREG_G6] = (unsigned long) t; t->kregs->u_regs[UREG_G4] = (unsigned long) t->task; diff --git a/arch/sparc/kernel/rtrap_64.S b/arch/sparc/kernel/rtrap_64.S index 1ddec40..83f1873 100644 --- a/arch/sparc/kernel/rtrap_64.S +++ b/arch/sparc/kernel/rtrap_64.S @@ -65,48 +65,6 @@ __handle_user_windows: ba,pt %xcc, __handle_user_windows_continue andn %l1, %l4, %l1 -__handle_perfctrs: - call update_perfctrs - wrpr %g0, RTRAP_PSTATE, %pstate - wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate - ldub [%g6 + TI_WSAVED], %o2 - brz,pt %o2, 1f - nop - /* Redo userwin+sched+sig checks */ - call fault_in_user_windows - - wrpr %g0, RTRAP_PSTATE, %pstate - wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate - ldx [%g6 + TI_FLAGS], %l0 - andcc %l0, _TIF_NEED_RESCHED, %g0 - be,pt %xcc, 1f - - nop - call schedule - wrpr %g0, RTRAP_PSTATE, %pstate - wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate - ldx [%g6 + TI_FLAGS], %l0 -1: andcc %l0, _TIF_DO_NOTIFY_RESUME_MASK, %g0 - - be,pt %xcc, __handle_perfctrs_continue - sethi %hi(TSTATE_PEF), %o0 - mov %l5, %o1 - add %sp, PTREGS_OFF, %o0 - mov %l0, %o2 - call do_notify_resume - - wrpr %g0, RTRAP_PSTATE, %pstate - wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate - /* Signal delivery can modify pt_regs tstate, so we must - * reload it. - */ - ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1 - sethi %hi(0xf << 20), %l4 - and %l1, %l4, %l4 - andn %l1, %l4, %l1 - ba,pt %xcc, __handle_perfctrs_continue - - sethi %hi(TSTATE_PEF), %o0 __handle_userfpu: rd %fprs, %l5 andcc %l5, FPRS_FEF, %g0 @@ -191,9 +149,9 @@ rtrap_no_irq_enable: * take until the next local IRQ before the signal/resched * event would be handled. * - * This also means that if we have to deal with performance - * counters or user windows, we have to redo all of these - * sched+signal checks with IRQs disabled. + * This also means that if we have to deal with user + * windows, we have to redo all of these sched+signal checks + * with IRQs disabled. */ to_user: wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate wrpr 0, %pil @@ -214,12 +172,8 @@ __handle_signal_continue: brnz,pn %o2, __handle_user_windows nop __handle_user_windows_continue: - ldx [%g6 + TI_FLAGS], %l5 - andcc %l5, _TIF_PERFCTR, %g0 sethi %hi(TSTATE_PEF), %o0 - bne,pn %xcc, __handle_perfctrs -__handle_perfctrs_continue: - andcc %l1, %o0, %g0 + andcc %l1, %o0, %g0 /* This fpdepth clear is necessary for non-syscall rtraps only */ user_nowork: diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S index e706113..46a76ba 100644 --- a/arch/sparc/kernel/sys32.S +++ b/arch/sparc/kernel/sys32.S @@ -51,7 +51,6 @@ SIGN1(sys32_exit_group, sys_exit_group, %o0) SIGN1(sys32_wait4, compat_sys_wait4, %o2) SIGN1(sys32_creat, sys_creat, %o1) SIGN1(sys32_mknod, sys_mknod, %o1) -SIGN1(sys32_perfctr, sys_perfctr, %o0) SIGN1(sys32_umount, sys_umount, %o1) SIGN1(sys32_signal, sys_signal, %o0) SIGN1(sys32_access, sys_access, %o1) diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index d77f543..cb1bef6 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -27,7 +27,6 @@ #include <asm/uaccess.h> #include <asm/utrap.h> -#include <asm/perfctr.h> #include <asm/unistd.h> #include "entry.h" @@ -766,109 +765,6 @@ SYSCALL_DEFINE5(rt_sigaction, int, sig, const struct sigaction __user *, act, return ret; } -/* Invoked by rtrap code to update performance counters in - * user space. - */ -asmlinkage void update_perfctrs(void) -{ - unsigned long pic, tmp; - - read_pic(pic); - tmp = (current_thread_info()->kernel_cntd0 += (unsigned int)pic); - __put_user(tmp, current_thread_info()->user_cntd0); - tmp = (current_thread_info()->kernel_cntd1 += (pic >> 32)); - __put_user(tmp, current_thread_info()->user_cntd1); - reset_pic(); -} - -SYSCALL_DEFINE4(perfctr, int, opcode, unsigned long, arg0, - unsigned long, arg1, unsigned long, arg2) -{ - int err = 0; - - switch(opcode) { - case PERFCTR_ON: - current_thread_info()->pcr_reg = arg2; - current_thread_info()->user_cntd0 = (u64 __user *) arg0; - current_thread_info()->user_cntd1 = (u64 __user *) arg1; - current_thread_info()->kernel_cntd0 = - current_thread_info()->kernel_cntd1 = 0; - write_pcr(arg2); - reset_pic(); - set_thread_flag(TIF_PERFCTR); - break; - - case PERFCTR_OFF: - err = -EINVAL; - if (test_thread_flag(TIF_PERFCTR)) { - current_thread_info()->user_cntd0 = - current_thread_info()->user_cntd1 = NULL; - current_thread_info()->pcr_reg = 0; - write_pcr(0); - clear_thread_flag(TIF_PERFCTR); - err = 0; - } - break; - - case PERFCTR_READ: { - unsigned long pic, tmp; - - if (!test_thread_flag(TIF_PERFCTR)) { - err = -EINVAL; - break; - } - read_pic(pic); - tmp = (current_thread_info()->kernel_cntd0 += (unsigned int)pic); - err |= __put_user(tmp, current_thread_info()->user_cntd0); - tmp = (current_thread_info()->kernel_cntd1 += (pic >> 32)); - err |= __put_user(tmp, current_thread_info()->user_cntd1); - reset_pic(); - break; - } - - case PERFCTR_CLRPIC: - if (!test_thread_flag(TIF_PERFCTR)) { - err = -EINVAL; - break; - } - current_thread_info()->kernel_cntd0 = - current_thread_info()->kernel_cntd1 = 0; - reset_pic(); - break; - - case PERFCTR_SETPCR: { - u64 __user *user_pcr = (u64 __user *)arg0; - - if (!test_thread_flag(TIF_PERFCTR)) { - err = -EINVAL; - break; - } - err |= __get_user(current_thread_info()->pcr_reg, user_pcr); - write_pcr(current_thread_info()->pcr_reg); - current_thread_info()->kernel_cntd0 = - current_thread_info()->kernel_cntd1 = 0; - reset_pic(); - break; - } - - case PERFCTR_GETPCR: { - u64 __user *user_pcr = (u64 __user *)arg0; - - if (!test_thread_flag(TIF_PERFCTR)) { - err = -EINVAL; - break; - } - err |= __put_user(current_thread_info()->pcr_reg, user_pcr); - break; - } - - default: - err = -EINVAL; - break; - }; - return err; -} - /* * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S index dc4a458..1d7e274 100644 --- a/arch/sparc/kernel/syscalls.S +++ b/arch/sparc/kernel/syscalls.S @@ -110,31 +110,12 @@ sys_clone: .globl ret_from_syscall ret_from_syscall: - /* Clear current_thread_info()->new_child, and - * check performance counter stuff too. - */ + /* Clear current_thread_info()->new_child. */ stb %g0, [%g6 + TI_NEW_CHILD] ldx [%g6 + TI_FLAGS], %l0 call schedule_tail mov %g7, %o0 - andcc %l0, _TIF_PERFCTR, %g0 - be,pt %icc, 1f - nop - ldx [%g6 + TI_PCR], %o7 - wr %g0, %o7, %pcr - - /* Blackbird errata workaround. See commentary in - * smp.c:smp_percpu_timer_interrupt() for more - * information. - */ - ba,pt %xcc, 99f - nop - - .align 64 -99: wr %g0, %g0, %pic - rd %pic, %g0 - -1: ba,pt %xcc, ret_sys_call + ba,pt %xcc, ret_sys_call ldx [%sp + PTREGS_OFF + PT_V9_I0], %o0 .globl sparc_exit diff --git a/arch/sparc/kernel/systbls.h b/arch/sparc/kernel/systbls.h index d2f999a..68312fe 100644 --- a/arch/sparc/kernel/systbls.h +++ b/arch/sparc/kernel/systbls.h @@ -36,8 +36,6 @@ extern asmlinkage long sys_rt_sigaction(int sig, struct sigaction __user *oact, void __user *restorer, size_t sigsetsize); -extern asmlinkage long sys_perfctr(int opcode, unsigned long arg0, - unsigned long arg1, unsigned long arg2); extern asmlinkage void sparc64_set_context(struct pt_regs *regs); extern asmlinkage void sparc64_get_context(struct pt_regs *regs); diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index e575b46..1761425 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -21,7 +21,7 @@ sys_call_table32: /*0*/ .word sys_restart_syscall, sys32_exit, sys_fork, sys_read, sys_write /*5*/ .word sys32_open, sys_close, sys32_wait4, sys32_creat, sys_link /*10*/ .word sys_unlink, sunos_execv, sys_chdir, sys_chown16, sys32_mknod -/*15*/ .word sys_chmod, sys_lchown16, sys_brk, sys32_perfctr, sys32_lseek +/*15*/ .word sys_chmod, sys_lchown16, sys_brk, sys_nis_syscall, sys32_lseek /*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid16, sys_getuid16 /*25*/ .word sys32_vmsplice, compat_sys_ptrace, sys_alarm, sys32_sigaltstack, sys_pause /*30*/ .word compat_sys_utime, sys_lchown, sys_fchown, sys32_access, sys32_nice @@ -96,7 +96,7 @@ sys_call_table: /*0*/ .word sys_restart_syscall, sparc_exit, sys_fork, sys_read, sys_write /*5*/ .word sys_open, sys_close, sys_wait4, sys_creat, sys_link /*10*/ .word sys_unlink, sys_nis_syscall, sys_chdir, sys_chown, sys_mknod -/*15*/ .word sys_chmod, sys_lchown, sys_brk, sys_perfctr, sys_lseek +/*15*/ .word sys_chmod, sys_lchown, sys_brk, sys_nis_syscall, sys_lseek /*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid, sys_getuid /*25*/ .word sys_vmsplice, sys_ptrace, sys_alarm, sys_sigaltstack, sys_nis_syscall /*30*/ .word sys_utime, sys_nis_syscall, sys_nis_syscall, sys_access, sys_nice diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 10f7bb9..bdc05a2 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2548,15 +2548,6 @@ void __init trap_init(void) rwbuf_stkptrs) || TI_GSR != offsetof(struct thread_info, gsr) || TI_XFSR != offsetof(struct thread_info, xfsr) || - TI_USER_CNTD0 != offsetof(struct thread_info, - user_cntd0) || - TI_USER_CNTD1 != offsetof(struct thread_info, - user_cntd1) || - TI_KERN_CNTD0 != offsetof(struct thread_info, - kernel_cntd0) || - TI_KERN_CNTD1 != offsetof(struct thread_info, - kernel_cntd1) || - TI_PCR != offsetof(struct thread_info, pcr_reg) || TI_PRE_COUNT != offsetof(struct thread_info, preempt_count) || TI_NEW_CHILD != offsetof(struct thread_info, new_child) || diff --git a/arch/sparc/prom/p1275.c b/arch/sparc/prom/p1275.c index 4b7c937..2d8b70d 100644 --- a/arch/sparc/prom/p1275.c +++ b/arch/sparc/prom/p1275.c @@ -32,10 +32,9 @@ extern void prom_cif_interface(void); extern void prom_cif_callback(void); /* - * This provides SMP safety on the p1275buf. prom_callback() drops this lock - * to allow recursuve acquisition. + * This provides SMP safety on the p1275buf. */ -DEFINE_SPINLOCK(prom_entry_lock); +DEFINE_RAW_SPINLOCK(prom_entry_lock); long p1275_cmd(const char *service, long fmt, ...) { @@ -47,7 +46,9 @@ long p1275_cmd(const char *service, long fmt, ...) p = p1275buf.prom_buffer; - spin_lock_irqsave(&prom_entry_lock, flags); + raw_local_save_flags(flags); + raw_local_irq_restore(PIL_NMI); + raw_spin_lock(&prom_entry_lock); p1275buf.prom_args[0] = (unsigned long)p; /* service */ strcpy (p, service); @@ -139,7 +140,8 @@ long p1275_cmd(const char *service, long fmt, ...) va_end(list); x = p1275buf.prom_args [nargs + 3]; - spin_unlock_irqrestore(&prom_entry_lock, flags); + raw_spin_unlock(&prom_entry_lock); + raw_local_irq_restore(flags); return x; } diff --git a/arch/um/.gitignore b/arch/um/.gitignore new file mode 100644 index 0000000..a73d3a1 --- /dev/null +++ b/arch/um/.gitignore @@ -0,0 +1,3 @@ +kernel/config.c +kernel/config.tmp +kernel/vmlinux.lds diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index cf8a97f..64cda95 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -18,10 +18,10 @@ static irqreturn_t line_interrupt(int irq, void *data) { struct chan *chan = data; struct line *line = chan->line; - struct tty_struct *tty = line->tty; + struct tty_struct *tty; if (line) - chan_interrupt(&line->chan_list, &line->task, tty, irq); + chan_interrupt(&line->chan_list, &line->task, line->tty, irq); return IRQ_HANDLED; } diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 3b3c366..de317d0 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -140,7 +140,7 @@ void mconsole_proc(struct mc_request *req) goto out; } - err = may_open(&nd.path, MAY_READ, FMODE_READ); + err = may_open(&nd.path, MAY_READ, O_RDONLY); if (result) { mconsole_reply(req, "Failed to open file", 1, 0); path_put(&nd.path); diff --git a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile index 1b549bc..804b28d 100644 --- a/arch/um/sys-i386/Makefile +++ b/arch/um/sys-i386/Makefile @@ -6,6 +6,8 @@ obj-y = bug.o bugs.o checksum.o delay.o fault.o ksyms.o ldt.o ptrace.o \ ptrace_user.o setjmp.o signal.o stub.o stub_segv.o syscalls.o sysrq.o \ sys_call_table.o tls.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o + subarch-obj-y = lib/semaphore_32.o lib/string_32.o subarch-obj-$(CONFIG_HIGHMEM) += mm/highmem_32.o subarch-obj-$(CONFIG_MODULES) += kernel/module.o diff --git a/arch/um/sys-i386/asm/elf.h b/arch/um/sys-i386/asm/elf.h index 7708854..e64cd41 100644 --- a/arch/um/sys-i386/asm/elf.h +++ b/arch/um/sys-i386/asm/elf.h @@ -116,47 +116,4 @@ do { \ } \ } while (0) -/* - * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out - * extra segments containing the vsyscall DSO contents. Dumping its - * contents makes post-mortem fully interpretable later without matching up - * the same kernel and hardware config to see what PC values meant. - * Dumping its extra ELF program headers includes all the other information - * a debugger needs to easily find how the vsyscall DSO was being used. - */ -#define ELF_CORE_EXTRA_PHDRS \ - (vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0 ) - -#define ELF_CORE_WRITE_EXTRA_PHDRS \ -if ( vsyscall_ehdr ) { \ - const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \ - const struct elf_phdr *const phdrp = \ - (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \ - int i; \ - Elf32_Off ofs = 0; \ - for (i = 0; i < ehdrp->e_phnum; ++i) { \ - struct elf_phdr phdr = phdrp[i]; \ - if (phdr.p_type == PT_LOAD) { \ - ofs = phdr.p_offset = offset; \ - offset += phdr.p_filesz; \ - } \ - else \ - phdr.p_offset += ofs; \ - phdr.p_paddr = 0; /* match other core phdrs */ \ - DUMP_WRITE(&phdr, sizeof(phdr)); \ - } \ -} -#define ELF_CORE_WRITE_EXTRA_DATA \ -if ( vsyscall_ehdr ) { \ - const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \ - const struct elf_phdr *const phdrp = \ - (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \ - int i; \ - for (i = 0; i < ehdrp->e_phnum; ++i) { \ - if (phdrp[i].p_type == PT_LOAD) \ - DUMP_WRITE((void *) phdrp[i].p_vaddr, \ - phdrp[i].p_filesz); \ - } \ -} - #endif diff --git a/arch/um/sys-i386/elfcore.c b/arch/um/sys-i386/elfcore.c new file mode 100644 index 0000000..6bb49b6 --- /dev/null +++ b/arch/um/sys-i386/elfcore.c @@ -0,0 +1,83 @@ +#include <linux/elf.h> +#include <linux/coredump.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/elf.h> + + +Elf32_Half elf_core_extra_phdrs(void) +{ + return vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0; +} + +int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *) vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + Elf32_Off ofs = 0; + + for (i = 0; i < ehdrp->e_phnum; ++i) { + struct elf_phdr phdr = phdrp[i]; + + if (phdr.p_type == PT_LOAD) { + ofs = phdr.p_offset = offset; + offset += phdr.p_filesz; + } else { + phdr.p_offset += ofs; + } + phdr.p_paddr = 0; /* match other core phdrs */ + *size += sizeof(phdr); + if (*size > limit + || !dump_write(file, &phdr, sizeof(phdr))) + return 0; + } + } + return 1; +} + +int elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *) vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + + for (i = 0; i < ehdrp->e_phnum; ++i) { + if (phdrp[i].p_type == PT_LOAD) { + void *addr = (void *) phdrp[i].p_vaddr; + size_t filesz = phdrp[i].p_filesz; + + *size += filesz; + if (*size > limit + || !dump_write(file, addr, filesz)) + return 0; + } + } + } + return 1; +} + +size_t elf_core_extra_data_size(void) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *)vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + + for (i = 0; i < ehdrp->e_phnum; ++i) + if (phdrp[i].p_type == PT_LOAD) + return (size_t) phdrp[i].p_filesz; + } + return 0; +} diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 57ccdce..e984403 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86 select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS select HAVE_KRETPROBES + select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER @@ -392,8 +393,12 @@ config X86_ELAN config X86_MRST bool "Moorestown MID platform" + depends on PCI + depends on PCI_GOANY depends on X86_32 depends on X86_EXTENDED_PLATFORM + depends on X86_IO_APIC + select APB_TIMER ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. Moorestown consists of two chips: @@ -428,6 +433,7 @@ config X86_32_NON_STANDARD config X86_NUMAQ bool "NUMAQ (IBM/Sequent)" depends on X86_32_NON_STANDARD + depends on PCI select NUMA select X86_MPPARSE ---help--- @@ -628,6 +634,16 @@ config HPET_EMULATE_RTC def_bool y depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) +config APB_TIMER + def_bool y if MRST + prompt "Langwell APB Timer Support" if X86_MRST + help + APB timer is the replacement for 8254, HPET on X86 MID platforms. + The APBT provides a stable time base on SMP + systems, unlike the TSC, but it is more expensive to access, + as it is off-chip. APB timers are always running regardless of CPU + C states, they are used as per CPU clockevent device when possible. + # Mark as embedded because too many people got it wrong. # The code disables itself when not needed. config DMI diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 9f828f8..493092e 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -11,6 +11,7 @@ header-y += sigcontext32.h header-y += ucontext.h header-y += processor-flags.h header-y += hw_breakpoint.h +header-y += hyperv.h unifdef-y += e820.h unifdef-y += ist.h diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index f1e253c..b09ec55 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -165,10 +165,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. * More care must be taken when modifying code in the SMP case because of - * Intel's errata. + * Intel's errata. text_poke_smp() takes care that errata, but still + * doesn't support NMI/MCE handler code modifying. * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_smp(void *addr, const void *opcode, size_t len); #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h new file mode 100644 index 0000000..c74a2ee --- /dev/null +++ b/arch/x86/include/asm/apb_timer.h @@ -0,0 +1,70 @@ +/* + * apb_timer.h: Driver for Langwell APB timer based on Synopsis DesignWare + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + */ + +#ifndef ASM_X86_APBT_H +#define ASM_X86_APBT_H +#include <linux/sfi.h> + +#ifdef CONFIG_APB_TIMER + +/* Langwell DW APB timer registers */ +#define APBTMR_N_LOAD_COUNT 0x00 +#define APBTMR_N_CURRENT_VALUE 0x04 +#define APBTMR_N_CONTROL 0x08 +#define APBTMR_N_EOI 0x0c +#define APBTMR_N_INT_STATUS 0x10 + +#define APBTMRS_INT_STATUS 0xa0 +#define APBTMRS_EOI 0xa4 +#define APBTMRS_RAW_INT_STATUS 0xa8 +#define APBTMRS_COMP_VERSION 0xac +#define APBTMRS_REG_SIZE 0x14 + +/* register bits */ +#define APBTMR_CONTROL_ENABLE (1<<0) +#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /*1: periodic 0:free running */ +#define APBTMR_CONTROL_INT (1<<2) + +/* default memory mapped register base */ +#define LNW_SCU_ADDR 0xFF100000 +#define LNW_EXT_TIMER_OFFSET 0x1B800 +#define APBT_DEFAULT_BASE (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET) +#define LNW_EXT_TIMER_PGOFFSET 0x800 + +/* APBT clock speed range from PCLK to fabric base, 25-100MHz */ +#define APBT_MAX_FREQ 50 +#define APBT_MIN_FREQ 1 +#define APBT_MMAP_SIZE 1024 + +#define APBT_DEV_USED 1 + +extern void apbt_time_init(void); +extern struct clock_event_device *global_clock_event; +extern unsigned long apbt_quick_calibrate(void); +extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); +extern void apbt_setup_secondary_clock(void); +extern unsigned int boot_cpu_id; +extern int disable_apbt_percpu; + +extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); +extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); +extern int sfi_mtimer_num; + +#else /* CONFIG_APB_TIMER */ + +static inline unsigned long apbt_quick_calibrate(void) {return 0; } +static inline void apbt_time_init(void) {return 0; } + +#endif +#endif /* ASM_X86_APBT_H */ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eeac829..a929c9e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -53,13 +53,6 @@ extern void threshold_interrupt(void); extern void call_function_interrupt(void); extern void call_function_single_interrupt(void); -/* PIC specific functions */ -extern void disable_8259A_irq(unsigned int irq); -extern void enable_8259A_irq(unsigned int irq); -extern int i8259A_irq_pending(unsigned int irq); -extern void make_8259A_irq(unsigned int irq); -extern void init_8259A(int aeoi); - /* IOAPIC */ #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) extern unsigned long io_apic_irqs; diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h new file mode 100644 index 0000000..e153a2b --- /dev/null +++ b/arch/x86/include/asm/hyperv.h @@ -0,0 +1,186 @@ +#ifndef _ASM_X86_KVM_HYPERV_H +#define _ASM_X86_KVM_HYPERV_H + +#include <linux/types.h> + +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). + */ +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 + +/* + * Feature identification. EAX indicates which features are available + * to the partition based upon the current partition privileges. + */ + +/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) +/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ +#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* + * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM + * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available + */ +#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) +/* + * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through + * HV_X64_MSR_STIMER3_COUNT) available + */ +#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3) +/* + * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) + * are available + */ +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) +/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ +#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) +/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ +#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) +/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ +#define HV_X64_MSR_RESET_AVAILABLE (1 << 7) + /* + * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, + * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available + */ +#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) + +/* + * Feature identification: EBX indicates which flags were specified at + * partition creation. The format is the same as the partition creation + * flag structure defined in section Partition Creation Flags. + */ +#define HV_X64_CREATE_PARTITIONS (1 << 0) +#define HV_X64_ACCESS_PARTITION_ID (1 << 1) +#define HV_X64_ACCESS_MEMORY_POOL (1 << 2) +#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) +#define HV_X64_POST_MESSAGES (1 << 4) +#define HV_X64_SIGNAL_EVENTS (1 << 5) +#define HV_X64_CREATE_PORT (1 << 6) +#define HV_X64_CONNECT_PORT (1 << 7) +#define HV_X64_ACCESS_STATS (1 << 8) +#define HV_X64_DEBUGGING (1 << 11) +#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) +#define HV_X64_CONFIGURE_PROFILER (1 << 13) + +/* + * Feature identification. EDX indicates which miscellaneous features + * are available to the partition. + */ +/* The MWAIT instruction is available (per section MONITOR / MWAIT) */ +#define HV_X64_MWAIT_AVAILABLE (1 << 0) +/* Guest debugging support is available */ +#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) +/* Performance Monitor support is available*/ +#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) +/* Support for physical CPU dynamic partitioning events is available*/ +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) +/* + * Support for passing hypercall input parameter block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) +/* Support for a virtual guest idle state is available */ +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) + +/* + * Implementation recommendations. Indicates which behaviors the hypervisor + * recommends the OS implement for optimal performance. + */ + /* + * Recommend using hypercall for address space switches rather + * than MOV to CR3 instruction + */ +#define HV_X64_MWAIT_RECOMMENDED (1 << 0) +/* Recommend using hypercall for local TLB flushes rather + * than INVLPG or MOV to CR3 instructions */ +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) +/* + * Recommend using hypercall for remote TLB flushes rather + * than inter-processor interrupts + */ +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) +/* + * Recommend using MSRs for accessing APIC registers + * EOI, ICR and TPR rather than their memory-mapped counterparts + */ +#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) +/* Recommend using the hypervisor-provided MSR to initiate a system RESET */ +#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) +/* + * Recommend using relaxed timing for this partition. If used, + * the VM should disable any watchdog timeouts that rely on the + * timely delivery of external interrupts + */ +#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) + +/* MSR used to identify the guest OS. */ +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 + +/* MSR used to setup pages used to communicate with the hypervisor. */ +#define HV_X64_MSR_HYPERCALL 0x40000001 + +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX 0x40000002 + +/* Define the virtual APIC registers */ +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073 + +/* Define synthetic interrupt controller model specific registers. */ +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F + + +#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) + +/* Declare the various hypercall operations. */ +#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008 + +#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +#define HV_PROCESSOR_POWER_STATE_C0 0 +#define HV_PROCESSOR_POWER_STATE_C1 1 +#define HV_PROCESSOR_POWER_STATE_C2 2 +#define HV_PROCESSOR_POWER_STATE_C3 3 + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 + +#endif diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 7ec65b1..1655147 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -26,11 +26,6 @@ extern unsigned int cached_irq_mask; extern raw_spinlock_t i8259A_lock; -extern void init_8259A(int auto_eoi); -extern void enable_8259A_irq(unsigned int irq); -extern void disable_8259A_irq(unsigned int irq); -extern unsigned int startup_8259A_irq(unsigned int irq); - /* the PIC may need a careful delay on some platforms, hence specific calls */ static inline unsigned char inb_pic(unsigned int port) { @@ -57,7 +52,17 @@ static inline void outb_pic(unsigned char value, unsigned int port) extern struct irq_chip i8259A_chip; -extern void mask_8259A(void); -extern void unmask_8259A(void); +struct legacy_pic { + int nr_legacy_irqs; + struct irq_chip *chip; + void (*mask_all)(void); + void (*restore_mask)(void); + void (*init)(int auto_eoi); + int (*irq_pending)(unsigned int irq); + void (*make_irq)(unsigned int irq); +}; + +extern struct legacy_pic *legacy_pic; +extern struct legacy_pic null_legacy_pic; #endif /* _ASM_X86_I8259_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 5f61f6e..35832a0 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -143,8 +143,6 @@ extern int noioapicreroute; /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ extern int timer_through_8259; -extern void io_apic_disable_legacy(void); - /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. @@ -189,6 +187,7 @@ extern struct mp_ioapic_gsi mp_gsi_routing[]; int mp_find_ioapic(int gsi); int mp_find_ioapic_pin(int ioapic, int gsi); void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); +extern void __init pre_init_apic_IRQ0(void); #else /* !CONFIG_X86_IO_APIC */ @@ -198,7 +197,11 @@ static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } static inline void ioapic_insert_resources(void) { } static inline void probe_nr_irqs_gsi(void) { } +static inline int mp_find_ioapic(int gsi) { return 0; } +struct io_apic_irq_attr; +static inline int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { return 0; } #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 2622927..5458380 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -48,6 +48,5 @@ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); -extern int nr_legacy_irqs; #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 4fe681d..4ffa345 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -32,7 +32,10 @@ struct kprobe; typedef u8 kprobe_opcode_t; #define BREAKPOINT_INSTRUCTION 0xcc -#define RELATIVEJUMP_INSTRUCTION 0xe9 +#define RELATIVEJUMP_OPCODE 0xe9 +#define RELATIVEJUMP_SIZE 5 +#define RELATIVECALL_OPCODE 0xe8 +#define RELATIVE_ADDR_SIZE 4 #define MAX_INSN_SIZE 16 #define MAX_STACK_SIZE 64 #define MIN_STACK_SIZE(ADDR) \ @@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) +/* optinsn template addresses */ +extern kprobe_opcode_t optprobe_template_entry; +extern kprobe_opcode_t optprobe_template_val; +extern kprobe_opcode_t optprobe_template_call; +extern kprobe_opcode_t optprobe_template_end; +#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) +#define MAX_OPTINSN_SIZE \ + (((unsigned long)&optprobe_template_end - \ + (unsigned long)&optprobe_template_entry) + \ + MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE) + extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); @@ -64,6 +78,21 @@ struct arch_specific_insn { int boostable; }; +struct arch_optimized_insn { + /* copy of the original instructions */ + kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE]; + /* detour code buffer */ + kprobe_opcode_t *insn; + /* the size of instructions copied to detour code buffer */ + size_t size; +}; + +/* Return true (!0) if optinsn is prepared for optimization. */ +static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn) +{ + return optinsn->size; +} + struct prev_kprobe { struct kprobe *kp; unsigned long status; diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7c18e12..7a6f54f 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -54,13 +54,23 @@ struct x86_emulate_ctxt; struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. - * Used for instruction fetch, stack operations, and others. + * Used for descriptor reading. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); + + /* + * fetch: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*fetch)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); /* * read_emulated: Read bytes from emulated/special memory area. @@ -74,7 +84,7 @@ struct x86_emulate_ops { struct kvm_vcpu *vcpu); /* - * write_emulated: Read bytes from emulated/special memory area. + * write_emulated: Write bytes to emulated/special memory area. * @addr: [IN ] Linear address to which to write. * @val: [IN ] Value to write to memory (low-order bytes used as * required). @@ -168,6 +178,7 @@ struct x86_emulate_ctxt { /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f865e8..06d9e79 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -25,7 +25,7 @@ #include <asm/mtrr.h> #include <asm/msr-index.h> -#define KVM_MAX_VCPUS 16 +#define KVM_MAX_VCPUS 64 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 @@ -38,19 +38,6 @@ #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK \ - (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) @@ -256,7 +243,8 @@ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, + u32 *error); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -282,13 +270,15 @@ struct kvm_vcpu_arch { u32 regs_dirty; unsigned long cr0; + unsigned long cr0_guest_owned_bits; unsigned long cr2; unsigned long cr3; unsigned long cr4; + unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; u64 pdptrs[4]; /* pae */ - u64 shadow_efer; + u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ int32_t apic_arb_prio; @@ -374,17 +364,27 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ u16 singlestep_cs; unsigned long singlestep_rip; + /* fields used by HYPER-V emulation */ + u64 hv_vapic; }; struct kvm_mem_alias { gfn_t base_gfn; unsigned long npages; gfn_t target_gfn; +#define KVM_ALIAS_INVALID 1UL + unsigned long flags; }; -struct kvm_arch{ - int naliases; +#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION + +struct kvm_mem_aliases { struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; + int naliases; +}; + +struct kvm_arch { + struct kvm_mem_aliases *aliases; unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; @@ -416,6 +416,10 @@ struct kvm_arch{ s64 kvmclock_offset; struct kvm_xen_hvm_config xen_hvm_config; + + /* fields used by HYPER-V emulation */ + u64 hv_guest_os_id; + u64 hv_hypercall; }; struct kvm_vm_stat { @@ -471,6 +475,7 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); + void (*cpuid_update)(struct kvm_vcpu *vcpu); /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); @@ -492,6 +497,7 @@ struct kvm_x86_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -501,12 +507,13 @@ struct kvm_x86_ops { void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); - void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception); + int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); + int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_activate)(struct kvm_vcpu *vcpu); + void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -531,7 +538,8 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - bool (*gb_page_enable)(void); + int (*get_lpage_level)(void); + bool (*rdtscp_supported)(void); const struct trace_print_flags *exit_reasons_str; }; @@ -606,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg); +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); @@ -653,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -666,6 +677,7 @@ void kvm_disable_tdp(void); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); +bool kvm_check_iopl(struct kvm_vcpu *vcpu); struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c584076..ffae142 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -2,6 +2,7 @@ #define _ASM_X86_KVM_PARA_H #include <linux/types.h> +#include <asm/hyperv.h> /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h new file mode 100644 index 0000000..451d30e --- /dev/null +++ b/arch/x86/include/asm/mrst.h @@ -0,0 +1,19 @@ +/* + * mrst.h: Intel Moorestown platform specific setup code + * + * (C) Copyright 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _ASM_X86_MRST_H +#define _ASM_X86_MRST_H +extern int pci_mrst_init(void); +int __init sfi_parse_mrtc(struct sfi_table_header *table); + +#define SFI_MTMR_MAX_NUM 8 +#define SFI_MRTC_MAX 8 + +#endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 13370b9..37c5165 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -30,6 +30,7 @@ extern int found_numaq; extern int get_memcfg_numaq(void); +extern int pci_numaq_init(void); extern void *xquad_portio; diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 3a57385..101229b 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -13,7 +13,6 @@ struct olpc_platform_t { #define OLPC_F_PRESENT 0x01 #define OLPC_F_DCON 0x02 -#define OLPC_F_VSA 0x04 #ifdef CONFIG_OLPC @@ -51,18 +50,6 @@ static inline int olpc_has_dcon(void) } /* - * The VSA is software from AMD that typical Geode bioses will include. - * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does - * not include the VSA; instead, PCI is emulated by the kernel. - * - * The VSA is described further in arch/x86/pci/olpc.c. - */ -static inline int olpc_has_vsa(void) -{ - return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0; -} - -/* * The "Mass Production" version of OLPC's XO is identified as being model * C2. During the prototype phase, the following models (in chronological * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models @@ -87,13 +74,10 @@ static inline int olpc_has_dcon(void) return 0; } -static inline int olpc_has_vsa(void) -{ - return 0; -} - #endif +extern int pci_olpc_init(void); + /* EC related functions */ extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b4a00dd..3e002ca 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -45,8 +45,15 @@ static inline int pci_proc_domain(struct pci_bus *bus) #ifdef CONFIG_PCI extern unsigned int pcibios_assign_all_busses(void); +extern int pci_legacy_init(void); +# ifdef CONFIG_ACPI +# define x86_default_pci_init pci_acpi_init +# else +# define x86_default_pci_init pci_legacy_init +# endif #else -#define pcibios_assign_all_busses() 0 +# define pcibios_assign_all_busses() 0 +# define x86_default_pci_init NULL #endif extern unsigned long pci_mem_start; diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 05b58cc..1a04223 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -83,7 +83,6 @@ struct irq_routing_table { extern unsigned int pcibios_irq_mask; -extern int pcibios_scanned; extern spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); @@ -106,16 +105,15 @@ extern bool port_cf9_safe; extern int pci_direct_probe(void); extern void pci_direct_init(int type); extern void pci_pcbios_init(void); -extern int pci_olpc_init(void); extern void __init dmi_check_pciprobe(void); extern void __init dmi_check_skip_isa_align(void); /* some common used subsys_initcalls */ extern int __init pci_acpi_init(void); -extern int __init pcibios_irq_init(void); -extern int __init pci_visws_init(void); -extern int __init pci_numaq_init(void); +extern void __init pcibios_irq_init(void); extern int __init pcibios_init(void); +extern int pci_legacy_init(void); +extern void pcibios_fixup_irqs(void); /* pci-mmconfig.c */ @@ -183,3 +181,17 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val) { asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory"); } + +#ifdef CONFIG_PCI +# ifdef CONFIG_ACPI +# define x86_default_pci_init pci_acpi_init +# else +# define x86_default_pci_init pci_legacy_init +# endif +# define x86_default_pci_init_irq pcibios_irq_init +# define x86_default_pci_fixup_irqs pcibios_fixup_irqs +#else +# define x86_default_pci_init NULL +# define x86_default_pci_init_irq NULL +# define x86_default_pci_fixup_irqs NULL +#endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 18e496c..86b1506 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -37,10 +37,8 @@ void setup_bios_corruption_check(void); #ifdef CONFIG_X86_VISWS extern void visws_early_detect(void); -extern int is_visws_box(void); #else static inline void visws_early_detect(void) { } -static inline int is_visws_box(void) { return 0; } #endif extern unsigned long saved_video_mode; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1fecb7e..38638cd 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -313,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXIT_ERR -1 -#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ +#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h index 166adf6..2edb376 100644 --- a/arch/x86/include/asm/visws/cobalt.h +++ b/arch/x86/include/asm/visws/cobalt.h @@ -122,4 +122,6 @@ extern char visws_board_type; extern char visws_board_rev; +extern int pci_visws_init(void); + #endif /* _ASM_X86_VISWS_COBALT_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b49454..fb9a080 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -53,6 +53,7 @@ */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_RDTSCP 0x00000008 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 @@ -251,6 +252,7 @@ enum vmcs_field { #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 @@ -362,6 +364,7 @@ enum vmcs_field { #define VMX_EPTP_UC_BIT (1ull << 8) #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) +#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) @@ -374,7 +377,7 @@ enum vmcs_field { #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull -#define VMX_EPT_IGMT_BIT (1ull << 6) +#define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 60cc352..519b543 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -99,6 +99,20 @@ struct x86_init_iommu { }; /** + * struct x86_init_pci - platform specific pci init functions + * @arch_init: platform specific pci arch init call + * @init: platform specific pci subsystem init + * @init_irq: platform specific pci irq init + * @fixup_irqs: platform specific pci irq fixup + */ +struct x86_init_pci { + int (*arch_init)(void); + int (*init)(void); + void (*init_irq)(void); + void (*fixup_irqs)(void); +}; + +/** * struct x86_init_ops - functions for platform specific setup * */ @@ -110,6 +124,7 @@ struct x86_init_ops { struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; + struct x86_init_pci pci; }; /** diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d87f09b..4c58352 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o +obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 738fcb6..a54d714 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -35,6 +35,7 @@ #include <linux/ioport.h> #include <linux/pci.h> +#include <asm/pci_x86.h> #include <asm/pgtable.h> #include <asm/io_apic.h> #include <asm/apic.h> @@ -1624,6 +1625,9 @@ int __init acpi_boot_init(void) acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); + if (!acpi_noirq) + x86_init.pci.init = pci_acpi_init; + return 0; } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e6ea034..3a4bf35 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -7,6 +7,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> +#include <linux/stop_machine.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -572,3 +573,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) local_irq_restore(flags); return addr; } + +/* + * Cross-modifying kernel text with stop_machine(). + * This code originally comes from immediate value. + */ +static atomic_t stop_machine_first; +static int wrote_text; + +struct text_poke_params { + void *addr; + const void *opcode; + size_t len; +}; + +static int __kprobes stop_machine_text_poke(void *data) +{ + struct text_poke_params *tpp = data; + + if (atomic_dec_and_test(&stop_machine_first)) { + text_poke(tpp->addr, tpp->opcode, tpp->len); + smp_wmb(); /* Make sure other cpus see that this has run */ + wrote_text = 1; + } else { + while (!wrote_text) + cpu_relax(); + smp_mb(); /* Load wrote_text before following execution */ + } + + flush_icache_range((unsigned long)tpp->addr, + (unsigned long)tpp->addr + tpp->len); + return 0; +} + +/** + * text_poke_smp - Update instructions on a live kernel on SMP + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Modify multi-byte instruction by using stop_machine() on SMP. This allows + * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying + * should be allowed, since stop_machine() does _not_ protect code against + * NMI and MCE. + * + * Note: Must be called under get_online_cpus() and text_mutex. + */ +void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) +{ + struct text_poke_params tpp; + + tpp.addr = addr; + tpp.opcode = opcode; + tpp.len = len; + atomic_set(&stop_machine_first, 1); + wrote_text = 0; + stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + return addr; +} + diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c new file mode 100644 index 0000000..4b70995 --- /dev/null +++ b/arch/x86/kernel/apb_timer.c @@ -0,0 +1,784 @@ +/* + * apb_timer.c: Driver for Langwell APB timers + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * Langwell is the south complex of Intel Moorestown MID platform. There are + * eight external timers in total that can be used by the operating system. + * The timer information, such as frequency and addresses, is provided to the + * OS via SFI tables. + * Timer interrupts are routed via FW/HW emulated IOAPIC independently via + * individual redirection table entries (RTE). + * Unlike HPET, there is no master counter, therefore one of the timers are + * used as clocksource. The overall allocation looks like: + * - timer 0 - NR_CPUs for per cpu timer + * - one timer for clocksource + * - one timer for watchdog driver. + * It is also worth notice that APB timer does not support true one-shot mode, + * free-running mode will be used here to emulate one-shot mode. + * APB timer can also be used as broadcast timer along with per cpu local APIC + * timer, but by default APB timer has higher rating than local APIC timers. + */ + +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/sysdev.h> +#include <linux/pm.h> +#include <linux/pci.h> +#include <linux/sfi.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/irq.h> + +#include <asm/fixmap.h> +#include <asm/apb_timer.h> + +#define APBT_MASK CLOCKSOURCE_MASK(32) +#define APBT_SHIFT 22 +#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKSOURCE_RATING 250 +#define APBT_MIN_DELTA_USEC 200 + +#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt) +#define APBT_CLOCKEVENT0_NUM (0) +#define APBT_CLOCKEVENT1_NUM (1) +#define APBT_CLOCKSOURCE_NUM (2) + +static unsigned long apbt_address; +static int apb_timer_block_enabled; +static void __iomem *apbt_virt_address; +static int phy_cs_timer_id; + +/* + * Common DW APB timer info + */ +static uint64_t apbt_freq; + +static void apbt_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt); +static int apbt_next_event(unsigned long delta, + struct clock_event_device *evt); +static cycle_t apbt_read_clocksource(struct clocksource *cs); +static void apbt_restart_clocksource(struct clocksource *cs); + +struct apbt_dev { + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int tick; + unsigned int count; + unsigned int flags; + char name[10]; +}; + +int disable_apbt_percpu __cpuinitdata; + +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); + +#ifdef CONFIG_SMP +static unsigned int apbt_num_timers_used; +static struct apbt_dev *apbt_devs; +#endif + +static inline unsigned long apbt_readl_reg(unsigned long a) +{ + return readl(apbt_virt_address + a); +} + +static inline void apbt_writel_reg(unsigned long d, unsigned long a) +{ + writel(d, apbt_virt_address + a); +} + +static inline unsigned long apbt_readl(int n, unsigned long a) +{ + return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_writel(int n, unsigned long d, unsigned long a) +{ + writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_set_mapping(void) +{ + struct sfi_timer_table_entry *mtmr; + + if (apbt_virt_address) { + pr_debug("APBT base already mapped\n"); + return; + } + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return; + } + apbt_address = (unsigned long)mtmr->phys_addr; + if (!apbt_address) { + printk(KERN_WARNING "No timer base from SFI, use default\n"); + apbt_address = APBT_DEFAULT_BASE; + } + apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); + if (apbt_virt_address) { + pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ + (void *)apbt_address, (void *)apbt_virt_address); + } else { + pr_debug("Failed mapping APBT phy address at %p\n",\ + (void *)apbt_address); + goto panic_noapbt; + } + apbt_freq = mtmr->freq_hz / USEC_PER_SEC; + sfi_free_mtmr(mtmr); + + /* Now figure out the physical timer id for clocksource device */ + mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); + if (mtmr == NULL) + goto panic_noapbt; + + /* Now figure out the physical timer id */ + phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) + / APBTMRS_REG_SIZE; + pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); + return; + +panic_noapbt: + panic("Failed to setup APB system timer\n"); + +} + +static inline void apbt_clear_mapping(void) +{ + iounmap(apbt_virt_address); + apbt_virt_address = NULL; +} + +/* + * APBT timer interrupt enable / disable + */ +static inline int is_apbt_capable(void) +{ + return apbt_virt_address ? 1 : 0; +} + +static struct clocksource clocksource_apbt = { + .name = "apbt", + .rating = APBT_CLOCKSOURCE_RATING, + .read = apbt_read_clocksource, + .mask = APBT_MASK, + .shift = APBT_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = apbt_restart_clocksource, +}; + +/* boot APB clock event device */ +static struct clock_event_device apbt_clockevent = { + .name = "apbt0", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = apbt_set_mode, + .set_next_event = apbt_next_event, + .shift = APBT_SHIFT, + .irq = 0, + .rating = APBT_CLOCKEVENT_RATING, +}; + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + disable_apbt_percpu = 0; + else if (strcmp("lapic_and_apbt", arg) == 0) + disable_apbt_percpu = 1; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); + +/* + * start count down from 0xffff_ffff. this is done by toggling the enable bit + * then load initial load count to ~0. + */ +static void apbt_start_counter(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); + /* enable, mask interrupt */ + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + /* read it once to get cached counter value initialized */ + apbt_read_clocksource(&clocksource_apbt); +} + +static irqreturn_t apbt_interrupt_handler(int irq, void *data) +{ + struct apbt_dev *dev = (struct apbt_dev *)data; + struct clock_event_device *aevt = &dev->evt; + + if (!aevt->event_handler) { + printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", + dev->num); + return IRQ_NONE; + } + aevt->event_handler(aevt); + return IRQ_HANDLED; +} + +static void apbt_restart_clocksource(struct clocksource *cs) +{ + apbt_start_counter(phy_cs_timer_id); +} + +/* Setup IRQ routing via IOAPIC */ +#ifdef CONFIG_SMP +static void apbt_setup_irq(struct apbt_dev *adev) +{ + struct irq_chip *chip; + struct irq_desc *desc; + + /* timer0 irq has been setup early */ + if (adev->irq == 0) + return; + desc = irq_to_desc(adev->irq); + chip = get_irq_chip(adev->irq); + disable_irq(adev->irq); + desc->status |= IRQ_MOVE_PCNTXT; + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); + /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ + set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); + enable_irq(adev->irq); + if (system_state == SYSTEM_BOOTING) + if (request_irq(adev->irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + adev->name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + adev->num); + } +} +#endif + +static void apbt_enable_int(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + /* clear pending intr */ + apbt_readl(n, APBTMR_N_EOI); + ctrl &= ~APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); +} + +static void apbt_disable_int(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl |= APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); +} + + +static int __init apbt_clockevent_register(void) +{ + struct sfi_timer_table_entry *mtmr; + struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); + + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return -ENODEV; + } + + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz + , NSEC_PER_SEC, APBT_SHIFT); + + /* Calculate the min / max delta */ + apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &apbt_clockevent); + apbt_clockevent.min_delta_ns = clockevent_delta2ns( + APBT_MIN_DELTA_USEC*apbt_freq, + &apbt_clockevent); + /* + * Start apbt with the boot cpu mask and make it + * global if not used for per cpu timer. + */ + apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); + adev->num = smp_processor_id(); + memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); + + if (disable_apbt_percpu) { + apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; + global_clock_event = &adev->evt; + printk(KERN_DEBUG "%s clockevent registered as global\n", + global_clock_event->name); + } + + if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + apbt_clockevent.name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + apbt_clockevent.irq); + } + + clockevents_register_device(&adev->evt); + /* Start APBT 0 interrupts */ + apbt_enable_int(APBT_CLOCKEVENT0_NUM); + + sfi_free_mtmr(mtmr); + return 0; +} + +#ifdef CONFIG_SMP +/* Should be called with per cpu */ +void apbt_setup_secondary_clock(void) +{ + struct apbt_dev *adev; + struct clock_event_device *aevt; + int cpu; + + /* Don't register boot CPU clockevent */ + cpu = smp_processor_id(); + if (cpu == boot_cpu_id) + return; + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); + adev = &per_cpu(cpu_apbt_dev, cpu); + aevt = &adev->evt; + + memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); + aevt->cpumask = cpumask_of(cpu); + aevt->name = adev->name; + aevt->mode = CLOCK_EVT_MODE_UNUSED; + + printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", + cpu, aevt->name, *(u32 *)aevt->cpumask); + + apbt_setup_irq(adev); + + clockevents_register_device(aevt); + + apbt_enable_int(cpu); + + return; +} + +/* + * this notify handler process CPU hotplug events. in case of S0i3, nonboot + * cpus are disabled/enabled frequently, for performance reasons, we keep the + * per cpu timer irq registered so that we do need to do free_irq/request_irq. + * + * TODO: it might be more reliable to directly disable percpu clockevent device + * without the notifier chain. currently, cpu 0 may get interrupts from other + * cpu timers during the offline process due to the ordering of notification. + * the extra interrupt is harmless. + */ +static int apbt_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + unsigned long cpu = (unsigned long)hcpu; + struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); + + switch (action & 0xf) { + case CPU_DEAD: + apbt_disable_int(cpu); + if (system_state == SYSTEM_RUNNING) + pr_debug("skipping APBT CPU %lu offline\n", cpu); + else if (adev) { + pr_debug("APBT clockevent for cpu %lu offline\n", cpu); + free_irq(adev->irq, adev); + } + break; + default: + pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); + } + return NOTIFY_OK; +} + +static __init int apbt_late_init(void) +{ + if (disable_apbt_percpu) + return 0; + /* This notifier should be called after workqueue is ready */ + hotcpu_notifier(apbt_cpuhp_notify, -20); + return 0; +} +fs_initcall(apbt_late_init); +#else + +void apbt_setup_secondary_clock(void) {} + +#endif /* CONFIG_SMP */ + +static void apbt_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long ctrl; + uint64_t delta; + int timer_num; + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + pr_debug("%s CPU %d timer %d mode=%d\n", + __func__, first_cpu(*evt->cpumask), timer_num, mode); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; + delta >>= apbt_clockevent.shift; + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl |= APBTMR_CONTROL_MODE_PERIODIC; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* + * DW APB p. 46, have to disable timer before load counter, + * may cause sync problem. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + udelay(1); + pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + /* APB timer does not have one-shot mode, use free running mode */ + case CLOCK_EVT_MODE_ONESHOT: + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + /* + * set free running mode, this mode will let timer reload max + * timeout which will give time (3min on 25MHz clock) to rearm + * the next event, therefore emulate the one-shot mode. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write again to set free running mode */ + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + + /* + * DW APB p. 46, load counter with all 1s before starting free + * running mode. + */ + apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); + ctrl &= ~APBTMR_CONTROL_INT; + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + apbt_disable_int(timer_num); + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_RESUME: + apbt_enable_int(timer_num); + break; + } +} + +static int apbt_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + unsigned long ctrl; + int timer_num; + + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + /* Disable timer */ + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write new count */ + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + return 0; +} + +/* + * APB timer clock is not in sync with pclk on Langwell, which translates to + * unreliable read value caused by sampling error. the error does not add up + * overtime and only happens when sampling a 0 as a 1 by mistake. so the time + * would go backwards. the following code is trying to prevent time traveling + * backwards. little bit paranoid. + */ +static cycle_t apbt_read_clocksource(struct clocksource *cs) +{ + unsigned long t0, t1, t2; + static unsigned long last_read; + +bad_count: + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if (unlikely(t1 < t2)) { + pr_debug("APBT: read current count error %lx:%lx:%lx\n", + t1, t2, t2 - t1); + goto bad_count; + } + /* + * check against cached last read, makes sure time does not go back. + * it could be a normal rollover but we will do tripple check anyway + */ + if (unlikely(t2 > last_read)) { + /* check if we have a normal rollover */ + unsigned long raw_intr_status = + apbt_readl_reg(APBTMRS_RAW_INT_STATUS); + /* + * cs timer interrupt is masked but raw intr bit is set if + * rollover occurs. then we read EOI reg to clear it. + */ + if (raw_intr_status & (1 << phy_cs_timer_id)) { + apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); + goto out; + } + pr_debug("APB CS going back %lx:%lx:%lx ", + t2, last_read, t2 - last_read); +bad_count_x3: + pr_debug(KERN_INFO "tripple check enforced\n"); + t0 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if ((t2 > t1) || (t1 > t0)) { + printk(KERN_ERR "Error: APB CS tripple check failed\n"); + goto bad_count_x3; + } + } +out: + last_read = t2; + return (cycle_t)~t2; +} + +static int apbt_clocksource_register(void) +{ + u64 start, now; + cycle_t t1; + + /* Start the counter, use timer 2 as source, timer 0/1 for event */ + apbt_start_counter(phy_cs_timer_id); + + /* Verify whether apbt counter works */ + t1 = apbt_read_clocksource(&clocksource_apbt); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + /* APBT is the only always on clocksource, it has to work! */ + if (t1 == apbt_read_clocksource(&clocksource_apbt)) + panic("APBT counter not counting. APBT disabled\n"); + + /* + * initialize and register APBT clocksource + * convert that to ns/clock cycle + * mult = (ns/c) * 2^APBT_SHIFT + */ + clocksource_apbt.mult = div_sc(MSEC_PER_SEC, + (unsigned long) apbt_freq, APBT_SHIFT); + clocksource_register(&clocksource_apbt); + + return 0; +} + +/* + * Early setup the APBT timer, only use timer 0 for booting then switch to + * per CPU timer if possible. + * returns 1 if per cpu apbt is setup + * returns 0 if no per cpu apbt is chosen + * panic if set up failed, this is the only platform timer on Moorestown. + */ +void __init apbt_time_init(void) +{ +#ifdef CONFIG_SMP + int i; + struct sfi_timer_table_entry *p_mtmr; + unsigned int percpu_timer; + struct apbt_dev *adev; +#endif + + if (apb_timer_block_enabled) + return; + apbt_set_mapping(); + if (apbt_virt_address) { + pr_debug("Found APBT version 0x%lx\n",\ + apbt_readl_reg(APBTMRS_COMP_VERSION)); + } else + goto out_noapbt; + /* + * Read the frequency and check for a sane value, for ESL model + * we extend the possible clock range to allow time scaling. + */ + + if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { + pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); + goto out_noapbt; + } + if (apbt_clocksource_register()) { + pr_debug("APBT has failed to register clocksource\n"); + goto out_noapbt; + } + if (!apbt_clockevent_register()) + apb_timer_block_enabled = 1; + else { + pr_debug("APBT has failed to register clockevent\n"); + goto out_noapbt; + } +#ifdef CONFIG_SMP + /* kernel cmdline disable apb timer, so we will use lapic timers */ + if (disable_apbt_percpu) { + printk(KERN_INFO "apbt: disabled per cpu timer\n"); + return; + } + pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); + if (num_possible_cpus() <= sfi_mtimer_num) { + percpu_timer = 1; + apbt_num_timers_used = num_possible_cpus(); + } else { + percpu_timer = 0; + apbt_num_timers_used = 1; + adev = &per_cpu(cpu_apbt_dev, 0); + adev->flags &= ~APBT_DEV_USED; + } + pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); + + /* here we set up per CPU timer data structure */ + apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, + GFP_KERNEL); + if (!apbt_devs) { + printk(KERN_ERR "Failed to allocate APB timer devices\n"); + return; + } + for (i = 0; i < apbt_num_timers_used; i++) { + adev = &per_cpu(cpu_apbt_dev, i); + adev->num = i; + adev->cpu = i; + p_mtmr = sfi_get_mtmr(i); + if (p_mtmr) { + adev->tick = p_mtmr->freq_hz; + adev->irq = p_mtmr->irq; + } else + printk(KERN_ERR "Failed to get timer for cpu %d\n", i); + adev->count = 0; + sprintf(adev->name, "apbt%d", i); + } +#endif + + return; + +out_noapbt: + apbt_clear_mapping(); + apb_timer_block_enabled = 0; + panic("failed to enable APB timer\n"); +} + +static inline void apbt_disable(int n) +{ + if (is_apbt_capable()) { + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + } +} + +/* called before apb_timer_enable, use early map */ +unsigned long apbt_quick_calibrate() +{ + int i, scale; + u64 old, new; + cycle_t t1, t2; + unsigned long khz = 0; + u32 loop, shift; + + apbt_set_mapping(); + apbt_start_counter(phy_cs_timer_id); + + /* check if the timer can count down, otherwise return */ + old = apbt_read_clocksource(&clocksource_apbt); + i = 10000; + while (--i) { + if (old != apbt_read_clocksource(&clocksource_apbt)) + break; + } + if (!i) + goto failed; + + /* count 16 ms */ + loop = (apbt_freq * 1000) << 4; + + /* restart the timer to ensure it won't get to 0 in the calibration */ + apbt_start_counter(phy_cs_timer_id); + + old = apbt_read_clocksource(&clocksource_apbt); + old += loop; + + t1 = __native_read_tsc(); + + do { + new = apbt_read_clocksource(&clocksource_apbt); + } while (new < old); + + t2 = __native_read_tsc(); + + shift = 5; + if (unlikely(loop >> shift == 0)) { + printk(KERN_INFO + "APBT TSC calibration failed, not enough resolution\n"); + return 0; + } + scale = (int)div_u64((t2 - t1), loop >> shift); + khz = (scale * apbt_freq * 1000) >> shift; + printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); + return khz; +failed: + return 0; +} diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6e29b2a..00187f1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1390,7 +1390,7 @@ void __init enable_IR_x2apic(void) } local_irq_save(flags); - mask_8259A(); + legacy_pic->mask_all(); mask_IO_APIC_setup(ioapic_entries); if (dmar_table_init_ret) @@ -1422,7 +1422,7 @@ void __init enable_IR_x2apic(void) nox2apic: if (!ret) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); + legacy_pic->restore_mask(); local_irq_restore(flags); out: @@ -2018,7 +2018,7 @@ static int lapic_resume(struct sys_device *dev) } mask_IO_APIC_setup(ioapic_entries); - mask_8259A(); + legacy_pic->mask_all(); } if (x2apic_mode) @@ -2062,7 +2062,7 @@ static int lapic_resume(struct sys_device *dev) if (intr_remapping_enabled) { reenable_intr_remapping(x2apic_mode); - unmask_8259A(); + legacy_pic->restore_mask(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 14862f1..e4e0ddc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -143,12 +143,6 @@ static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; static struct irq_cfg irq_cfgx[NR_IRQS]; #endif -void __init io_apic_disable_legacy(void) -{ - nr_legacy_irqs = 0; - nr_irqs_gsi = 0; -} - int __init arch_early_irq_init(void) { struct irq_cfg *cfg; @@ -157,6 +151,11 @@ int __init arch_early_irq_init(void) int node; int i; + if (!legacy_pic->nr_legacy_irqs) { + nr_irqs_gsi = 0; + io_apic_irqs = ~0UL; + } + cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); node= cpu_to_node(boot_cpu_id); @@ -170,7 +169,7 @@ int __init arch_early_irq_init(void) * For legacy IRQ's, start with assigning irq0 to irq15 to * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. */ - if (i < nr_legacy_irqs) { + if (i < legacy_pic->nr_legacy_irqs) { cfg[i].vector = IRQ0_VECTOR + i; cpumask_set_cpu(0, cfg[i].domain); } @@ -852,7 +851,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < nr_legacy_irqs) { + if (irq < legacy_pic->nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1439,7 +1438,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq * controllers like 8259. Now that IO-APIC can handle this irq, update * the cfg->domain. */ - if (irq < nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) + if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) apic->vector_allocation_domain(0, cfg->domain); if (assign_irq_vector(irq, cfg, apic->target_cpus())) @@ -1463,8 +1462,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq } ioapic_register_intr(irq, desc, trigger); - if (irq < nr_legacy_irqs) - disable_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->chip->mask(irq); ioapic_write_entry(apic_id, pin, entry); } @@ -1873,7 +1872,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1957,7 +1956,7 @@ void __init enable_IO_APIC(void) nr_ioapic_registers[apic] = reg_01.bits.entries+1; } - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; for(apic = 0; apic < nr_ioapics; apic++) { @@ -2014,7 +2013,7 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; /* @@ -2247,9 +2246,9 @@ static unsigned int startup_ioapic_irq(unsigned int irq) struct irq_cfg *cfg; raw_spin_lock_irqsave(&ioapic_lock, flags); - if (irq < nr_legacy_irqs) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) + if (irq < legacy_pic->nr_legacy_irqs) { + legacy_pic->chip->mask(irq); + if (legacy_pic->irq_pending(irq)) was_pending = 1; } cfg = irq_cfg(irq); @@ -2782,8 +2781,8 @@ static inline void init_IO_APIC_traps(void) * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < nr_legacy_irqs) - make_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; @@ -2940,7 +2939,7 @@ static inline void __init check_timer(void) /* * get/set the timer IRQ vector: */ - disable_8259A_irq(0); + legacy_pic->chip->mask(0); assign_irq_vector(0, cfg, apic->target_cpus()); /* @@ -2953,7 +2952,7 @@ static inline void __init check_timer(void) * automatically. */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); + legacy_pic->init(1); #ifdef CONFIG_X86_32 { unsigned int ver; @@ -3012,7 +3011,7 @@ static inline void __init check_timer(void) if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -3035,14 +3034,14 @@ static inline void __init check_timer(void) */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } goto out; } @@ -3050,7 +3049,7 @@ static inline void __init check_timer(void) * Cleanup, just in case ... */ local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } @@ -3069,22 +3068,22 @@ static inline void __init check_timer(void) lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as ExtINT IRQ...\n"); - init_8259A(0); - make_8259A_irq(0); + legacy_pic->init(0); + legacy_pic->make_irq(0); apic_write(APIC_LVT0, APIC_DM_EXTINT); unlock_ExtINT_logic(); @@ -3126,7 +3125,7 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; + io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* @@ -3137,7 +3136,7 @@ void __init setup_IO_APIC(void) sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - if (nr_legacy_irqs) + if (legacy_pic->nr_legacy_irqs) check_timer(); } @@ -3928,7 +3927,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= nr_legacy_irqs) { + if (irq >= legacy_pic->nr_legacy_irqs) { cfg = desc->chip_data; if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", @@ -4302,3 +4301,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } + +/* Enable IOAPIC early just for system timer */ +void __init pre_init_apic_IRQ0(void) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + + printk(KERN_INFO "Early APIC setup for system timer0\n"); +#ifndef CONFIG_SMP + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); +#endif + desc = irq_to_desc_alloc_node(0, 0); + + setup_local_APIC(); + + cfg = irq_cfg(0); + add_pin_to_irq_node(cfg, 0, 0, 0); + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + + setup_IO_APIC_irq(0, 0, 0, desc, 0, 0); +} diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index bd7c96b..8aa65ad 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -177,7 +177,7 @@ int __init check_nmi_watchdog(void) error: if (nmi_watchdog == NMI_IO_APIC) { if (!timer_through_8259) - disable_8259A_irq(0); + legacy_pic->chip->mask(0); on_each_cpu(__acpi_nmi_disable, NULL, 1); } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 47dd856..3e28401 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -277,6 +277,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; x86_init.timers.tsc_pre_init = numaq_tsc_init; + x86_init.pci.init = pci_numaq_init; } } diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index f138c6c..870e6cc 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -10,6 +10,20 @@ if CPU_FREQ comment "CPUFreq processor drivers" +config X86_PCC_CPUFREQ + tristate "Processor Clocking Control interface driver" + depends on ACPI && ACPI_PROCESSOR + help + This driver adds support for the PCC interface. + + For details, take a look at: + <file:Documentation/cpu-freq/pcc-cpufreq.txt>. + + To compile this driver as a module, choose M here: the + module will be called pcc-cpufreq. + + If in doubt, say N. + config X86_ACPI_CPUFREQ tristate "ACPI Processor P-States driver" select CPU_FREQ_TABLE diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile index 509296d..1840c0a 100644 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ b/arch/x86/kernel/cpu/cpufreq/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o obj-$(CONFIG_X86_LONGHAUL) += longhaul.o diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c new file mode 100644 index 0000000..ff36d29 --- /dev/null +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -0,0 +1,620 @@ +/* + * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface + * + * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com> + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON + * INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/sched.h> +#include <linux/cpufreq.h> +#include <linux/compiler.h> + +#include <linux/acpi.h> +#include <linux/io.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> + +#include <acpi/processor.h> + +#define PCC_VERSION "1.00.00" +#define POLL_LOOPS 300 + +#define CMD_COMPLETE 0x1 +#define CMD_GET_FREQ 0x0 +#define CMD_SET_FREQ 0x1 + +#define BUF_SZ 4 + +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ + "pcc-cpufreq", msg) + +struct pcc_register_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_size; + u64 address; +} __attribute__ ((packed)); + +struct pcc_memory_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 resource_usage; + u8 type_specific; + u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; +} __attribute__ ((packed)); + +static struct cpufreq_driver pcc_cpufreq_driver; + +struct pcc_header { + u32 signature; + u16 length; + u8 major; + u8 minor; + u32 features; + u16 command; + u16 status; + u32 latency; + u32 minimum_time; + u32 maximum_time; + u32 nominal; + u32 throttled_frequency; + u32 minimum_frequency; +}; + +static void __iomem *pcch_virt_addr; +static struct pcc_header __iomem *pcch_hdr; + +static DEFINE_SPINLOCK(pcc_lock); + +static struct acpi_generic_address doorbell; + +static u64 doorbell_preserve; +static u64 doorbell_write; + +static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, + 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; + +struct pcc_cpu { + u32 input_offset; + u32 output_offset; +}; + +static struct pcc_cpu *pcc_cpu_info; + +static int pcc_cpufreq_verify(struct cpufreq_policy *policy) +{ + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + return 0; +} + +static inline void pcc_cmd(void) +{ + u64 doorbell_value; + int i; + + acpi_read(&doorbell_value, &doorbell); + acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, + &doorbell); + + for (i = 0; i < POLL_LOOPS; i++) { + if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) + break; + } +} + +static inline void pcc_clear_mapping(void) +{ + if (pcch_virt_addr) + iounmap(pcch_virt_addr); + pcch_virt_addr = NULL; +} + +static unsigned int pcc_get_freq(unsigned int cpu) +{ + struct pcc_cpu *pcc_cpu_data; + unsigned int curr_freq; + unsigned int freq_limit; + u16 status; + u32 input_buffer; + u32 output_buffer; + + spin_lock(&pcc_lock); + + dprintk("get: get_freq for CPU %d\n", cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + input_buffer = 0x1; + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_GET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + output_buffer = + ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("get: FAILED: for CPU %d, status is %d\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) + / 100) * 1000); + + dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " + "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n", + cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), + output_buffer, curr_freq); + + freq_limit = (output_buffer >> 8) & 0xff; + if (freq_limit != 0xff) { + dprintk("get: frequency for cpu %d is being temporarily" + " capped at %d\n", cpu, curr_freq); + } + + spin_unlock(&pcc_lock); + return curr_freq; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct pcc_cpu *pcc_cpu_data; + struct cpufreq_freqs freqs; + u16 status; + u32 input_buffer; + int cpu; + + spin_lock(&pcc_lock); + cpu = policy->cpu; + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + dprintk("target: CPU %d should go to target freq: %d " + "(virtual) input_offset is 0x%x\n", + cpu, target_freq, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + + freqs.new = target_freq; + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + input_buffer = 0x1 | (((target_freq * 100) + / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_SET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("target: FAILED for cpu %d, with status: 0x%x\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); + spin_unlock(&pcc_lock); + + return 0; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_get_offset(int cpu) +{ + acpi_status status; + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *pccp, *offset; + struct pcc_cpu *pcc_cpu_data; + struct acpi_processor *pr; + int ret = 0; + + pr = per_cpu(processors, cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + pccp = buffer.pointer; + if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + }; + + offset = &(pccp->package.elements[0]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->input_offset = offset->integer.value; + + offset = &(pccp->package.elements[1]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->output_offset = offset->integer.value; + + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); + + dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " + "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", + cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); +out_free: + kfree(buffer.pointer); + return ret; +} + +static int __init pcc_cpufreq_do_osc(acpi_handle *handle) +{ + acpi_status status; + struct acpi_object_list input; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object in_params[4]; + union acpi_object *out_obj; + u32 capabilities[2]; + u32 errors; + u32 supported; + int ret = 0; + + input.count = 4; + input.pointer = in_params; + input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 2; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 8; + in_params[3].buffer.pointer = (u8 *)&capabilities; + + capabilities[0] = OSC_QUERY_ENABLE; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + + kfree(output.pointer); + capabilities[0] = 0x0; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + +out_free: + kfree(output.pointer); + return ret; +} + +static int __init pcc_cpufreq_probe(void) +{ + acpi_status status; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + struct pcc_memory_resource *mem_resource; + struct pcc_register_resource *reg_resource; + union acpi_object *out_obj, *member; + acpi_handle handle, osc_handle; + int ret = 0; + + status = acpi_get_handle(NULL, "\\_SB", &handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "_OSC", &osc_handle); + if (ACPI_SUCCESS(status)) { + ret = pcc_cpufreq_do_osc(&osc_handle); + if (ret) + dprintk("probe: _OSC evaluation did not succeed\n"); + /* Firmware's use of _OSC is optional */ + ret = 0; + } + + status = acpi_evaluate_object(handle, "PCCH", NULL, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + } + + member = &out_obj->package.elements[0]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; + + dprintk("probe: mem_resource descriptor: 0x%x," + " length: %d, space_id: %d, resource_usage: %d," + " type_specific: %d, granularity: 0x%llx," + " minimum: 0x%llx, maximum: 0x%llx," + " translation_offset: 0x%llx, address_length: 0x%llx\n", + mem_resource->descriptor, mem_resource->length, + mem_resource->space_id, mem_resource->resource_usage, + mem_resource->type_specific, mem_resource->granularity, + mem_resource->minimum, mem_resource->maximum, + mem_resource->translation_offset, + mem_resource->address_length); + + if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { + ret = -ENODEV; + goto out_free; + } + + pcch_virt_addr = ioremap_nocache(mem_resource->minimum, + mem_resource->address_length); + if (pcch_virt_addr == NULL) { + dprintk("probe: could not map shared mem region\n"); + goto out_free; + } + pcch_hdr = pcch_virt_addr; + + dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); + dprintk("probe: PCCH header is at physical address: 0x%llx," + " signature: 0x%x, length: %d bytes, major: %d, minor: %d," + " supported features: 0x%x, command field: 0x%x," + " status field: 0x%x, nominal latency: %d us\n", + mem_resource->minimum, ioread32(&pcch_hdr->signature), + ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), + ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), + ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), + ioread32(&pcch_hdr->latency)); + + dprintk("probe: min time between commands: %d us," + " max time between commands: %d us," + " nominal CPU frequency: %d MHz," + " minimum CPU frequency: %d MHz," + " minimum CPU frequency without throttling: %d MHz\n", + ioread32(&pcch_hdr->minimum_time), + ioread32(&pcch_hdr->maximum_time), + ioread32(&pcch_hdr->nominal), + ioread32(&pcch_hdr->throttled_frequency), + ioread32(&pcch_hdr->minimum_frequency)); + + member = &out_obj->package.elements[1]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto pcch_free; + } + + reg_resource = (struct pcc_register_resource *)member->buffer.pointer; + + doorbell.space_id = reg_resource->space_id; + doorbell.bit_width = reg_resource->bit_width; + doorbell.bit_offset = reg_resource->bit_offset; + doorbell.access_width = 64; + doorbell.address = reg_resource->address; + + dprintk("probe: doorbell: space_id is %d, bit_width is %d, " + "bit_offset is %d, access_width is %d, address is 0x%llx\n", + doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, + doorbell.access_width, reg_resource->address); + + member = &out_obj->package.elements[2]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_preserve = member->integer.value; + + member = &out_obj->package.elements[3]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_write = member->integer.value; + + dprintk("probe: doorbell_preserve: 0x%llx," + " doorbell_write: 0x%llx\n", + doorbell_preserve, doorbell_write); + + pcc_cpu_info = alloc_percpu(struct pcc_cpu); + if (!pcc_cpu_info) { + ret = -ENOMEM; + goto pcch_free; + } + + printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" + " limits: %d MHz, %d MHz\n", PCC_VERSION, + ioread32(&pcch_hdr->minimum_frequency), + ioread32(&pcch_hdr->nominal)); + kfree(output.pointer); + return ret; +pcch_free: + pcc_clear_mapping(); +out_free: + kfree(output.pointer); + return ret; +} + +static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + unsigned int result = 0; + + if (!pcch_virt_addr) { + result = -1; + goto pcch_null; + } + + result = pcc_get_offset(cpu); + if (result) { + dprintk("init: PCCP evaluation failed\n"); + goto free; + } + + policy->max = policy->cpuinfo.max_freq = + ioread32(&pcch_hdr->nominal) * 1000; + policy->min = policy->cpuinfo.min_freq = + ioread32(&pcch_hdr->minimum_frequency) * 1000; + policy->cur = pcc_get_freq(cpu); + + dprintk("init: policy->max is %d, policy->min is %d\n", + policy->max, policy->min); + + return 0; +free: + pcc_clear_mapping(); + free_percpu(pcc_cpu_info); +pcch_null: + return result; +} + +static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver pcc_cpufreq_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .get = pcc_get_freq, + .verify = pcc_cpufreq_verify, + .target = pcc_cpufreq_target, + .init = pcc_cpufreq_cpu_init, + .exit = pcc_cpufreq_cpu_exit, + .name = "pcc-cpufreq", + .owner = THIS_MODULE, +}; + +static int __init pcc_cpufreq_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + ret = pcc_cpufreq_probe(); + if (ret) { + dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); + return ret; + } + + ret = cpufreq_register_driver(&pcc_cpufreq_driver); + + return ret; +} + +static void __exit pcc_cpufreq_exit(void) +{ + cpufreq_unregister_driver(&pcc_cpufreq_driver); + + pcc_clear_mapping(); + + free_percpu(pcc_cpu_info); +} + +MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); +MODULE_VERSION(PCC_VERSION); +MODULE_DESCRIPTION("Processor Clocking Control interface driver"); +MODULE_LICENSE("GPL"); + +late_initcall(pcc_cpufreq_init); +module_exit(pcc_cpufreq_exit); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index fe4622e..79556bd 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -145,6 +145,7 @@ struct set_mtrr_data { /** * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * @info: pointer to mtrr configuration data * * Returns nothing. */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 641ccb9..b1fbdee 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -676,7 +676,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (c->weight != w) continue; - for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { if (!test_bit(j, used_mask)) break; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index cf6590c..977e754 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -757,7 +757,7 @@ again: inc_irq_stat(apic_perf_irqs); ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; clear_bit(bit, (unsigned long *) &status); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 8c93a84..fb725ee 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -34,6 +34,12 @@ static int i8259A_auto_eoi; DEFINE_RAW_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); +static void mask_8259A(void); +static void unmask_8259A(void); +static void disable_8259A_irq(unsigned int irq); +static void enable_8259A_irq(unsigned int irq); +static void init_8259A(int auto_eoi); +static int i8259A_irq_pending(unsigned int irq); struct irq_chip i8259A_chip = { .name = "XT-PIC", @@ -63,7 +69,7 @@ unsigned int cached_irq_mask = 0xffff; */ unsigned long io_apic_irqs; -void disable_8259A_irq(unsigned int irq) +static void disable_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; unsigned long flags; @@ -77,7 +83,7 @@ void disable_8259A_irq(unsigned int irq) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -void enable_8259A_irq(unsigned int irq) +static void enable_8259A_irq(unsigned int irq) { unsigned int mask = ~(1 << irq); unsigned long flags; @@ -91,7 +97,7 @@ void enable_8259A_irq(unsigned int irq) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -int i8259A_irq_pending(unsigned int irq) +static int i8259A_irq_pending(unsigned int irq) { unsigned int mask = 1<<irq; unsigned long flags; @@ -107,7 +113,7 @@ int i8259A_irq_pending(unsigned int irq) return ret; } -void make_8259A_irq(unsigned int irq) +static void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); @@ -281,7 +287,7 @@ static int __init i8259A_init_sysfs(void) device_initcall(i8259A_init_sysfs); -void mask_8259A(void) +static void mask_8259A(void) { unsigned long flags; @@ -293,7 +299,7 @@ void mask_8259A(void) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -void unmask_8259A(void) +static void unmask_8259A(void) { unsigned long flags; @@ -305,7 +311,7 @@ void unmask_8259A(void) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -void init_8259A(int auto_eoi) +static void init_8259A(int auto_eoi) { unsigned long flags; @@ -358,3 +364,47 @@ void init_8259A(int auto_eoi) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } + +/* + * make i8259 a driver so that we can select pic functions at run time. the goal + * is to make x86 binary compatible among pc compatible and non-pc compatible + * platforms, such as x86 MID. + */ + +static void legacy_pic_noop(void) { }; +static void legacy_pic_uint_noop(unsigned int unused) { }; +static void legacy_pic_int_noop(int unused) { }; + +static struct irq_chip dummy_pic_chip = { + .name = "dummy pic", + .mask = legacy_pic_uint_noop, + .unmask = legacy_pic_uint_noop, + .disable = legacy_pic_uint_noop, + .mask_ack = legacy_pic_uint_noop, +}; +static int legacy_pic_irq_pending_noop(unsigned int irq) +{ + return 0; +} + +struct legacy_pic null_legacy_pic = { + .nr_legacy_irqs = 0, + .chip = &dummy_pic_chip, + .mask_all = legacy_pic_noop, + .restore_mask = legacy_pic_noop, + .init = legacy_pic_int_noop, + .irq_pending = legacy_pic_irq_pending_noop, + .make_irq = legacy_pic_uint_noop, +}; + +struct legacy_pic default_legacy_pic = { + .nr_legacy_irqs = NR_IRQS_LEGACY, + .chip = &i8259A_chip, + .mask_all = mask_8259A, + .restore_mask = unmask_8259A, + .init = init_8259A, + .irq_pending = i8259A_irq_pending, + .make_irq = make_8259A_irq, +}; + +struct legacy_pic *legacy_pic = &default_legacy_pic; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index fce55d5..ef257fc 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -99,9 +99,6 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -/* Number of legacy interrupts */ -int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; - void __init init_ISA_irqs(void) { int i; @@ -109,12 +106,12 @@ void __init init_ISA_irqs(void) #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) init_bsp_APIC(); #endif - init_8259A(0); + legacy_pic->init(0); /* * 16 old-style INTA-cycle interrupts: */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; @@ -138,7 +135,7 @@ void __init init_IRQ(void) * then this vector space can be freed and re-used dynamically as the * irq's migrate etc. */ - for (i = 0; i < nr_legacy_irqs; i++) + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; x86_init.irqs.intr_init(); diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 5de9f4a..b43bbae 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -49,6 +49,7 @@ #include <linux/module.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> +#include <linux/ftrace.h> #include <asm/cacheflush.h> #include <asm/desc.h> @@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { }; const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); -/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes set_jmp_op(void *from, void *to) +static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) { - struct __arch_jmp_op { - char op; + struct __arch_relative_insn { + u8 op; s32 raddr; - } __attribute__((packed)) * jop; - jop = (struct __arch_jmp_op *)from; - jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); - jop->op = RELATIVEJUMP_INSTRUCTION; + } __attribute__((packed)) *insn; + + insn = (struct __arch_relative_insn *)from; + insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); + insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes synthesize_reljump(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); } /* @@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) /* * Basically, kp->ainsn.insn has an original instruction. * However, RIP-relative instruction can not do single-stepping - * at different place, fix_riprel() tweaks the displacement of + * at different place, __copy_instruction() tweaks the displacement of * that instruction. In that case, we can't recover the instruction * from the kp->ainsn.insn. * @@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) } /* - * Adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. * If it does, Return the address of the 32-bit displacement word. * If not, return null. * Only applicable to 64-bit x86. */ -static void __kprobes fix_riprel(struct kprobe *p) +static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) { -#ifdef CONFIG_X86_64 struct insn insn; - kernel_insn_init(&insn, p->ainsn.insn); + int ret; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + kernel_insn_init(&insn, src); + if (recover) { + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, + (unsigned long)src); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + } + insn_get_length(&insn); + memcpy(dest, insn.kaddr, insn.length); + +#ifdef CONFIG_X86_64 if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; + kernel_insn_init(&insn, dest); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing @@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) p->addr + (s64) insn.displacement.value - - (u8 *) p->ainsn.insn; + newdisp = (u8 *) src + (s64) insn.displacement.value - + (u8 *) dest; BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ - disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + disp = (u8 *) dest + insn_offset_displacement(&insn); *(s32 *) disp = (s32) newdisp; } #endif + return insn.length; } static void __kprobes arch_copy_kprobe(struct kprobe *p) { - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - - fix_riprel(p); + /* + * Copy an instruction without recovering int3, because it will be + * put by another subsystem. + */ + __copy_instruction(p->ainsn.insn, p->addr, 0); if (can_boost(p->addr)) p->ainsn.boostable = 0; @@ -406,18 +432,6 @@ static void __kprobes restore_btf(void) update_debugctlmsr(current->thread.debugctlmsr); } -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - clear_btf(); - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; - /* single step inline if the instruction is an int3 */ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->ip = (unsigned long)p->addr; - else - regs->ip = (unsigned long)p->ainsn.insn; -} - void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -429,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, *sara = (unsigned long) &kretprobe_trampoline; } +#ifdef CONFIG_OPTPROBES +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter); +#else +#define setup_detour_execution(p, regs, reenter) (0) +#endif + static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) + struct kprobe_ctlblk *kcb, int reenter) { + if (setup_detour_execution(p, regs, reenter)) + return; + #if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ - reset_current_kprobe(); + if (!reenter) + reset_current_kprobe(); + /* + * Reentering boosted probe doesn't reset current_kprobe, + * nor set current_kprobe, because it doesn't use single + * stepping. + */ regs->ip = (unsigned long)p->ainsn.insn; preempt_enable_no_resched(); return; } #endif - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_REENTER; + } else + kcb->kprobe_status = KPROBE_HIT_SS; + /* Prepare real single stepping */ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; } /* @@ -456,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: case KPROBE_HIT_ACTIVE: - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; + setup_singlestep(p, regs, kcb, 1); break; case KPROBE_HIT_SS: /* A probe has been hit in the codepath leading up to, or just @@ -535,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) * more here. */ if (!p->pre_handler || !p->pre_handler(p, regs)) - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } else if (kprobe_running()) { p = __get_cpu_var(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } /* else: not a kprobe fault; let the kernel handle it */ @@ -550,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) return 0; } +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $16, %esp\n" \ + " pushl %fs\n" \ + " pushl %ds\n" \ + " pushl %es\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ + " addl $24, %esp\n" +#endif + /* * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. @@ -563,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void) /* We don't bother saving the ss register */ " pushq %rsp\n" " pushfq\n" - /* - * Skip cs, ip, orig_ax. - * trampoline_handler() will plug in these values - */ - " subq $24, %rsp\n" - " pushq %rdi\n" - " pushq %rsi\n" - " pushq %rdx\n" - " pushq %rcx\n" - " pushq %rax\n" - " pushq %r8\n" - " pushq %r9\n" - " pushq %r10\n" - " pushq %r11\n" - " pushq %rbx\n" - " pushq %rbp\n" - " pushq %r12\n" - " pushq %r13\n" - " pushq %r14\n" - " pushq %r15\n" + SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" /* Replace saved sp with true return address. */ " movq %rax, 152(%rsp)\n" - " popq %r15\n" - " popq %r14\n" - " popq %r13\n" - " popq %r12\n" - " popq %rbp\n" - " popq %rbx\n" - " popq %r11\n" - " popq %r10\n" - " popq %r9\n" - " popq %r8\n" - " popq %rax\n" - " popq %rcx\n" - " popq %rdx\n" - " popq %rsi\n" - " popq %rdi\n" - /* Skip orig_ax, ip, cs */ - " addq $24, %rsp\n" + RESTORE_REGS_STRING " popfq\n" #else " pushf\n" - /* - * Skip cs, ip, orig_ax and gs. - * trampoline_handler() will plug in these values - */ - " subl $16, %esp\n" - " pushl %fs\n" - " pushl %es\n" - " pushl %ds\n" - " pushl %eax\n" - " pushl %ebp\n" - " pushl %edi\n" - " pushl %esi\n" - " pushl %edx\n" - " pushl %ecx\n" - " pushl %ebx\n" + SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" /* Move flags to cs */ @@ -629,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " movl %edx, 52(%esp)\n" /* Replace saved flags with true return address. */ " movl %eax, 56(%esp)\n" - " popl %ebx\n" - " popl %ecx\n" - " popl %edx\n" - " popl %esi\n" - " popl %edi\n" - " popl %ebp\n" - " popl %eax\n" - /* Skip ds, es, fs, gs, orig_ax and ip */ - " addl $24, %esp\n" + RESTORE_REGS_STRING " popf\n" #endif " ret\n"); @@ -805,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p, * These instructions can be executed directly if it * jumps back to correct address. */ - set_jmp_op((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); + synthesize_reljump((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); p->ainsn.boostable = 1; } else { p->ainsn.boostable = -1; @@ -1033,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } + +#ifdef CONFIG_OPTPROBES + +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +static void __kprobes synthesize_relcall(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, + unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +void __kprobes kprobes_optinsn_template_holder(void) +{ + asm volatile ( + ".global optprobe_template_entry\n" + "optprobe_template_entry: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 144(%rsp), %rdx\n" + " movq %rdx, 152(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + RESTORE_REGS_STRING + " addl $4, %esp\n" /* skip cs */ + " popf\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end: \n"); +} + +#define TMPL_MOVE_IDX \ + ((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, + struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + preempt_disable(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + /* Save skipped registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->orig_ax = ~0UL; + + __get_cpu_var(current_kprobe) = &op->kp; + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __get_cpu_var(current_kprobe) = NULL; + } + preempt_enable_no_resched(); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ + int len = 0, ret; + + while (len < RELATIVEJUMP_SIZE) { + ret = __copy_instruction(dest + len, src + len, 1); + if (!ret || !can_boost(dest + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int __kprobes can_optimize(unsigned long paddr) +{ + int ret; + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + /* Dummy buffers for lookup_symbol_attrs */ + static char __dummy_buf[KSYM_NAME_LEN]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < RELATIVEJUMP_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_SIZE, + RELATIVE_ADDR_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disabled(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + if (op->optinsn.insn) { + free_optinsn_slot(op->optinsn.insn, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ + u8 *buf; + int ret; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + op->optinsn.insn = get_optinsn_slot(); + if (!op->optinsn.insn) + return -ENOMEM; + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. + */ + rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + if (abs(rel) > 0x7fffffff) + return -ERANGE; + + buf = (u8 *)op->optinsn.insn; + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); + if (ret < 0) { + __arch_remove_optimized_kprobe(op, 0); + return ret; + } + op->optinsn.size = ret; + + /* Copy arch-dep-instance from template */ + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + (u8 *)op->kp.addr + op->optinsn.size); + + flush_icache_range((unsigned long) buf, + (unsigned long) buf + TMPL_END_IDX + + op->optinsn.size + RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a breakpoint (int3) with a relative jump. */ +int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +{ + unsigned char jmp_code[RELATIVEJUMP_SIZE]; + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + jmp_code[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&jmp_code[1]) = rel; + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 buf[RELATIVEJUMP_SIZE]; + + /* Set int3 to first byte for kprobes */ + buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + return 0; +} +#endif + int __init arch_init_kprobes(void) { return 0; diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 3b7078a..0aad867 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -10,8 +10,211 @@ * of the License. */ #include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sfi.h> +#include <linux/irq.h> +#include <linux/module.h> #include <asm/setup.h> +#include <asm/mpspec_def.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/mrst.h> +#include <asm/io.h> +#include <asm/i8259.h> +#include <asm/apb_timer.h> + +static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; +static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +int sfi_mtimer_num; + +struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; +EXPORT_SYMBOL_GPL(sfi_mrtc_array); +int sfi_mrtc_num; + +static inline void assign_to_mp_irq(struct mpc_intsrc *m, + struct mpc_intsrc *mp_irq) +{ + memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) +{ + return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static void save_mp_irq(struct mpc_intsrc *m) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_cmp(&mp_irqs[i], m)) + return; + } + + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +/* parse all the mtimer info to a static mtimer array */ +static int __init sfi_parse_mtmr(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_timer_table_entry *pentry; + struct mpc_intsrc mp_irq; + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mtimer_num) { + sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_timer_table_entry); + pentry = (struct sfi_timer_table_entry *) sb->pentry; + totallen = sfi_mtimer_num * sizeof(*pentry); + memcpy(sfi_mtimer_array, pentry, totallen); + } + + printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); + pentry = sfi_mtimer_array; + for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { + printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," + " irq = %d\n", totallen, (u32)pentry->phys_addr, + pentry->freq_hz, pentry->irq); + if (!pentry->irq) + continue; + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; +/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ + mp_irq.irqflag = 5; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + + return 0; +} + +struct sfi_timer_table_entry *sfi_get_mtmr(int hint) +{ + int i; + if (hint < sfi_mtimer_num) { + if (!sfi_mtimer_usage[hint]) { + pr_debug("hint taken for timer %d irq %d\n",\ + hint, sfi_mtimer_array[hint].irq); + sfi_mtimer_usage[hint] = 1; + return &sfi_mtimer_array[hint]; + } + } + /* take the first timer available */ + for (i = 0; i < sfi_mtimer_num;) { + if (!sfi_mtimer_usage[i]) { + sfi_mtimer_usage[i] = 1; + return &sfi_mtimer_array[i]; + } + i++; + } + return NULL; +} + +void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) +{ + int i; + for (i = 0; i < sfi_mtimer_num;) { + if (mtmr->irq == sfi_mtimer_array[i].irq) { + sfi_mtimer_usage[i] = 0; + return; + } + i++; + } +} + +/* parse all the mrtc info to a global mrtc array */ +int __init sfi_parse_mrtc(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_rtc_table_entry *pentry; + struct mpc_intsrc mp_irq; + + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mrtc_num) { + sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_rtc_table_entry); + pentry = (struct sfi_rtc_table_entry *)sb->pentry; + totallen = sfi_mrtc_num * sizeof(*pentry); + memcpy(sfi_mrtc_array, pentry, totallen); + } + + printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); + pentry = sfi_mrtc_array; + for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { + printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", + totallen, (u32)pentry->phys_addr, pentry->irq); + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = 0; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + return 0; +} + +/* + * the secondary clock in Moorestown can be APBT or LAPIC clock, default to + * APBT but cmdline option can also override it. + */ +static void __cpuinit mrst_setup_secondary_clock(void) +{ + /* restore default lapic clock if disabled by cmdline */ + if (disable_apbt_percpu) + return setup_secondary_APIC_clock(); + apbt_setup_secondary_clock(); +} + +static unsigned long __init mrst_calibrate_tsc(void) +{ + unsigned long flags, fast_calibrate; + + local_irq_save(flags); + fast_calibrate = apbt_quick_calibrate(); + local_irq_restore(flags); + + if (fast_calibrate) + return fast_calibrate; + + return 0; +} + +void __init mrst_time_init(void) +{ + sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); + pre_init_apic_IRQ0(); + apbt_time_init(); +} + +void __init mrst_rtc_init(void) +{ + sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); +} + +/* + * if we use per cpu apb timer, the bootclock already setup. if we use lapic + * timer and one apbt timer for broadcast, we need to set up lapic boot clock. + */ +static void __init mrst_setup_boot_clock(void) +{ + pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); + if (disable_apbt_percpu) + setup_boot_APIC_clock(); +}; /* * Moorestown specific x86_init function overrides and early setup @@ -21,4 +224,17 @@ void __init x86_mrst_early_setup(void) { x86_init.resources.probe_roms = x86_init_noop; x86_init.resources.reserve_resources = x86_init_noop; + + x86_init.timers.timer_init = mrst_time_init; + x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; + + x86_init.irqs.pre_vector_init = x86_init_noop; + + x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; + + x86_platform.calibrate_tsc = mrst_calibrate_tsc; + x86_init.pci.init = pci_mrst_init; + x86_init.pci.fixup_irqs = x86_init_noop; + + legacy_pic = &null_legacy_pic; } diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 9d1d263..8297160 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -17,7 +17,9 @@ #include <linux/spinlock.h> #include <linux/io.h> #include <linux/string.h> + #include <asm/geode.h> +#include <asm/setup.h> #include <asm/olpc.h> #ifdef CONFIG_OPEN_FIRMWARE @@ -243,9 +245,11 @@ static int __init olpc_init(void) olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, (unsigned char *) &olpc_platform_info.ecver, 1); - /* check to see if the VSA exists */ - if (cs5535_has_vsa2()) - olpc_platform_info.flags |= OLPC_F_VSA; +#ifdef CONFIG_PCI_OLPC + /* If the VSA exists let it emulate PCI, if not emulate in kernel */ + if (!cs5535_has_vsa2()) + x86_init.pci.arch_init = pci_olpc_init; +#endif printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a435c76..a02e80c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -48,6 +48,7 @@ #include <linux/err.h> #include <linux/nmi.h> #include <linux/tboot.h> +#include <linux/stackprotector.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -67,6 +68,7 @@ #include <linux/mc146818rtc.h> #include <asm/smpboot_hooks.h> +#include <asm/i8259.h> #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; @@ -291,9 +293,9 @@ notrace static void __cpuinit start_secondary(void *unused) check_tsc_sync_target(); if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); enable_NMI_through_LVT0(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } #ifdef CONFIG_X86_32 @@ -329,6 +331,9 @@ notrace static void __cpuinit start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); + /* to prevent fake stack check failure in clock setup */ + boot_init_stack_canary(); + x86_cpuinit.setup_percpu_clockev(); wmb(); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index ab38ce0..e680ea5 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -49,11 +49,6 @@ extern int no_broadcast; char visws_board_type = -1; char visws_board_rev = -1; -int is_visws_box(void) -{ - return visws_board_type >= 0; -} - static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -242,6 +237,8 @@ void __init visws_early_detect(void) x86_init.irqs.pre_vector_init = visws_pre_intr_init; x86_init.irqs.trap_init = visws_trap_init; x86_init.timers.timer_init = visws_time_init; + x86_init.pci.init = pci_visws_init; + x86_init.pci.init_irq = x86_init_noop; /* * Install reboot quirks: @@ -508,7 +505,7 @@ static struct irq_chip cobalt_irq_type = { */ static unsigned int startup_piix4_master_irq(unsigned int irq) { - init_8259A(0); + legacy_pic->init(0); return startup_cobalt_irq(irq); } @@ -532,9 +529,6 @@ static struct irq_chip piix4_master_irq_type = { static struct irq_chip piix4_virtual_irq_type = { .name = "PIIX4-virtual", - .shutdown = disable_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, }; @@ -609,7 +603,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) handle_IRQ_event(realirq, desc->action); if (!(desc->status & IRQ_DISABLED)) - enable_8259A_irq(realirq); + legacy_pic->chip->unmask(realirq); return IRQ_HANDLED; @@ -628,6 +622,12 @@ static struct irqaction cascade_action = { .name = "cascade", }; +static inline void set_piix4_virtual_irq_type(void) +{ + piix4_virtual_irq_type.shutdown = i8259A_chip.mask; + piix4_virtual_irq_type.enable = i8259A_chip.unmask; + piix4_virtual_irq_type.disable = i8259A_chip.mask; +} void init_VISWS_APIC_irqs(void) { @@ -653,6 +653,7 @@ void init_VISWS_APIC_irqs(void) desc->chip = &piix4_master_irq_type; } else if (i < CO_IRQ_APIC0) { + set_piix4_virtual_irq_type(); desc->chip = &piix4_virtual_irq_type; } else if (IS_CO_APIC(i)) { diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9055e58..1c0c6ab 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -301,7 +301,8 @@ static int __init vsyscall_init(void) register_sysctl_table(kernel_root_table2); #endif on_each_cpu(cpu_vsyscall_init, NULL, 1); - hotcpu_notifier(cpu_vsyscall_notifier, 0); + /* notifier priority > KVM */ + hotcpu_notifier(cpu_vsyscall_notifier, 30); return 0; } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ee5746c..61a1e8c 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -4,9 +4,11 @@ * For licencing details see kernel-base/COPYING */ #include <linux/init.h> +#include <linux/ioport.h> #include <asm/bios_ebda.h> #include <asm/paravirt.h> +#include <asm/pci_x86.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> @@ -70,6 +72,12 @@ struct x86_init_ops x86_init __initdata = { .iommu = { .iommu_init = iommu_init_noop, }, + + .pci = { + .init = x86_default_pci_init, + .init_irq = x86_default_pci_init_irq, + .fixup_irqs = x86_default_pci_fixup_irqs, + }, }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3c4d0109..970bbd4 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE select USER_RETURN_NOTIFIER + select KVM_MMIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7e8faea..4dade6a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -32,7 +32,7 @@ #include <linux/module.h> #include <asm/kvm_emulate.h> -#include "mmu.h" /* for is_long_mode() */ +#include "x86.h" /* * Opcode effective-address decode tables. @@ -76,6 +76,8 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ +#define Lock (1<<26) /* lock prefix is allowed for the instruction */ +#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ #define Src2None (0<<29) @@ -88,39 +90,40 @@ enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group8, Group9, }; static u32 opcode_table[256] = { /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, 0, /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x38 - 0x3F */ @@ -156,7 +159,7 @@ static u32 opcode_table[256] = { Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -210,7 +213,7 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, + ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -218,16 +221,20 @@ static u32 opcode_table[256] = { static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + 0, Group | GroupDual | Group7, 0, 0, + 0, ImplicitOps, ImplicitOps | Priv, 0, + ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, + 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + ModRM | ImplicitOps | Priv, ModRM | Priv, + ModRM | ImplicitOps | Priv, ModRM | Priv, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, + ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, + ImplicitOps, ImplicitOps | Priv, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -257,21 +264,23 @@ static u32 twobyte_table[256] = { DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, DstMem | SrcReg | Src2ImmByte | ModRM, DstMem | SrcReg | Src2CL | ModRM, ModRM, 0, /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, + Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, + 0, 0, 0, Group | GroupDual | Group9, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -283,25 +292,41 @@ static u32 twobyte_table[256] = { static u32 group_table[] = { [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM, [Group1_81*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM, [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64, [Group1_83*8] = - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = @@ -320,24 +345,39 @@ static u32 group_table[] = { SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = - 0, 0, ModRM | SrcMem, ModRM | SrcMem, + 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, + SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, + [Group8*8] = + 0, 0, 0, 0, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, + [Group9*8] = + 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, + [Group9*8] = + 0, 0, 0, 0, 0, 0, 0, 0, }; /* EFLAGS bit definitions. */ +#define EFLG_ID (1<<21) +#define EFLG_VIP (1<<20) +#define EFLG_VIF (1<<19) +#define EFLG_AC (1<<18) #define EFLG_VM (1<<17) #define EFLG_RF (1<<16) +#define EFLG_IOPL (3<<12) +#define EFLG_NT (1<<14) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) #define EFLG_IF (1<<9) +#define EFLG_TF (1<<8) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) @@ -606,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, if (linear < fc->start || linear >= fc->end) { size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); + rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); if (rc) return rc; fc->start = linear; @@ -661,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, op_bytes = 3; *address = 0; rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); + ctxt->vcpu, NULL); if (rc) return rc; rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); + ctxt->vcpu, NULL); return rc; } @@ -889,6 +929,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) switch (mode) { case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; @@ -975,7 +1016,7 @@ done_prefixes: } if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; + kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction"); return -1; } @@ -1196,13 +1237,56 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), dest, len, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); return rc; } +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + int rc; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); + + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; + } + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; +} + static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) { struct decode_cache *c = &ctxt->decode; @@ -1225,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, if (rc != 0) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); return rc; } @@ -1370,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, int rc; rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || @@ -1385,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, (u32) c->regs[VCPU_REGS_RBX]; rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags |= EFLG_ZF; } @@ -1407,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); if (rc) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); return rc; } @@ -1451,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &c->dst.val, c->dst.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; break; case OP_NONE: @@ -1514,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) u64 msr_data; /* syscall is not available in real mode */ - if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) - return -1; + if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1553,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); } - return 0; + return X86EMUL_CONTINUE; } static int @@ -1563,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) struct kvm_segment cs, ss; u64 msr_data; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || - !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + /* inject #GP if in real mode */ + if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ if (ctxt->mode == X86EMUL_MODE_PROT64) - return -1; + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1587,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; } @@ -1618,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; - return 0; + return X86EMUL_CONTINUE; } static int @@ -1629,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) u64 msr_data; int usermode; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - /* sysexit must be called from CPL 0 */ - if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + /* inject #GP if in real mode or Virtual 8086 mode */ + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1661,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = (u16)(msr_data + 24); break; @@ -1669,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 32); if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = cs.selector + 8; cs.db = 0; @@ -1685,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; - return 0; + return X86EMUL_CONTINUE; +} + +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) +{ + int iopl; + if (ctxt->mode == X86EMUL_MODE_REAL) + return false; + if (ctxt->mode == X86EMUL_MODE_VM86) + return true; + iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; +} + +static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + struct kvm_segment tr_seg; + int r; + u16 io_bitmap_ptr; + u8 perm, bit_idx = port & 0x7; + unsigned mask = (1 << len) - 1; + + kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); + if (tr_seg.unusable) + return false; + if (tr_seg.limit < 103) + return false; + r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, + NULL); + if (r != X86EMUL_CONTINUE) + return false; + if (io_bitmap_ptr + port/8 > tr_seg.limit) + return false; + r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, + ctxt->vcpu, NULL); + if (r != X86EMUL_CONTINUE) + return false; + if ((perm >> bit_idx) & mask) + return false; + return true; +} + +static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + if (emulator_bad_iopl(ctxt)) + if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) + return false; + return true; } int @@ -1709,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); saved_eip = c->eip; + /* LOCK prefix is allowed only with some instructions */ + if (c->lock_prefix && !(c->d & Lock)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + + /* Privileged instruction can be executed only in CPL=0 */ + if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) memop = c->modrm_ea; @@ -1749,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) &c->src.val, c->src.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; } @@ -1768,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) c->dst.ptr = (void *)c->dst.ptr + (c->src.val & mask) / 8; } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) - goto done; + if (!(c->d & Mov)) { + /* optimisation - avoid slow emulated read */ + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + } } c->dst.orig_val = c->dst.val; @@ -1876,7 +2010,12 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio_string(ctxt->vcpu, 1, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? @@ -1892,6 +2031,11 @@ special_insn: return 0; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } if (kvm_emulate_pio_string(ctxt->vcpu, 0, (c->d & ByteOp) ? 1 : c->op_bytes, @@ -1978,25 +2122,19 @@ special_insn: break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; - int type_bits; - int err; sel = c->src.val; - if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - if (c->modrm_reg <= 5) { - type_bits = (c->modrm_reg == 1) ? 9 : 1; - err = kvm_load_segment_descriptor(ctxt->vcpu, sel, - type_bits, c->modrm_reg); - } else { - printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_CS || + c->modrm_reg > VCPU_SREG_GS) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; } - if (err < 0) - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + + rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -2025,7 +2163,10 @@ special_insn: c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; c->dst.val = c->src.val; @@ -2039,11 +2180,12 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) + c->dst.bytes, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2058,10 +2200,11 @@ special_insn: c->src.ptr = (unsigned long *)register_address(c, seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; c->dst.type = OP_NONE; /* Disable writeback. */ @@ -2069,10 +2212,11 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); @@ -2102,12 +2246,13 @@ special_insn: c->dst.type = OP_REG; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2163,11 +2308,9 @@ special_insn: case 0xe9: /* jmp rel */ goto jmp; case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, - VCPU_SREG_CS) < 0) { - DPRINTF("jmp far: Failed to load CS descriptor\n"); - goto cannot_emulate; - } + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, + VCPU_SREG_CS)) + goto done; c->eip = c->src.val; break; @@ -2185,7 +2328,13 @@ special_insn: case 0xef: /* out (e/r)ax,dx */ port = c->regs[VCPU_REGS_RDX]; io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, + do_io: + if (!emulator_io_permited(ctxt, ops, port, + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, (c->d & ByteOp) ? 1 : c->op_bytes, port) != 0) { c->eip = saved_eip; @@ -2210,13 +2359,21 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfb: /* sti */ - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; @@ -2319,8 +2476,9 @@ twobyte_insn: } break; case 0x05: /* syscall */ - if (emulate_syscall(ctxt) == -1) - goto cannot_emulate; + rc = emulate_syscall(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; @@ -2391,14 +2549,16 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - if (emulate_sysenter(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysenter(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; case 0x35: /* sysexit */ - if (emulate_sysexit(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysexit(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 15578f1..294698b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -242,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_dec_return(&ps->pit_timer.pending) < 0) atomic_inc(&ps->pit_timer.pending); ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -605,7 +605,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = { .write = speaker_ioport_write, }; -/* Caller must have writers lock on slots_lock */ +/* Caller must hold slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; @@ -624,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); + raw_spin_lock_init(&pit->pit_state.inject_lock); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -645,13 +645,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); if (ret < 0) goto fail_unregister; @@ -660,11 +660,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) return pit; fail_unregister: - __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail: - if (pit->irq_source_id >= 0) - kvm_free_irq_source_id(kvm, pit->irq_source_id); + kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; @@ -723,12 +724,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) /* Try to inject pending interrupts when * last one has been acked. */ - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { ps->irq_ack = 0; inject = 1; } - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); if (inject) __inject_pit_timer_intr(kvm); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index d4c1c7f..900d6b0 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - spinlock_t inject_lock; + raw_spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d057c0c..07771da 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -44,18 +44,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * Other interrupt may be delivered to PIC while lock is dropped but * it should be safe since PIC state is already updated at this stage. */ - spin_unlock(&s->pics_state->lock); + raw_spin_unlock(&s->pics_state->lock); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); - spin_lock(&s->pics_state->lock); + raw_spin_lock(&s->pics_state->lock); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + + raw_spin_lock(&s->lock); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } /* @@ -156,9 +157,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - spin_lock(&s->lock); + raw_spin_lock(&s->lock); pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -166,14 +167,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - spin_lock(&s->lock); + raw_spin_lock(&s->lock); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return ret; } @@ -203,7 +204,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + raw_spin_lock(&s->lock); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -228,7 +229,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return intno; } @@ -442,7 +443,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -455,7 +456,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -472,7 +473,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -486,7 +487,7 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -520,7 +521,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - spin_lock_init(&s->lock); + raw_spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; @@ -533,7 +534,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + mutex_unlock(&kvm->slots_lock); if (ret < 0) { kfree(s); return NULL; @@ -541,3 +544,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) return s; } + +void kvm_destroy_pic(struct kvm *kvm) +{ + struct kvm_pic *vpic = kvm->arch.vpic; + + if (vpic) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm->arch.vpic = NULL; + kfree(vpic); + } +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index be399e2..34b15915 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -62,7 +62,7 @@ struct kvm_kpic_state { }; struct kvm_pic { - spinlock_t lock; + raw_spinlock_t lock; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ @@ -75,6 +75,7 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); void kvm_pic_clear_isr_ack(struct kvm *kvm); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 7bcc5b6..cff851c 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -1,6 +1,11 @@ #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H +#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR4_GUEST_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) + static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) { @@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; + if (tmask & vcpu->arch.cr0_guest_owned_bits) + kvm_x86_ops->decache_cr0_guest_bits(vcpu); + return vcpu->arch.cr0 & mask; +} + +static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, ~0UL); +} + +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; + if (tmask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba8c045..4b224f9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1246,3 +1246,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 0; } + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + /* if this is ICR write vector before command */ + if (reg == APIC_ICR) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (reg == APIC_ICR) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 40010b0..f5fe32c 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; +} #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 89a49fb..741373e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@ */ #include "mmu.h" +#include "x86.h" #include "kvm_cache_regs.h" #include <linux/kvm_host.h> @@ -29,6 +30,7 @@ #include <linux/swap.h> #include <linux/hugetlb.h> #include <linux/compiler.h> +#include <linux/srcu.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -136,16 +138,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) - -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - #define RMAP_EXT 4 #define ACC_EXEC_MASK 1 @@ -153,6 +145,9 @@ module_param(oos_shadow, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#include <trace/events/kvm.h> + +#undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS #include "mmutrace.h" @@ -229,7 +224,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static int is_write_protection(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr0 & X86_CR0_WP; + return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } static int is_cpuid_PSE36(void) @@ -239,7 +234,7 @@ static int is_cpuid_PSE36(void) static int is_nx(struct kvm_vcpu *vcpu) { - return vcpu->arch.shadow_efer & EFER_NX; + return vcpu->arch.efer & EFER_NX; } static int is_shadow_present_pte(u64 pte) @@ -253,7 +248,7 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_writeble_pte(unsigned long pte) +static int is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } @@ -470,24 +465,10 @@ static int has_wrprotected_page(struct kvm *kvm, static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { - unsigned long page_size = PAGE_SIZE; - struct vm_area_struct *vma; - unsigned long addr; + unsigned long page_size; int i, ret = 0; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return PT_PAGE_TABLE_LEVEL; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) - goto out; - - page_size = vma_kernel_pagesize(vma); - -out: - up_read(¤t->mm->mmap_sem); + page_size = kvm_host_page_size(kvm, gfn); for (i = PT_PAGE_TABLE_LEVEL; i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { @@ -503,8 +484,7 @@ out: static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level; - int level = PT_PAGE_TABLE_LEVEL; + int host_level, level, max_level; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) @@ -515,7 +495,10 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) + max_level = kvm_x86_ops->get_lpage_level() < host_level ? + kvm_x86_ops->get_lpage_level() : host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; @@ -633,7 +616,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pfn = spte_to_pfn(*spte); if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { @@ -662,6 +645,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } + pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); BUG(); } } @@ -708,7 +692,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } @@ -732,7 +716,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { rmap_remove(kvm, spte); --kvm->stat.lpages; __set_spte(spte, shadow_trap_nonpresent_pte); @@ -787,7 +771,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(spte_to_pfn(*spte)); __set_spte(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); @@ -805,35 +789,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data)) { int i, j; + int ret; int retval = 0; + struct kvm_memslots *slots; - /* - * If mmap_sem isn't taken, we can look the memslots with only - * the mmu_lock by skipping over the slots with userspace_addr == 0. - */ - for (i = 0; i < kvm->nmemslots; i++) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + slots = rcu_dereference(kvm->memslots); + + for (i = 0; i < slots->nmemslots; i++) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; unsigned long start = memslot->userspace_addr; unsigned long end; - /* mmu_lock protects userspace_addr */ - if (!start) - continue; - end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - retval |= handler(kvm, &memslot->rmap[gfn_offset], - data); + ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { int idx = gfn_offset; idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); - retval |= handler(kvm, + ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); } + trace_kvm_age_page(hva, memslot, ret); + retval |= ret; } } @@ -856,9 +837,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 *spte; int young = 0; - /* always return old for EPT */ + /* + * Emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ if (!shadow_accessed_mask) - return 0; + return kvm_unmap_rmapp(kvm, rmapp, data); spte = rmap_next(kvm, rmapp, NULL); while (spte) { @@ -1615,7 +1602,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) { - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + int slot = memslot_id(kvm, gfn); struct kvm_mmu_page *sp = page_header(__pa(pte)); __set_bit(slot, sp->slot_bitmap); @@ -1639,7 +1626,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) { struct page *page; - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); if (gpa == UNMAPPED_GVA) return NULL; @@ -1852,7 +1839,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * is responsibility of mmu_get_page / kvm_sync_page. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writeble_pte(*sptep)) + if (!can_unsync && is_writable_pte(*sptep)) goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { @@ -1860,7 +1847,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) + if (is_writable_pte(spte)) spte &= ~PT_WRITABLE_MASK; } } @@ -1881,7 +1868,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writeble = is_writeble_pte(*sptep); + int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -1932,7 +1919,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } else { - if (was_writeble) + if (was_writable) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); @@ -2162,8 +2149,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) { + if (error) + *error = 0; return vaddr; } @@ -2747,7 +2737,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) if (tdp_enabled) return 0; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); @@ -2847,16 +2837,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) */ page = alloc_page(GFP_KERNEL | __GFP_DMA32); if (!page) - goto error_1; + return -ENOMEM; + vcpu->arch.mmu.pae_root = page_address(page); for (i = 0; i < 4; ++i) vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; return 0; - -error_1: - free_mmu_pages(vcpu); - return -ENOMEM; } int kvm_mmu_create(struct kvm_vcpu *vcpu) @@ -2936,10 +2923,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages; + int npages, idx; - if (!down_read_trylock(&kvm->slots_lock)) - continue; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; @@ -2952,7 +2938,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) nr_to_scan--; spin_unlock(&kvm->mmu_lock); - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); @@ -3019,9 +3005,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) int i; unsigned int nr_mmu_pages; unsigned int nr_pages = 0; + struct kvm_memslots *slots; - for (i = 0; i < kvm->nmemslots; i++) - nr_pages += kvm->memslots[i].npages; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) + nr_pages += slots->memslots[i].npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, @@ -3246,7 +3234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) audit_mappings_page(vcpu, ent, va, level - 1); else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; @@ -3291,10 +3279,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu) static int count_rmaps(struct kvm_vcpu *vcpu) { int nmaps = 0; - int i, j, k; + int i, j, k, idx; + idx = srcu_read_lock(&kvm->srcu); + slots = rcu_dereference(kvm->memslots); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_memory_slot *m = &slots->memslots[i]; struct kvm_rmap_desc *d; for (j = 0; j < m->npages; ++j) { @@ -3317,6 +3307,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) } } } + srcu_read_unlock(&kvm->srcu, idx); return nmaps; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61a1b38..be66759 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -2,6 +2,7 @@ #define __KVM_X86_MMU_H #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -37,6 +38,16 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) @@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LMA; -#else - return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PAE; -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PSE; -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr0 & X86_CR0_PG; -} - static inline int is_present_gpte(unsigned long pte) { return pte & PT_PRESENT_MASK; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ede2131..81eab9a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -162,7 +162,7 @@ walk: if (rsvd_fault) goto access_error; - if (write_fault && !is_writeble_pte(pte)) + if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) goto access_error; @@ -490,18 +490,23 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, + u32 *error) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, + !!(access & PFERR_WRITE_MASK), + !!(access & PFERR_USER_MASK), + !!(access & PFERR_FETCH_MASK)); if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; - } + } else if (error) + *error = walker.error_code; return gpa; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1d9b338..52f78dd0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -231,7 +231,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; } static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, @@ -540,6 +540,8 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; + svm->vcpu.fpu_active = 1; + control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK; @@ -552,13 +554,19 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | - INTERCEPT_DR3_MASK; + INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | + INTERCEPT_DR7_MASK; control->intercept_dr_write = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | INTERCEPT_DR7_MASK; control->intercept_exceptions = (1 << PF_VECTOR) | @@ -569,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | (1ULL << INTERCEPT_SMI) | + (1ULL << INTERCEPT_SELECTIVE_CR0) | (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | @@ -641,10 +650,8 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); - control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); - control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); + control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; + control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; save->g_pat = 0x0007040600070406ULL; save->cr3 = 0; save->cr4 = 0; @@ -730,7 +737,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) init_vmcb(svm); fx_init(&svm->vcpu); - svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; @@ -765,14 +771,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(cpu != vcpu->cpu)) { u64 delta; - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - delta = vcpu->arch.host_tsc - native_read_tsc(); - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; + if (check_tsc_unstable()) { + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + delta = vcpu->arch.host_tsc - native_read_tsc(); + svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; + } vcpu->cpu = cpu; kvm_migrate_timers(vcpu); svm->asid_generation = 0; @@ -954,42 +962,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) svm->vmcb->save.gdtr.base = dt->base ; } +static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ +} + static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } +static void update_cr0_intercept(struct vcpu_svm *svm) +{ + ulong gcr0 = svm->vcpu.arch.cr0; + u64 *hcr0 = &svm->vmcb->save.cr0; + + if (!svm->vcpu.fpu_active) + *hcr0 |= SVM_CR0_SELECTIVE_MASK; + else + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); + + + if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + } else { + svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + } +} + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer |= EFER_LMA; + vcpu->arch.efer |= EFER_LMA; svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); } } #endif - if (npt_enabled) - goto set; + vcpu->arch.cr0 = cr0; - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - vcpu->fpu_active = 1; - } + if (!npt_enabled) + cr0 |= X86_CR0_PG | X86_CR0_WP; - vcpu->arch.cr0 = cr0; - cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - } -set: /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at @@ -997,6 +1022,7 @@ set: */ cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; + update_cr0_intercept(svm); } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1102,76 +1128,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->vmcb->control.asid = sd->next_asid++; } -static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) +static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) { struct vcpu_svm *svm = to_svm(vcpu); - unsigned long val; switch (dr) { case 0 ... 3: - val = vcpu->arch.db[dr]; + *dest = vcpu->arch.db[dr]; break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr6; + *dest = vcpu->arch.dr6; else - val = svm->vmcb->save.dr6; + *dest = svm->vmcb->save.dr6; break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr7; + *dest = vcpu->arch.dr7; else - val = svm->vmcb->save.dr7; + *dest = svm->vmcb->save.dr7; break; - default: - val = 0; } - return val; + return EMULATE_DONE; } -static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception) +static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); - *exception = 0; - switch (dr) { case 0 ... 3: vcpu->arch.db[dr] = value; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = value; - return; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - *exception = UD_VECTOR; - return; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; - return; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { svm->vmcb->save.dr7 = vcpu->arch.dr7; vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); } - return; - default: - /* FIXME: Possible case? */ - printk(KERN_DEBUG "%s: unexpected dr %u\n", - __func__, dr); - *exception = UD_VECTOR; - return; + break; } + + return EMULATE_DONE; } static int pf_interception(struct vcpu_svm *svm) @@ -1239,13 +1259,17 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } -static int nm_interception(struct vcpu_svm *svm) +static void svm_fpu_activate(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) - svm->vmcb->save.cr0 &= ~X86_CR0_TS; svm->vcpu.fpu_active = 1; + update_cr0_intercept(svm); +} +static int nm_interception(struct vcpu_svm *svm) +{ + svm_fpu_activate(&svm->vcpu); return 1; } @@ -1337,7 +1361,7 @@ static int vmmcall_interception(struct vcpu_svm *svm) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) + if (!(svm->vcpu.arch.efer & EFER_SVME) || !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -1740,8 +1764,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.ds = vmcb->save.ds; hsave->save.gdtr = vmcb->save.gdtr; hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.shadow_efer; - hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.efer = svm->vcpu.arch.efer; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; hsave->save.rip = svm->next_rip; @@ -2153,9 +2177,10 @@ static int rdmsr_interception(struct vcpu_svm *svm) u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data; - if (svm_get_msr(&svm->vcpu, ecx, &data)) + if (svm_get_msr(&svm->vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); - else { + } else { trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; @@ -2247,13 +2272,15 @@ static int wrmsr_interception(struct vcpu_svm *svm) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) + if (svm_set_msr(&svm->vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); - else + } else { + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(&svm->vcpu); + } return 1; } @@ -2297,7 +2324,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR3] = emulate_on_interception, [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, - /* for now: */ + [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, [SVM_EXIT_WRITE_CR0] = emulate_on_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, @@ -2306,11 +2333,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_READ_DR4] = emulate_on_interception, + [SVM_EXIT_READ_DR5] = emulate_on_interception, + [SVM_EXIT_READ_DR6] = emulate_on_interception, + [SVM_EXIT_READ_DR7] = emulate_on_interception, [SVM_EXIT_WRITE_DR0] = emulate_on_interception, [SVM_EXIT_WRITE_DR1] = emulate_on_interception, [SVM_EXIT_WRITE_DR2] = emulate_on_interception, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR4] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR6] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, @@ -2383,20 +2416,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); - if (npt_enabled) { - int mmu_reload = 0; - if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { - svm_set_cr0(vcpu, svm->vmcb->save.cr0); - mmu_reload = 1; - } + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (mmu_reload) { - kvm_mmu_reset_context(vcpu); - kvm_mmu_load(vcpu); - } - } - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2798,12 +2821,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = root; force_new_asid(vcpu); - - if (vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - svm->vmcb->save.cr0 |= X86_CR0_TS; - vcpu->fpu_active = 0; - } } static int is_disabled(void) @@ -2852,6 +2869,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static void svm_cpuid_update(struct kvm_vcpu *vcpu) +{ +} + static const struct trace_print_flags svm_exit_reasons_str[] = { { SVM_EXIT_READ_CR0, "read_cr0" }, { SVM_EXIT_READ_CR3, "read_cr3" }, @@ -2905,9 +2926,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { { -1, NULL } }; -static bool svm_gb_page_enable(void) +static int svm_get_lpage_level(void) { - return true; + return PT_PDPE_LEVEL; +} + +static bool svm_rdtscp_supported(void) +{ + return false; +} + +static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + update_cr0_intercept(svm); + svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; } static struct kvm_x86_ops svm_x86_ops = { @@ -2936,6 +2970,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, @@ -2950,6 +2985,8 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, + .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, @@ -2975,7 +3012,11 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .exit_reasons_str = svm_exit_reasons_str, - .gb_page_enable = svm_gb_page_enable, + .get_lpage_level = svm_get_lpage_level, + + .cpuid_update = svm_cpuid_update, + + .rdtscp_supported = svm_rdtscp_supported, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 816e044..6ad30a2 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall, ); /* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hv_hypercall, + TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, + __u64 ingpa, __u64 outgpa), + TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), + + TP_STRUCT__entry( + __field( __u16, code ) + __field( bool, fast ) + __field( __u16, rep_cnt ) + __field( __u16, rep_idx ) + __field( __u64, ingpa ) + __field( __u64, outgpa ) + ), + + TP_fast_assign( + __entry->code = code; + __entry->fast = fast; + __entry->rep_cnt = rep_cnt; + __entry->rep_idx = rep_idx; + __entry->ingpa = ingpa; + __entry->outgpa = outgpa; + ), + + TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", + __entry->code, __entry->fast ? "fast" : "slow", + __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, + __entry->outgpa) +); + +/* * Tracepoint for PIO. */ TRACE_EVENT(kvm_pio, @@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault, * Tracepoint for guest MSR access. */ TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), - TP_ARGS(rw, ecx, data), + TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), + TP_ARGS(write, ecx, data, exception), TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, ecx ) - __field( unsigned long, data ) + __field( unsigned, write ) + __field( u32, ecx ) + __field( u64, data ) + __field( u8, exception ) ), TP_fast_assign( - __entry->rw = rw; + __entry->write = write; __entry->ecx = ecx; __entry->data = data; + __entry->exception = exception; ), - TP_printk("msr_%s %x = 0x%lx", - __entry->rw ? "write" : "read", - __entry->ecx, __entry->data) + TP_printk("msr_%s %x = 0x%llx%s", + __entry->write ? "write" : "read", + __entry->ecx, __entry->data, + __entry->exception ? " (#GP)" : "") ); -#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) -#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) +#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) +#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) /* * Tracepoint for guest CR access. diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4918d6..14873b9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -61,6 +61,21 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE) +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT) + +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -136,6 +151,8 @@ struct vcpu_vmx { ktime_t entry_time; s64 vnmi_blocked_time; u32 exit_reason; + + bool rdtscp_enabled; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -210,7 +227,7 @@ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_K6_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -301,6 +318,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void) return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); } +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); +} + static inline int cpu_has_vmx_invept_individual_addr(void) { return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); @@ -336,9 +358,7 @@ static inline int cpu_has_vmx_ple(void) static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { - return flexpriority_enabled && - (cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm)); + return flexpriority_enabled && irqchip_in_kernel(kvm); } static inline int cpu_has_vmx_vpid(void) @@ -347,6 +367,12 @@ static inline int cpu_has_vmx_vpid(void) SECONDARY_EXEC_ENABLE_VPID; } +static inline int cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDTSCP; +} + static inline int cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -551,22 +577,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); - if (!vcpu->fpu_active) - eb |= 1u << NM_VECTOR; - /* - * Unconditionally intercept #DB so we can maintain dr6 without - * reading it every exit. - */ - eb |= 1u << DB_VECTOR; - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - eb |= 1u << BP_VECTOR; - } + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | + (1u << NM_VECTOR) | (1u << DB_VECTOR); + if ((vcpu->guest_debug & + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + if (vcpu->fpu_active) + eb &= ~(1u << NM_VECTOR); vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -589,7 +611,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer; u64 ignore_bits; - guest_efer = vmx->vcpu.arch.shadow_efer; + guest_efer = vmx->vcpu.arch.efer; /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless @@ -767,22 +789,30 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static void vmx_fpu_activate(struct kvm_vcpu *vcpu) { + ulong cr0; + if (vcpu->fpu_active) return; vcpu->fpu_active = 1; - vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->arch.cr0 & X86_CR0_TS) - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + cr0 = vmcs_readl(GUEST_CR0); + cr0 &= ~(X86_CR0_TS | X86_CR0_MP); + cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); + vmcs_writel(GUEST_CR0, cr0); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); + static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active) - return; - vcpu->fpu_active = 0; - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + vmx_decache_cr0_guest_bits(vcpu); + vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = 0; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -878,6 +908,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } +static bool vmx_rdtscp_supported(void) +{ + return cpu_has_vmx_rdtscp(); +} + /* * Swap MSR entry in host/guest MSR entry array. */ @@ -913,12 +948,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && vmx->rdtscp_enabled) + move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ index = __find_msr_index(vmx, MSR_K6_STAR); - if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) + if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } #endif @@ -1002,6 +1040,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_TSC_AUX: + if (!to_vmx(vcpu)->rdtscp_enabled) + return 1; + /* Otherwise falls through */ default: vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); @@ -1065,7 +1107,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - /* Otherwise falls through to kvm_set_msr_common */ + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; + case MSR_TSC_AUX: + if (!vmx->rdtscp_enabled) + return 1; + /* Check reserved bit, higher 32 bits should be zero */ + if ((data >> 32) != 0) + return 1; + /* Otherwise falls through */ default: msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -1224,6 +1274,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | @@ -1243,7 +1295,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING; + SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_RDTSCP; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1457,8 +1510,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu) static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + - kvm->memslots[0].npages - 3; + struct kvm_memslots *slots; + gfn_t base_gfn; + + slots = rcu_dereference(kvm->memslots); + base_gfn = kvm->memslots->memslots[0].base_gfn + + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } return kvm->arch.tss_addr; @@ -1544,9 +1601,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) * of this msr depends on is_long_mode(). */ vmx_load_host_state(to_vmx(vcpu)); - vcpu->arch.shadow_efer = efer; - if (!msr) - return; + vcpu->arch.efer = efer; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1576,13 +1631,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu) (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; - vmx_set_efer(vcpu, vcpu->arch.shadow_efer); + vcpu->arch.efer |= EFER_LMA; + vmx_set_efer(vcpu, vcpu->arch.efer); } static void exit_lmode(struct kvm_vcpu *vcpu) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) @@ -1598,10 +1653,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -1646,7 +1711,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1654,23 +1719,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } if (!(cr0 & X86_CR0_WP)) *hw_cr0 &= ~X86_CR0_WP; } -static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, - struct kvm_vcpu *vcpu) -{ - if (!is_paging(vcpu)) { - *hw_cr4 &= ~X86_CR4_PAE; - *hw_cr4 |= X86_CR4_PSE; - } else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) - *hw_cr4 &= ~X86_CR4_PAE; -} - static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1682,8 +1737,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) else hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; - vmx_fpu_deactivate(vcpu); - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -1691,7 +1744,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) @@ -1702,12 +1755,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if (!vcpu->fpu_active) + hw_cr0 |= X86_CR0_TS | X86_CR0_MP; + vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - - if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) - vmx_fpu_activate(vcpu); } static u64 construct_eptp(unsigned long root_hpa) @@ -1738,8 +1791,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, guest_cr3); - if (vcpu->arch.cr0 & X86_CR0_PE) - vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1748,8 +1799,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; - if (enable_ept) - ept_update_paging_mode_cr4(&hw_cr4, vcpu); + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } + } vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); @@ -1787,7 +1844,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ + if (!is_protmode(vcpu)) return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ @@ -2042,7 +2099,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) static bool guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ - if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!is_protmode(vcpu)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) @@ -2175,7 +2232,7 @@ static int alloc_apic_access_page(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -2188,7 +2245,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2197,7 +2254,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.ept_identity_pagetable) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; @@ -2212,7 +2269,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2384,14 +2441,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; - u64 data; int j = vmx->nmsrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; - data = data_low | ((u64)data_high << 32); vmx->guest_msrs[j].index = i; vmx->guest_msrs[j].data = 0; vmx->guest_msrs[j].mask = -1ull; @@ -2404,7 +2459,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); @@ -2429,10 +2487,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret; + int ret, idx; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2526,7 +2584,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ + vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); vmx_fpu_activate(&vmx->vcpu); @@ -2540,7 +2598,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } @@ -2717,6 +2775,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, kvm_queue_exception(vcpu, vec); return 1; case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return 0; /* fall through */ @@ -2839,6 +2903,13 @@ static int handle_exception(struct kvm_vcpu *vcpu) kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); /* fall through */ case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; @@ -2940,11 +3011,10 @@ static int handle_cr(struct kvm_vcpu *vcpu) }; break; case 2: /* clts */ - vmx_fpu_deactivate(vcpu); - vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); - vmx_fpu_activate(vcpu); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); + vmx_fpu_activate(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { @@ -2962,7 +3032,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 3: /* lmsw */ - kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); return 1; @@ -2975,12 +3047,22 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 0; } +static int check_dr_alias(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return -1; + } + return 0; +} + static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; unsigned long val; int dr, reg; + /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; dr = vmcs_readl(GUEST_DR7); @@ -3016,14 +3098,20 @@ static int handle_dr(struct kvm_vcpu *vcpu) case 0 ... 3: val = vcpu->arch.db[dr]; break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ case 6: val = vcpu->arch.dr6; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ val = vcpu->arch.dr7; break; - default: - val = 0; } kvm_register_write(vcpu, reg, val); } else { @@ -3034,21 +3122,25 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - kvm_queue_exception(vcpu, UD_VECTOR); - break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ case 6: if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { @@ -3075,6 +3167,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) u64 data; if (vmx_get_msr(vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } @@ -3094,13 +3187,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; } + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(vcpu); return 1; } @@ -3385,7 +3478,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) } if (err != EMULATE_DONE) { - kvm_report_emulation_failure(vcpu, "emulation failure"); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -3416,6 +3508,12 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -3453,6 +3551,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, }; static const int kvm_vmx_max_exit_handlers = @@ -3686,9 +3786,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ vmcs_writel(HOST_CR0, read_cr0()); - if (vcpu->arch.switch_db_regs) - set_debugreg(vcpu->arch.dr6, 6); - asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -3789,9 +3886,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; - if (vcpu->arch.switch_db_regs) - get_debugreg(vcpu->arch.dr6, 6); - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); @@ -3920,7 +4014,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * b. VT-d with snooping control feature: snooping control feature of * VT-d engine can guarantee the cache correctness. Just set it * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep + * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ if (is_mmio) @@ -3931,37 +4025,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) VMX_EPT_MT_EPTE_SHIFT; else ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) - | VMX_EPT_IGMT_BIT; + | VMX_EPT_IPAT_BIT; return ret; } +#define _ER(x) { EXIT_REASON_##x, #x } + static const struct trace_print_flags vmx_exit_reasons_str[] = { - { EXIT_REASON_EXCEPTION_NMI, "exception" }, - { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, - { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, - { EXIT_REASON_NMI_WINDOW, "nmi_window" }, - { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, - { EXIT_REASON_CR_ACCESS, "cr_access" }, - { EXIT_REASON_DR_ACCESS, "dr_access" }, - { EXIT_REASON_CPUID, "cpuid" }, - { EXIT_REASON_MSR_READ, "rdmsr" }, - { EXIT_REASON_MSR_WRITE, "wrmsr" }, - { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, - { EXIT_REASON_HLT, "halt" }, - { EXIT_REASON_INVLPG, "invlpg" }, - { EXIT_REASON_VMCALL, "hypercall" }, - { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, - { EXIT_REASON_APIC_ACCESS, "apic_access" }, - { EXIT_REASON_WBINVD, "wbinvd" }, - { EXIT_REASON_TASK_SWITCH, "task_switch" }, - { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + _ER(EXCEPTION_NMI), + _ER(EXTERNAL_INTERRUPT), + _ER(TRIPLE_FAULT), + _ER(PENDING_INTERRUPT), + _ER(NMI_WINDOW), + _ER(TASK_SWITCH), + _ER(CPUID), + _ER(HLT), + _ER(INVLPG), + _ER(RDPMC), + _ER(RDTSC), + _ER(VMCALL), + _ER(VMCLEAR), + _ER(VMLAUNCH), + _ER(VMPTRLD), + _ER(VMPTRST), + _ER(VMREAD), + _ER(VMRESUME), + _ER(VMWRITE), + _ER(VMOFF), + _ER(VMON), + _ER(CR_ACCESS), + _ER(DR_ACCESS), + _ER(IO_INSTRUCTION), + _ER(MSR_READ), + _ER(MSR_WRITE), + _ER(MWAIT_INSTRUCTION), + _ER(MONITOR_INSTRUCTION), + _ER(PAUSE_INSTRUCTION), + _ER(MCE_DURING_VMENTRY), + _ER(TPR_BELOW_THRESHOLD), + _ER(APIC_ACCESS), + _ER(EPT_VIOLATION), + _ER(EPT_MISCONFIG), + _ER(WBINVD), { -1, NULL } }; -static bool vmx_gb_page_enable(void) +#undef _ER + +static int vmx_get_lpage_level(void) +{ + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; +} + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + +static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { - return false; + struct kvm_cpuid_entry2 *best; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control; + + vmx->rdtscp_enabled = false; + if (vmx_rdtscp_supported()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + if (exec_control & SECONDARY_EXEC_RDTSCP) { + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } + } + } } static struct kvm_x86_ops vmx_x86_ops = { @@ -3990,6 +4135,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, @@ -4002,6 +4148,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, + .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, @@ -4027,7 +4175,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .exit_reasons_str = vmx_exit_reasons_str, - .gb_page_enable = vmx_gb_page_enable, + .get_lpage_level = vmx_get_lpage_level, + + .cpuid_update = vmx_cpuid_update, + + .rdtscp_supported = vmx_rdtscp_supported, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1e1bc9..e46282a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -38,6 +38,7 @@ #include <linux/intel-iommu.h> #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> +#include <linux/srcu.h> #include <trace/events/kvm.h> #undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS @@ -93,16 +94,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); struct kvm_shared_msrs_global { int nr; - struct kvm_shared_msr { - u32 msr; - u64 value; - } msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_NR_SHARED_MSRS]; }; struct kvm_shared_msrs { struct user_return_notifier urn; bool registered; - u64 current_value[KVM_NR_SHARED_MSRS]; + struct kvm_shared_msr_values { + u64 host; + u64 curr; + } values[KVM_NR_SHARED_MSRS]; }; static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; @@ -147,53 +148,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msr *global; struct kvm_shared_msrs *locals = container_of(urn, struct kvm_shared_msrs, urn); + struct kvm_shared_msr_values *values; for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - global = &shared_msrs_global.msrs[slot]; - if (global->value != locals->current_value[slot]) { - wrmsrl(global->msr, global->value); - locals->current_value[slot] = global->value; + values = &locals->values[slot]; + if (values->host != values->curr) { + wrmsrl(shared_msrs_global.msrs[slot], values->host); + values->curr = values->host; } } locals->registered = false; user_return_notifier_unregister(urn); } -void kvm_define_shared_msr(unsigned slot, u32 msr) +static void shared_msr_update(unsigned slot, u32 msr) { - int cpu; + struct kvm_shared_msrs *smsr; u64 value; + smsr = &__get_cpu_var(shared_msrs); + /* only read, and nobody should modify it at this time, + * so don't need lock */ + if (slot >= shared_msrs_global.nr) { + printk(KERN_ERR "kvm: invalid MSR slot!"); + return; + } + rdmsrl_safe(msr, &value); + smsr->values[slot].host = value; + smsr->values[slot].curr = value; +} + +void kvm_define_shared_msr(unsigned slot, u32 msr) +{ if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot].msr = msr; - rdmsrl_safe(msr, &value); - shared_msrs_global.msrs[slot].value = value; - for_each_online_cpu(cpu) - per_cpu(shared_msrs, cpu).current_value[slot] = value; + shared_msrs_global.msrs[slot] = msr; + /* we need ensured the shared_msr_global have been updated */ + smp_wmb(); } EXPORT_SYMBOL_GPL(kvm_define_shared_msr); static void kvm_shared_msr_cpu_online(void) { unsigned i; - struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs); for (i = 0; i < shared_msrs_global.nr; ++i) - locals->current_value[i] = shared_msrs_global.msrs[i].value; + shared_msr_update(i, shared_msrs_global.msrs[i]); } void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); - if (((value ^ smsr->current_value[slot]) & mask) == 0) + if (((value ^ smsr->values[slot].curr) & mask) == 0) return; - smsr->current_value[slot] = value; - wrmsrl(shared_msrs_global.msrs[slot].msr, value); + smsr->values[slot].curr = value; + wrmsrl(shared_msrs_global.msrs[slot], value); if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -257,12 +269,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 + +static int exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return EXCPT_PF; + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return EXCPT_CONTRIBUTORY; + default: + break; + } + return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, bool has_error, u32 error_code) +{ + u32 prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { + queue: + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } + + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) + || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + goto queue; +} + void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.nr = nr; + kvm_multiple_exception(vcpu, nr, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); @@ -270,25 +338,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; - - if (vcpu->arch.exception.pending) { - switch(vcpu->arch.exception.nr) { - case DF_VECTOR: - /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); - return; - case PF_VECTOR: - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - return; - default: - /* replace previous exception with a new one in a hope - that instruction re-execution will regenerate lost - exception */ - vcpu->arch.exception.pending = false; - break; - } - } vcpu->arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } @@ -301,11 +350,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; + kvm_multiple_exception(vcpu, nr, true, error_code); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); @@ -383,12 +428,18 @@ out: void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - if (cr0 & CR0_RESERVED_BITS) { + cr0 |= X86_CR0_ET; + +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); + cr0, kvm_read_cr0(vcpu)); kvm_inject_gp(vcpu, 0); return; } +#endif + + cr0 &= ~CR0_RESERVED_BITS; if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); @@ -405,7 +456,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.shadow_efer & EFER_LME)) { + if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) { @@ -443,13 +494,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = vcpu->arch.cr4; + unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; if (cr4 & CR4_RESERVED_BITS) { @@ -575,9 +626,11 @@ static inline u32 bit(int bitno) * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 2 +#define KVM_SAVE_MSRS_BEGIN 5 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 @@ -602,7 +655,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) } if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); kvm_inject_gp(vcpu, 0); return; @@ -633,9 +686,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); efer &= ~EFER_LMA; - efer |= vcpu->arch.shadow_efer & EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); @@ -957,6 +1010,100 @@ out: return r; } +static bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = true; + break; + } + + return r; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copy_to_user((void __user *)addr, instructions, 4)) + return 1; + kvm->arch.hv_hypercall = data; + break; + } + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + return 0; +} + +static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + addr = gfn_to_hva(vcpu->kvm, data >> + HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + if (kvm_is_error_hva(addr)) + return 1; + if (clear_user((void __user *)addr, PAGE_SIZE)) + return 1; + vcpu->arch.hv_vapic = data; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + + return 0; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1071,6 +1218,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return set_msr_hyperv(vcpu, msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1170,6 +1327,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -1221,7 +1426,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: - data = vcpu->arch.shadow_efer; + data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: data = vcpu->kvm->arch.wall_clock; @@ -1236,6 +1441,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return get_msr_hyperv(vcpu, msr, pdata); + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -1261,15 +1476,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { - int i; + int i, idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu_put(vcpu); @@ -1351,6 +1566,11 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XEN_HVM: case KVM_CAP_ADJUST_CLOCK: case KVM_CAP_VCPU_EVENTS: + case KVM_CAP_HYPERV: + case KVM_CAP_HYPERV_VAPIC: + case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_PCI_SEGMENT: + case KVM_CAP_X86_ROBUST_SINGLESTEP: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1464,8 +1684,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + kvm_x86_ops->vcpu_put(vcpu); } static int is_efer_nx(void) @@ -1530,6 +1750,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, cpuid_fix_nx_cap(vcpu); r = 0; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); out_free: vfree(cpuid_entries); @@ -1552,6 +1773,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, goto out; vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); return 0; out: @@ -1594,12 +1816,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; - unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; unsigned f_lm = F(LM); #else + unsigned f_gbpages = 0; unsigned f_lm = 0; #endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -1619,7 +1844,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = @@ -1866,7 +2091,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !(vcpu->arch.cr4 & X86_CR4_MCE)) { + !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); @@ -2160,14 +2385,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; spin_unlock(&kvm->mmu_lock); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } @@ -2176,13 +2401,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) return kvm->arch.n_alloc_mmu_pages; } +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; - for (i = 0; i < kvm->arch.naliases; ++i) { - alias = &kvm->arch.aliases[i]; + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; if (gfn >= alias->base_gfn && gfn < alias->base_gfn + alias->npages) return alias->target_gfn + gfn - alias->base_gfn; @@ -2200,6 +2447,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, { int r, n; struct kvm_mem_alias *p; + struct kvm_mem_aliases *aliases, *old_aliases; r = -EINVAL; /* General sanity checks */ @@ -2216,26 +2464,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, < alias->target_phys_addr) goto out; - down_write(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out; + + mutex_lock(&kvm->slots_lock); - p = &kvm->arch.aliases[alias->slot]; + /* invalidate any gfn reference in case of deletion/shrinking */ + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kvm_mmu_zap_all(kvm); + kfree(old_aliases); + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out_unlock; + + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + + p = &aliases->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + p->flags &= ~(KVM_ALIAS_INVALID); for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->arch.aliases[n - 1].npages) + if (aliases->aliases[n - 1].npages) break; - kvm->arch.naliases = n; + aliases->naliases = n; - spin_unlock(&kvm->mmu_lock); - kvm_mmu_zap_all(kvm); - - up_write(&kvm->slots_lock); - - return 0; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kfree(old_aliases); + r = 0; +out_unlock: + mutex_unlock(&kvm->slots_lock); out: return r; } @@ -2273,18 +2543,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -2364,29 +2634,62 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - int n; + int r, n, i; struct kvm_memory_slot *memslot; - int is_dirty = 0; + unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) goto out; + memset(dirty_bitmap, 0, n); + + for (i = 0; !is_dirty && i < n/sizeof(long); i++) + is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - memset(memslot->dirty_bitmap, 0, n); + + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + + old_slots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kfree(old_slots); } + r = 0; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) + r = -EFAULT; +out_free: + vfree(dirty_bitmap); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2469,6 +2772,8 @@ long kvm_arch_vm_ioctl(struct file *filp, if (vpic) { r = kvm_ioapic_init(kvm); if (r) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev); kfree(vpic); goto create_irqchip_unlock; } @@ -2480,10 +2785,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->irq_lock); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm->arch.vpic = NULL; - kvm->arch.vioapic = NULL; + kvm_ioapic_destroy(kvm); + kvm_destroy_pic(kvm); mutex_unlock(&kvm->irq_lock); } create_irqchip_unlock: @@ -2499,7 +2802,7 @@ long kvm_arch_vm_ioctl(struct file *filp, sizeof(struct kvm_pit_config))) goto out; create_pit: - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -2508,7 +2811,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpit) r = 0; create_pit_unlock: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); break; case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { @@ -2725,7 +3028,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) @@ -2734,17 +3037,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_FETCH_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +/* uses this to access any guest's mapped memory without checking CPL */ +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); +} + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 access, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2767,14 +3097,37 @@ out: return r; } +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error); +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + error); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} + static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2804,6 +3157,7 @@ static int emulator_read_emulated(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -2813,17 +3167,20 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; mmio: /* @@ -2862,11 +3219,12 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; } @@ -2930,7 +3288,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, char *kaddr; u64 val; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); if (gpa == UNMAPPED_GVA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -2967,35 +3325,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + kvm_x86_ops->fpu_activate(vcpu); return X86EMUL_CONTINUE; } int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - struct kvm_vcpu *vcpu = ctxt->vcpu; - - switch (dr) { - case 0 ... 3: - *dest = kvm_x86_ops->get_dr(vcpu, dr); - return X86EMUL_CONTINUE; - default: - pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); - return X86EMUL_UNHANDLEABLE; - } + return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); } int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - int exception; - kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); - if (exception) { - /* FIXME: better handling */ - return X86EMUL_UNHANDLEABLE; - } - return X86EMUL_CONTINUE; + return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) @@ -3009,7 +3353,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -3017,7 +3361,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt, + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, @@ -3060,8 +3405,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.vcpu = vcpu; vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; @@ -3153,12 +3499,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu) gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; int ret; + u32 error_code; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu); + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); else - ret = kvm_read_guest_virt(q, p, bytes, vcpu); + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + return ret; } @@ -3179,7 +3530,7 @@ int complete_pio(struct kvm_vcpu *vcpu) if (io->in) { r = pio_copy_data(vcpu); if (r) - return r; + goto out; } delta = 1; @@ -3206,7 +3557,7 @@ int complete_pio(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RSI, val); } } - +out: io->count -= io->cur_count; io->cur_count = 0; @@ -3219,11 +3570,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) int r; if (vcpu->arch.pio.in) - r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); else - r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); return r; } @@ -3234,7 +3586,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu) int i, r = 0; for (i = 0; i < io->cur_count; i++) { - if (kvm_io_bus_write(&vcpu->kvm->pio_bus, + if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, io->port, io->size, pd)) { r = -EOPNOTSUPP; break; @@ -3248,6 +3600,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) { unsigned long val; + trace_kvm_pio(!in, port, size, 1); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3259,11 +3613,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, 1); - - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(vcpu->arch.pio_data, &val, 4); + if (!vcpu->arch.pio.in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); + } if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { complete_pio(vcpu); @@ -3280,6 +3633,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, unsigned now, in_page; int ret = 0; + trace_kvm_pio(!in, port, size, count); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3291,9 +3646,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, count); - if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); return 1; @@ -3325,10 +3677,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_gp(vcpu, 0); + if (ret == X86EMUL_PROPAGATE_FAULT) return 1; - } if (ret == 0 && !pio_string_write(vcpu)) { complete_pio(vcpu); if (vcpu->arch.pio.count == 0) @@ -3487,11 +3837,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, return a0 | ((gpa_t)a1 << 32); } +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; int r = 1; + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -3534,10 +3949,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); int kvm_fix_hypercall(struct kvm_vcpu *vcpu) { char instruction[3]; - int ret = 0; unsigned long rip = kvm_rip_read(vcpu); - /* * Blow out the MMU to ensure that no other VCPU has an active mapping * to ensure that the updated hypercall appears atomically across all @@ -3546,11 +3959,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_mmu_zap_all(vcpu->kvm); kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(rip, instruction, 3, vcpu) - != X86EMUL_CONTINUE) - ret = -EFAULT; - return ret; + return emulator_write_emulated(rip, instruction, 3, vcpu); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -3583,10 +3993,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) { unsigned long value; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); switch (cr) { case 0: - value = vcpu->arch.cr0; + value = kvm_read_cr0(vcpu); break; case 2: value = vcpu->arch.cr2; @@ -3595,7 +4004,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) value = vcpu->arch.cr3; break; case 4: - value = vcpu->arch.cr4; + value = kvm_read_cr4(vcpu); break; case 8: value = kvm_get_cr8(vcpu); @@ -3613,7 +4022,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, { switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); + kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); *rflags = kvm_get_rflags(vcpu); break; case 2: @@ -3623,7 +4032,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); + kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: kvm_set_cr8(vcpu, val & 0xfUL); @@ -3690,6 +4099,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, } return best; } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { @@ -3773,14 +4183,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu) static void vapic_exit(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + int idx; if (!apic || !apic->vapic_addr) return; - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_release_page_dirty(apic->vapic_page); mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -3876,12 +4287,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } } preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); local_irq_disable(); @@ -3909,7 +4325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_to_vapic(vcpu); } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -3951,7 +4367,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); /* * Profile KVM exit RIPs: @@ -3973,6 +4389,7 @@ out: static int __vcpu_run(struct kvm_vcpu *vcpu) { int r; + struct kvm *kvm = vcpu->kvm; if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { pr_debug("vcpu %d received sipi with vector # %x\n", @@ -3984,7 +4401,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vapic_enter(vcpu); r = 1; @@ -3992,9 +4409,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) r = vcpu_enter_guest(vcpu); else { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) { switch(vcpu->arch.mp_state) { @@ -4029,13 +4446,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) ++vcpu->stat.signal_exits; } if (need_resched()) { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); post_kvm_run_save(vcpu); vapic_exit(vcpu); @@ -4074,10 +4491,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, EMULTYPE_NO_DECODE); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r == EMULATE_DO_MMIO) { /* * Read-modify-write. Back to userspace. @@ -4204,13 +4621,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - sregs->cr0 = vcpu->arch.cr0; + sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = vcpu->arch.cr4; + sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); - sregs->efer = vcpu->arch.shadow_efer; + sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); @@ -4298,14 +4714,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, { struct descriptor_table dtable; u16 index = selector >> 3; + int ret; + u32 err; + gva_t addr; get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) { kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); - return 1; + return X86EMUL_PROPAGATE_FAULT; } - return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), + vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); + + return ret; } /* allowed just for 8 bytes segments */ @@ -4319,15 +4744,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); +} + +static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc) +{ + u32 base_addr = get_desc_base(seg_desc); + + return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL); } -static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, +static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) { u32 base_addr = get_desc_base(seg_desc); - return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); + return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL); } static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) @@ -4338,18 +4771,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) return kvm_seg.selector; } -static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, - u16 selector, - struct kvm_segment *kvm_seg) -{ - struct desc_struct seg_desc; - - if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) - return 1; - seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); - return 0; -} - static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment segvar = { @@ -4367,7 +4788,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se .unusable = 0, }; kvm_x86_ops->set_segment(vcpu, &segvar, seg); - return 0; + return X86EMUL_CONTINUE; } static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) @@ -4377,24 +4798,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg) +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment kvm_seg; + struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; - if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) return kvm_load_realmode_segment(vcpu, selector, seg); - if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) - return 1; - kvm_seg.type |= type_bits; - if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && - seg != VCPU_SREG_LDTR) - if (!kvm_seg.s) - kvm_seg.unusable = 1; + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + if (ret) + return ret; + + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; + } + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } +load: kvm_set_segment(vcpu, &kvm_seg, seg); - return 0; + return X86EMUL_CONTINUE; +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); + return X86EMUL_PROPAGATE_FAULT; } static void save_state_to_tss32(struct kvm_vcpu *vcpu, @@ -4420,6 +4929,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); } +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg) +{ + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + static int load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { @@ -4437,25 +4954,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) + if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) + if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) return 1; return 0; } @@ -4495,19 +5028,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; return 0; } @@ -4529,7 +5076,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_16)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_16, sizeof tss_segment_16)) goto out; @@ -4537,7 +5084,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_16.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_16.prev_task_link, sizeof tss_segment_16.prev_task_link)) goto out; @@ -4568,7 +5115,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_32)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_32, sizeof tss_segment_32)) goto out; @@ -4576,7 +5123,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_32.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_32.prev_task_link, sizeof tss_segment_32.prev_task_link)) goto out; @@ -4599,7 +5146,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); - old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); + old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); /* FIXME: Handle errors. Failure to read either TSS or their * descriptors should generate a pagefault. @@ -4658,7 +5205,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) &nseg_desc); } - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); tr_seg.type = 11; kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); @@ -4689,17 +5236,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; + mmu_reset_needed |= vcpu->arch.efer != sregs->efer; kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; - mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.cr3); @@ -4734,7 +5279,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Older userspace won't unhalt the vcpu on reset. */ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !(vcpu->arch.cr0 & X86_CR0_PE)) + !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; vcpu_put(vcpu); @@ -4832,11 +5377,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { unsigned long vaddr = tr->linear_address; gpa_t gpa; + int idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - up_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; @@ -4917,14 +5463,14 @@ EXPORT_SYMBOL_GPL(fx_init); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + if (vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 1; kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_restore(&vcpu->arch.guest_fx_image); + trace_kvm_fpu(1); } -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { @@ -4935,8 +5481,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) kvm_fx_save(&vcpu->arch.guest_fx_image); kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; + set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + trace_kvm_fpu(0); } -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { @@ -5088,11 +5635,13 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + int idx; + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); } @@ -5103,6 +5652,12 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); + kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!kvm->arch.aliases) { + kfree(kvm); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5159,16 +5714,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + cleanup_srcu_struct(&kvm->srcu); + kfree(kvm->arch.aliases); kfree(kvm); } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + int npages = memslot->npages; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5188,26 +5745,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (IS_ERR((void *)userspace_addr)) return PTR_ERR((void *)userspace_addr); - /* set userspace_addr atomically for kvm_hva_to_rmapp */ - spin_lock(&kvm->mmu_lock); memslot->userspace_addr = userspace_addr; - spin_unlock(&kvm->mmu_lock); - } else { - if (!old.user_alloc && old.rmap) { - int ret; - - down_write(¤t->mm->mmap_sem); - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } } } + + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + + int npages = mem->memory_size >> PAGE_SHIFT; + + if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + int ret; + + down_write(¤t->mm->mmap_sem); + ret = do_munmap(current->mm, old.userspace_addr, + old.npages * PAGE_SIZE); + up_write(¤t->mm->mmap_sem); + if (ret < 0) + printk(KERN_WARNING + "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages) { unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); @@ -5216,8 +5782,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5eadea5..2d101639 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_X86_H #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { @@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr) struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); +static inline bool is_protmode(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PE); +} + +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->arch.efer & EFER_LMA; +#else + return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PG); +} + #endif diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 0b7d3e9..b110d97 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -13,6 +13,8 @@ obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +obj-$(CONFIG_X86_MRST) += mrst.o + obj-y += common.o early.o obj-y += amd_bus.o bus_numa.o diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 5f11ff6..6e22454 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -298,17 +298,14 @@ int __init pci_acpi_init(void) { struct pci_dev *dev = NULL; - if (pcibios_scanned) - return 0; - if (acpi_noirq) - return 0; + return -ENODEV; printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); acpi_irq_penalty_init(); - pcibios_scanned++; pcibios_enable_irq = acpi_pci_irq_enable; pcibios_disable_irq = acpi_pci_irq_disable; + x86_init.pci.init_irq = x86_init_noop; if (pci_routeirq) { /* diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3736176..294e10c 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -72,12 +72,6 @@ struct pci_ops pci_root_ops = { }; /* - * legacy, numa, and acpi all want to call pcibios_scan_root - * from their initcalls. This flag prevents that. - */ -int pcibios_scanned; - -/* * This interrupt-safe spinlock protects all accesses to PCI * configuration space. */ diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 25a1f8e..adb62aa 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -1,6 +1,7 @@ #include <linux/pci.h> #include <linux/init.h> #include <asm/pci_x86.h> +#include <asm/x86_init.h> /* arch_initcall has too random ordering, so call the initializers in the right sequence from here. */ @@ -15,10 +16,9 @@ static __init int pci_arch_init(void) if (!(pci_probe & PCI_PROBE_NOEARLY)) pci_mmcfg_early_init(); -#ifdef CONFIG_PCI_OLPC - if (!pci_olpc_init()) - return 0; /* skip additional checks if it's an XO */ -#endif + if (x86_init.pci.arch_init && !x86_init.pci.arch_init()) + return 0; + #ifdef CONFIG_PCI_BIOS pci_pcbios_init(); #endif diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index b02f6d8..8b10752 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -53,7 +53,7 @@ struct irq_router_handler { int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); }; -int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; +int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq; void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; /* @@ -1018,7 +1018,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) return 1; } -static void __init pcibios_fixup_irqs(void) +void __init pcibios_fixup_irqs(void) { struct pci_dev *dev = NULL; u8 pin; @@ -1112,12 +1112,12 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = { { } }; -int __init pcibios_irq_init(void) +void __init pcibios_irq_init(void) { DBG(KERN_DEBUG "PCI: IRQ init\n"); - if (pcibios_enable_irq || raw_pci_ops == NULL) - return 0; + if (raw_pci_ops == NULL) + return; dmi_check_system(pciirq_dmi_table); @@ -1144,9 +1144,7 @@ int __init pcibios_irq_init(void) pirq_table = NULL; } - pcibios_enable_irq = pirq_enable_irq; - - pcibios_fixup_irqs(); + x86_init.pci.fixup_irqs(); if (io_apic_assign_pci_irqs && pci_routeirq) { struct pci_dev *dev = NULL; @@ -1159,8 +1157,6 @@ int __init pcibios_irq_init(void) for_each_pci_dev(dev) pirq_enable_irq(dev); } - - return 0; } static void pirq_penalize_isa_irq(int irq, int active) diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 4061bb0..0db5eaf 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -35,16 +35,13 @@ static void __devinit pcibios_fixup_peer_bridges(void) } } -static int __init pci_legacy_init(void) +int __init pci_legacy_init(void) { if (!raw_pci_ops) { printk("PCI: System does not support PCI\n"); return 0; } - if (pcibios_scanned++) - return 0; - printk("PCI: Probing PCI hardware\n"); pci_root_bus = pcibios_scan_root(0); if (pci_root_bus) @@ -55,18 +52,15 @@ static int __init pci_legacy_init(void) int __init pci_subsys_init(void) { -#ifdef CONFIG_X86_NUMAQ - pci_numaq_init(); -#endif -#ifdef CONFIG_ACPI - pci_acpi_init(); -#endif -#ifdef CONFIG_X86_VISWS - pci_visws_init(); -#endif - pci_legacy_init(); + /* + * The init function returns an non zero value when + * pci_legacy_init should be invoked. + */ + if (x86_init.pci.init()) + pci_legacy_init(); + pcibios_fixup_peer_bridges(); - pcibios_irq_init(); + x86_init.pci.init_irq(); pcibios_init(); return 0; diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c new file mode 100644 index 0000000..8bf2fcb --- /dev/null +++ b/arch/x86/pci/mrst.c @@ -0,0 +1,262 @@ +/* + * Moorestown PCI support + * Copyright (c) 2008 Intel Corporation + * Jesse Barnes <jesse.barnes@intel.com> + * + * Moorestown has an interesting PCI implementation: + * - configuration space is memory mapped (as defined by MCFG) + * - Lincroft devices also have a real, type 1 configuration space + * - Early Lincroft silicon has a type 1 access bug that will cause + * a hang if non-existent devices are accessed + * - some devices have the "fixed BAR" capability, which means + * they can't be relocated or modified; check for that during + * BAR sizing + * + * So, we use the MCFG space for all reads and writes, but also send + * Lincroft writes to type 1 space. But only read/write if the device + * actually exists, otherwise return all 1s for reads and bit bucket + * the writes. + */ + +#include <linux/sched.h> +#include <linux/pci.h> +#include <linux/ioport.h> +#include <linux/init.h> +#include <linux/dmi.h> + +#include <asm/acpi.h> +#include <asm/segment.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/pci_x86.h> +#include <asm/hw_irq.h> +#include <asm/io_apic.h> + +#define PCIE_CAP_OFFSET 0x100 + +/* Fixed BAR fields */ +#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */ +#define PCI_FIXED_BAR_0_SIZE 0x04 +#define PCI_FIXED_BAR_1_SIZE 0x08 +#define PCI_FIXED_BAR_2_SIZE 0x0c +#define PCI_FIXED_BAR_3_SIZE 0x10 +#define PCI_FIXED_BAR_4_SIZE 0x14 +#define PCI_FIXED_BAR_5_SIZE 0x1c + +/** + * fixed_bar_cap - return the offset of the fixed BAR cap if found + * @bus: PCI bus + * @devfn: device in question + * + * Look for the fixed BAR cap on @bus and @devfn, returning its offset + * if found or 0 otherwise. + */ +static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn) +{ + int pos; + u32 pcie_cap = 0, cap_data; + + pos = PCIE_CAP_OFFSET; + + if (!raw_pci_ext_ops) + return 0; + + while (pos) { + if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, pos, 4, &pcie_cap)) + return 0; + + if (pcie_cap == 0xffffffff) + return 0; + + if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) { + raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, pos + 4, 4, &cap_data); + if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR) + return pos; + } + + pos = pcie_cap >> 20; + } + + return 0; +} + +static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn, + int reg, int len, u32 val, int offset) +{ + u32 size; + unsigned int domain, busnum; + int bar = (reg - PCI_BASE_ADDRESS_0) >> 2; + + domain = pci_domain_nr(bus); + busnum = bus->number; + + if (val == ~0 && len == 4) { + unsigned long decode; + + raw_pci_ext_ops->read(domain, busnum, devfn, + offset + 8 + (bar * 4), 4, &size); + + /* Turn the size into a decode pattern for the sizing code */ + if (size) { + decode = size - 1; + decode |= decode >> 1; + decode |= decode >> 2; + decode |= decode >> 4; + decode |= decode >> 8; + decode |= decode >> 16; + decode++; + decode = ~(decode - 1); + } else { + decode = ~0; + } + + /* + * If val is all ones, the core code is trying to size the reg, + * so update the mmconfig space with the real size. + * + * Note: this assumes the fixed size we got is a power of two. + */ + return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4, + decode); + } + + /* This is some other kind of BAR write, so just do it. */ + return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val); +} + +/** + * type1_access_ok - check whether to use type 1 + * @bus: bus number + * @devfn: device & function in question + * + * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at + * all, the we can go ahead with any reads & writes. If it's on a Lincroft, + * but doesn't exist, avoid the access altogether to keep the chip from + * hanging. + */ +static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) +{ + /* This is a workaround for A0 LNC bug where PCI status register does + * not have new CAP bit set. can not be written by SW either. + * + * PCI header type in real LNC indicates a single function device, this + * will prevent probing other devices under the same function in PCI + * shim. Therefore, use the header type in shim instead. + */ + if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE) + return 0; + if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0))) + return 1; + return 0; /* langwell on others */ +} + +static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, + int size, u32 *value) +{ + if (type1_access_ok(bus->number, devfn, where)) + return pci_direct_conf1.read(pci_domain_nr(bus), bus->number, + devfn, where, size, value); + return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, where, size, value); +} + +static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, + int size, u32 value) +{ + int offset; + + /* On MRST, there is no PCI ROM BAR, this will cause a subsequent read + * to ROM BAR return 0 then being ignored. + */ + if (where == PCI_ROM_ADDRESS) + return 0; + + /* + * Devices with fixed BARs need special handling: + * - BAR sizing code will save, write ~0, read size, restore + * - so writes to fixed BARs need special handling + * - other writes to fixed BAR devices should go through mmconfig + */ + offset = fixed_bar_cap(bus, devfn); + if (offset && + (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) { + return pci_device_update_fixed(bus, devfn, where, size, value, + offset); + } + + /* + * On Moorestown update both real & mmconfig space + * Note: early Lincroft silicon can't handle type 1 accesses to + * non-existent devices, so just eat the write in that case. + */ + if (type1_access_ok(bus->number, devfn, where)) + return pci_direct_conf1.write(pci_domain_nr(bus), bus->number, + devfn, where, size, value); + return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn, + where, size, value); +} + +static int mrst_pci_irq_enable(struct pci_dev *dev) +{ + u8 pin; + struct io_apic_irq_attr irq_attr; + + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + + /* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to + * IOAPIC RTE entries, so we just enable RTE for the device. + */ + irq_attr.ioapic = mp_find_ioapic(dev->irq); + irq_attr.ioapic_pin = dev->irq; + irq_attr.trigger = 1; /* level */ + irq_attr.polarity = 1; /* active low */ + io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr); + + return 0; +} + +struct pci_ops pci_mrst_ops = { + .read = pci_read, + .write = pci_write, +}; + +/** + * pci_mrst_init - installs pci_mrst_ops + * + * Moorestown has an interesting PCI implementation (see above). + * Called when the early platform detection installs it. + */ +int __init pci_mrst_init(void) +{ + printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n"); + pci_mmcfg_late_init(); + pcibios_enable_irq = mrst_pci_irq_enable; + pci_root_ops = pci_mrst_ops; + /* Continue with standard init */ + return 1; +} + +/* + * Langwell devices reside at fixed offsets, don't try to move them. + */ +static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev) +{ + unsigned long offset; + u32 size; + int i; + + /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */ + offset = fixed_bar_cap(dev->bus, dev->devfn); + if (!offset || PCI_DEVFN(2, 0) == dev->devfn || + PCI_DEVFN(2, 2) == dev->devfn) + return; + + for (i = 0; i < PCI_ROM_RESOURCE; i++) { + pci_read_config_dword(dev, offset + 8 + (i * 4), &size); + dev->resource[i].end = dev->resource[i].start + size - 1; + dev->resource[i].flags |= IORESOURCE_PCI_FIXED; + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup); diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 8884a1c..8223738 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -148,14 +148,8 @@ int __init pci_numaq_init(void) { int quad; - if (!found_numaq) - return 0; - raw_pci_ops = &pci_direct_conf1_mq; - if (pcibios_scanned++) - return 0; - pci_root_bus = pcibios_scan_root(0); if (pci_root_bus) pci_bus_add_devices(pci_root_bus); diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index b889d82..b348154 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -304,9 +304,6 @@ static struct pci_raw_ops pci_olpc_conf = { int __init pci_olpc_init(void) { - if (!machine_is_olpc() || olpc_has_vsa()) - return -ENODEV; - printk(KERN_INFO "PCI: Using configuration type OLPC\n"); raw_pci_ops = &pci_olpc_conf; is_lx = is_geode_lx(); diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index bcead7a..03008f7 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -69,9 +69,6 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq) int __init pci_visws_init(void) { - if (!is_visws_box()) - return -1; - pcibios_enable_irq = &pci_visws_enable_irq; pcibios_disable_irq = &pci_visws_disable_irq; @@ -90,5 +87,6 @@ int __init pci_visws_init(void) pci_scan_bus_with_sysdata(pci_bus1); pci_fixup_irqs(pci_common_swizzle, visws_map_irq); pcibios_resource_survey(); - return 0; + /* Request bus scan */ + return 1; } |