diff options
Diffstat (limited to 'arch/x86')
159 files changed, 9168 insertions, 4250 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore index 0280790..7cab8c0 100644 --- a/arch/x86/.gitignore +++ b/arch/x86/.gitignore @@ -1,3 +1,4 @@ boot/compressed/vmlinux tools/test_get_len +tools/insn_sanity diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5731eb7..864cc6e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,8 +60,12 @@ config X86 select PERF_EVENTS select HAVE_PERF_EVENTS_NMI select ANON_INODES + select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 + select HAVE_CMPXCHG_LOCAL if !M386 + select HAVE_CMPXCHG_DOUBLE select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS @@ -77,6 +81,7 @@ config X86 select HAVE_BPF_JIT if (X86_64 && NET) select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG + select GENERIC_IOMAP config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) @@ -120,16 +125,6 @@ config HAVE_LATENCYTOP_SUPPORT config MMU def_bool y -config ZONE_DMA - bool "DMA memory allocation support" if EXPERT - default y - help - DMA memory allocation support allows devices with less than 32-bit - addressing to allocate within the first 16MB of address space. - Disable if no such devices will be used. - - If unsure, say Y. - config SBUS bool @@ -142,9 +137,6 @@ config NEED_SG_DMA_LENGTH config GENERIC_ISA_DMA def_bool ISA_DMA_API -config GENERIC_IOMAP - def_bool y - config GENERIC_BUG def_bool y depends on BUG @@ -253,6 +245,16 @@ source "kernel/Kconfig.freezer" menu "Processor type and features" +config ZONE_DMA + bool "DMA memory allocation support" if EXPERT + default y + help + DMA memory allocation support allows devices with less than 32-bit + addressing to allocate within the first 16MB of address space. + Disable if no such devices will be used. + + If unsure, say Y. + source "kernel/time/Kconfig" config SMP @@ -421,12 +423,14 @@ config X86_MRST depends on PCI depends on PCI_GOANY depends on X86_IO_APIC + select X86_INTEL_MID + select SFI + select DW_APB_TIMER select APB_TIMER select I2C select SPI select INTEL_SCU_IPC select X86_PLATFORM_DEVICES - select X86_INTEL_MID ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. Moorestown consists of two chips: @@ -435,6 +439,26 @@ config X86_MRST nor standard legacy replacement devices/features. e.g. Moorestown does not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. +config X86_MDFLD + bool "Medfield MID platform" + depends on PCI + depends on PCI_GOANY + depends on X86_IO_APIC + select X86_INTEL_MID + select SFI + select DW_APB_TIMER + select APB_TIMER + select I2C + select SPI + select INTEL_SCU_IPC + select X86_PLATFORM_DEVICES + ---help--- + Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin + Internet Device(MID) platform. + Unlike standard x86 PCs, Medfield does not have many legacy devices + nor standard legacy replacement devices/features. e.g. Medfield does + not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. + endif config X86_RDC321X @@ -632,7 +656,7 @@ config X86_SUMMIT_NUMA config X86_CYCLONE_TIMER def_bool y - depends on X86_32_NON_STANDARD + depends on X86_SUMMIT source "arch/x86/Kconfig.cpu" @@ -660,9 +684,10 @@ config HPET_EMULATE_RTC depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) config APB_TIMER - def_bool y if MRST - prompt "Langwell APB Timer Support" if X86_MRST + def_bool y if X86_INTEL_MID + prompt "Intel MID APB Timer Support" if X86_INTEL_MID select DW_APB_TIMER + depends on X86_INTEL_MID && SFI help APB timer is the replacement for 8254, HPET on X86 MID platforms. The APBT provides a stable time base on SMP @@ -1490,6 +1515,13 @@ config EFI resultant kernel should continue to boot on existing non-EFI platforms. +config EFI_STUB + bool "EFI stub support" + depends on EFI + ---help--- + This kernel feature allows a bzImage to be loaded directly + by EFI firmware without the use of a bootloader. + config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" @@ -1742,7 +1774,7 @@ source "drivers/sfi/Kconfig" config X86_APM_BOOT def_bool y - depends on APM || APM_MODULE + depends on APM menuconfig APM tristate "APM (Advanced Power Management) BIOS support" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index e3ca7e0..3c57033 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -309,12 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) -config CMPXCHG_LOCAL - def_bool X86_64 || (X86_32 && !M386) - -config CMPXCHG_DOUBLE - def_bool y - config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bf56e17..e46c21473 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -43,9 +43,9 @@ config EARLY_PRINTK with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. -config EARLY_PRINTK_MRST - bool "Early printk for MRST platform support" - depends on EARLY_PRINTK && X86_MRST +config EARLY_PRINTK_INTEL_MID + bool "Early printk for Intel MID platform support" + depends on EARLY_PRINTK && X86_INTEL_MID config EARLY_PRINTK_DBGP bool "Early printk via EHCI debug port" @@ -63,8 +63,11 @@ config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL ---help--- - This option will cause messages to be printed if free stack space - drops below a certain limit. + Say Y here if you want to check the overflows of kernel, IRQ + and exception stacks. This option will cause messages of the + stacks in detail when free stack space drops below a certain + limit. + If in doubt, say "N". config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" @@ -284,4 +287,16 @@ config DEBUG_STRICT_USER_COPY_CHECKS If unsure, or if you run an older (pre 4.4) gcc, say N. +config DEBUG_NMI_SELFTEST + bool "NMI Selftest" + depends on DEBUG_KERNEL && X86_LOCAL_APIC + ---help--- + Enabling this option turns on a quick NMI selftest to verify + that the NMI behaves correctly. + + This might help diagnose strange hangs that rely on NMI to + function properly. + + If unsure, say N. + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b02e509..209ba12 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -118,6 +118,12 @@ KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) ### +# Syscall table generation + +archheaders: + $(Q)$(MAKE) $(build)=arch/x86/syscalls all + +### # Kernel objects head-y := arch/x86/kernel/head_$(BITS).o diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 09664ef..b123b9a 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -23,7 +23,15 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy -$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE +VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ + $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \ + $(obj)/piggy.o + +ifeq ($(CONFIG_EFI_STUB), y) + VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o +endif + +$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE $(call if_changed,ld) @: diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c new file mode 100644 index 0000000..fec216f --- /dev/null +++ b/arch/x86/boot/compressed/eboot.c @@ -0,0 +1,1022 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2011 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +#include <linux/efi.h> +#include <asm/efi.h> +#include <asm/setup.h> +#include <asm/desc.h> + +#include "eboot.h" + +static efi_system_table_t *sys_table; + +static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size, + unsigned long *desc_size) +{ + efi_memory_desc_t *m = NULL; + efi_status_t status; + unsigned long key; + u32 desc_version; + + *map_size = sizeof(*m) * 32; +again: + /* + * Add an additional efi_memory_desc_t because we're doing an + * allocation which may be in a new descriptor region. + */ + *map_size += sizeof(*m); + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, *map_size, (void **)&m); + if (status != EFI_SUCCESS) + goto fail; + + status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size, + m, &key, desc_size, &desc_version); + if (status == EFI_BUFFER_TOO_SMALL) { + efi_call_phys1(sys_table->boottime->free_pool, m); + goto again; + } + + if (status != EFI_SUCCESS) + efi_call_phys1(sys_table->boottime->free_pool, m); + +fail: + *map = m; + return status; +} + +/* + * Allocate at the highest possible address that is not above 'max'. + */ +static efi_status_t high_alloc(unsigned long size, unsigned long align, + unsigned long *addr, unsigned long max) +{ + unsigned long map_size, desc_size; + efi_memory_desc_t *map; + efi_status_t status; + unsigned long nr_pages; + u64 max_addr = 0; + int i; + + status = __get_map(&map, &map_size, &desc_size); + if (status != EFI_SUCCESS) + goto fail; + + nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; +again: + for (i = 0; i < map_size / desc_size; i++) { + efi_memory_desc_t *desc; + unsigned long m = (unsigned long)map; + u64 start, end; + + desc = (efi_memory_desc_t *)(m + (i * desc_size)); + if (desc->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (desc->num_pages < nr_pages) + continue; + + start = desc->phys_addr; + end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT); + + if ((start + size) > end || (start + size) > max) + continue; + + if (end - size > max) + end = max; + + if (round_down(end - size, align) < start) + continue; + + start = round_down(end - size, align); + + /* + * Don't allocate at 0x0. It will confuse code that + * checks pointers against NULL. + */ + if (start == 0x0) + continue; + + if (start > max_addr) + max_addr = start; + } + + if (!max_addr) + status = EFI_NOT_FOUND; + else { + status = efi_call_phys4(sys_table->boottime->allocate_pages, + EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, + nr_pages, &max_addr); + if (status != EFI_SUCCESS) { + max = max_addr; + max_addr = 0; + goto again; + } + + *addr = max_addr; + } + +free_pool: + efi_call_phys1(sys_table->boottime->free_pool, map); + +fail: + return status; +} + +/* + * Allocate at the lowest possible address. + */ +static efi_status_t low_alloc(unsigned long size, unsigned long align, + unsigned long *addr) +{ + unsigned long map_size, desc_size; + efi_memory_desc_t *map; + efi_status_t status; + unsigned long nr_pages; + int i; + + status = __get_map(&map, &map_size, &desc_size); + if (status != EFI_SUCCESS) + goto fail; + + nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + for (i = 0; i < map_size / desc_size; i++) { + efi_memory_desc_t *desc; + unsigned long m = (unsigned long)map; + u64 start, end; + + desc = (efi_memory_desc_t *)(m + (i * desc_size)); + + if (desc->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (desc->num_pages < nr_pages) + continue; + + start = desc->phys_addr; + end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT); + + /* + * Don't allocate at 0x0. It will confuse code that + * checks pointers against NULL. Skip the first 8 + * bytes so we start at a nice even number. + */ + if (start == 0x0) + start += 8; + + start = round_up(start, align); + if ((start + size) > end) + continue; + + status = efi_call_phys4(sys_table->boottime->allocate_pages, + EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, + nr_pages, &start); + if (status == EFI_SUCCESS) { + *addr = start; + break; + } + } + + if (i == map_size / desc_size) + status = EFI_NOT_FOUND; + +free_pool: + efi_call_phys1(sys_table->boottime->free_pool, map); +fail: + return status; +} + +static void low_free(unsigned long size, unsigned long addr) +{ + unsigned long nr_pages; + + nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + efi_call_phys2(sys_table->boottime->free_pages, addr, size); +} + +static void find_bits(unsigned long mask, u8 *pos, u8 *size) +{ + u8 first, len; + + first = 0; + len = 0; + + if (mask) { + while (!(mask & 0x1)) { + mask = mask >> 1; + first++; + } + + while (mask & 0x1) { + mask = mask >> 1; + len++; + } + } + + *pos = first; + *size = len; +} + +/* + * See if we have Graphics Output Protocol + */ +static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, + unsigned long size) +{ + struct efi_graphics_output_protocol *gop, *first_gop; + struct efi_pixel_bitmask pixel_info; + unsigned long nr_gops; + efi_status_t status; + void **gop_handle; + u16 width, height; + u32 fb_base, fb_size; + u32 pixels_per_scan_line; + int pixel_format; + int i; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, size, &gop_handle); + if (status != EFI_SUCCESS) + return status; + + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, proto, + NULL, &size, gop_handle); + if (status != EFI_SUCCESS) + goto free_handle; + + first_gop = NULL; + + nr_gops = size / sizeof(void *); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info; + efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; + void *pciio; + void *h = gop_handle[i]; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + h, proto, &gop); + if (status != EFI_SUCCESS) + continue; + + efi_call_phys3(sys_table->boottime->handle_protocol, + h, &pciio_proto, &pciio); + + status = efi_call_phys4(gop->query_mode, gop, + gop->mode->mode, &size, &info); + if (status == EFI_SUCCESS && (!first_gop || pciio)) { + /* + * Apple provide GOPs that are not backed by + * real hardware (they're used to handle + * multiple displays). The workaround is to + * search for a GOP implementing the PCIIO + * protocol, and if one isn't found, to just + * fallback to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + fb_base = gop->mode->frame_buffer_base; + fb_size = gop->mode->frame_buffer_size; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + + /* + * Once we've found a GOP supporting PCIIO, + * don't bother looking any further. + */ + if (pciio) + break; + + first_gop = gop; + } + } + + /* Did we find any GOPs? */ + if (!first_gop) + goto free_handle; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + si->lfb_size = fb_size; + si->pages = 1; + + if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 0; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 16; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BIT_MASK) { + find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); + find_bits(pixel_info.green_mask, &si->green_pos, + &si->green_size); + find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); + find_bits(pixel_info.reserved_mask, &si->rsvd_pos, + &si->rsvd_size); + si->lfb_depth = si->red_size + si->green_size + + si->blue_size + si->rsvd_size; + si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; + } else { + si->lfb_depth = 4; + si->lfb_linelength = si->lfb_width / 2; + si->red_size = 0; + si->red_pos = 0; + si->green_size = 0; + si->green_pos = 0; + si->blue_size = 0; + si->blue_pos = 0; + si->rsvd_size = 0; + si->rsvd_pos = 0; + } + +free_handle: + efi_call_phys1(sys_table->boottime->free_pool, gop_handle); + return status; +} + +/* + * See if we have Universal Graphics Adapter (UGA) protocol + */ +static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto, + unsigned long size) +{ + struct efi_uga_draw_protocol *uga, *first_uga; + unsigned long nr_ugas; + efi_status_t status; + u32 width, height; + void **uga_handle = NULL; + int i; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, size, &uga_handle); + if (status != EFI_SUCCESS) + return status; + + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, uga_proto, + NULL, &size, uga_handle); + if (status != EFI_SUCCESS) + goto free_handle; + + first_uga = NULL; + + nr_ugas = size / sizeof(void *); + for (i = 0; i < nr_ugas; i++) { + efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; + void *handle = uga_handle[i]; + u32 w, h, depth, refresh; + void *pciio; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + handle, uga_proto, &uga); + if (status != EFI_SUCCESS) + continue; + + efi_call_phys3(sys_table->boottime->handle_protocol, + handle, &pciio_proto, &pciio); + + status = efi_call_phys5(uga->get_mode, uga, &w, &h, + &depth, &refresh); + if (status == EFI_SUCCESS && (!first_uga || pciio)) { + width = w; + height = h; + + /* + * Once we've found a UGA supporting PCIIO, + * don't bother looking any further. + */ + if (pciio) + break; + + first_uga = uga; + } + } + + if (!first_uga) + goto free_handle; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_depth = 32; + si->lfb_width = width; + si->lfb_height = height; + + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; + + +free_handle: + efi_call_phys1(sys_table->boottime->free_pool, uga_handle); + return status; +} + +void setup_graphics(struct boot_params *boot_params) +{ + efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID; + struct screen_info *si; + efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; + efi_status_t status; + unsigned long size; + void **gop_handle = NULL; + void **uga_handle = NULL; + + si = &boot_params->screen_info; + memset(si, 0, sizeof(*si)); + + size = 0; + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, &graphics_proto, + NULL, &size, gop_handle); + if (status == EFI_BUFFER_TOO_SMALL) + status = setup_gop(si, &graphics_proto, size); + + if (status != EFI_SUCCESS) { + size = 0; + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, &uga_proto, + NULL, &size, uga_handle); + if (status == EFI_BUFFER_TOO_SMALL) + setup_uga(si, &uga_proto, size); + } +} + +struct initrd { + efi_file_handle_t *handle; + u64 size; +}; + +/* + * Check the cmdline for a LILO-style initrd= arguments. + * + * We only support loading an initrd from the same filesystem as the + * kernel image. + */ +static efi_status_t handle_ramdisks(efi_loaded_image_t *image, + struct setup_header *hdr) +{ + struct initrd *initrds; + unsigned long initrd_addr; + efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID; + u64 initrd_total; + efi_file_io_interface_t *io; + efi_file_handle_t *fh; + efi_status_t status; + int nr_initrds; + char *str; + int i, j, k; + + initrd_addr = 0; + initrd_total = 0; + + str = (char *)(unsigned long)hdr->cmd_line_ptr; + + j = 0; /* See close_handles */ + + if (!str || !*str) + return EFI_SUCCESS; + + for (nr_initrds = 0; *str; nr_initrds++) { + str = strstr(str, "initrd="); + if (!str) + break; + + str += 7; + + /* Skip any leading slashes */ + while (*str == '/' || *str == '\\') + str++; + + while (*str && *str != ' ' && *str != '\n') + str++; + } + + if (!nr_initrds) + return EFI_SUCCESS; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, + nr_initrds * sizeof(*initrds), + &initrds); + if (status != EFI_SUCCESS) + goto fail; + + str = (char *)(unsigned long)hdr->cmd_line_ptr; + for (i = 0; i < nr_initrds; i++) { + struct initrd *initrd; + efi_file_handle_t *h; + efi_file_info_t *info; + efi_char16_t filename[256]; + unsigned long info_sz; + efi_guid_t info_guid = EFI_FILE_INFO_ID; + efi_char16_t *p; + u64 file_sz; + + str = strstr(str, "initrd="); + if (!str) + break; + + str += 7; + + initrd = &initrds[i]; + p = filename; + + /* Skip any leading slashes */ + while (*str == '/' || *str == '\\') + str++; + + while (*str && *str != ' ' && *str != '\n') { + if (p >= filename + sizeof(filename)) + break; + + *p++ = *str++; + } + + *p = '\0'; + + /* Only open the volume once. */ + if (!i) { + efi_boot_services_t *boottime; + + boottime = sys_table->boottime; + + status = efi_call_phys3(boottime->handle_protocol, + image->device_handle, &fs_proto, &io); + if (status != EFI_SUCCESS) + goto free_initrds; + + status = efi_call_phys2(io->open_volume, io, &fh); + if (status != EFI_SUCCESS) + goto free_initrds; + } + + status = efi_call_phys5(fh->open, fh, &h, filename, + EFI_FILE_MODE_READ, (u64)0); + if (status != EFI_SUCCESS) + goto close_handles; + + initrd->handle = h; + + info_sz = 0; + status = efi_call_phys4(h->get_info, h, &info_guid, + &info_sz, NULL); + if (status != EFI_BUFFER_TOO_SMALL) + goto close_handles; + +grow: + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, info_sz, &info); + if (status != EFI_SUCCESS) + goto close_handles; + + status = efi_call_phys4(h->get_info, h, &info_guid, + &info_sz, info); + if (status == EFI_BUFFER_TOO_SMALL) { + efi_call_phys1(sys_table->boottime->free_pool, info); + goto grow; + } + + file_sz = info->file_size; + efi_call_phys1(sys_table->boottime->free_pool, info); + + if (status != EFI_SUCCESS) + goto close_handles; + + initrd->size = file_sz; + initrd_total += file_sz; + } + + if (initrd_total) { + unsigned long addr; + + /* + * Multiple initrd's need to be at consecutive + * addresses in memory, so allocate enough memory for + * all the initrd's. + */ + status = high_alloc(initrd_total, 0x1000, + &initrd_addr, hdr->initrd_addr_max); + if (status != EFI_SUCCESS) + goto close_handles; + + /* We've run out of free low memory. */ + if (initrd_addr > hdr->initrd_addr_max) { + status = EFI_INVALID_PARAMETER; + goto free_initrd_total; + } + + addr = initrd_addr; + for (j = 0; j < nr_initrds; j++) { + u64 size; + + size = initrds[j].size; + while (size) { + u64 chunksize; + if (size > EFI_READ_CHUNK_SIZE) + chunksize = EFI_READ_CHUNK_SIZE; + else + chunksize = size; + status = efi_call_phys3(fh->read, + initrds[j].handle, + &chunksize, addr); + if (status != EFI_SUCCESS) + goto free_initrd_total; + addr += chunksize; + size -= chunksize; + } + + efi_call_phys1(fh->close, initrds[j].handle); + } + + } + + efi_call_phys1(sys_table->boottime->free_pool, initrds); + + hdr->ramdisk_image = initrd_addr; + hdr->ramdisk_size = initrd_total; + + return status; + +free_initrd_total: + low_free(initrd_total, initrd_addr); + +close_handles: + for (k = j; k < nr_initrds; k++) + efi_call_phys1(fh->close, initrds[k].handle); +free_initrds: + efi_call_phys1(sys_table->boottime->free_pool, initrds); +fail: + hdr->ramdisk_image = 0; + hdr->ramdisk_size = 0; + + return status; +} + +/* + * Because the x86 boot code expects to be passed a boot_params we + * need to create one ourselves (usually the bootloader would create + * one for us). + */ +static efi_status_t make_boot_params(struct boot_params *boot_params, + efi_loaded_image_t *image, + void *handle) +{ + struct efi_info *efi = &boot_params->efi_info; + struct apm_bios_info *bi = &boot_params->apm_bios_info; + struct sys_desc_table *sdt = &boot_params->sys_desc_table; + struct e820entry *e820_map = &boot_params->e820_map[0]; + struct e820entry *prev = NULL; + struct setup_header *hdr = &boot_params->hdr; + unsigned long size, key, desc_size, _size; + efi_memory_desc_t *mem_map; + void *options = image->load_options; + u32 load_options_size = image->load_options_size / 2; /* ASCII */ + int options_size = 0; + efi_status_t status; + __u32 desc_version; + unsigned long cmdline; + u8 nr_entries; + u16 *s2; + u8 *s1; + int i; + + hdr->type_of_loader = 0x21; + + /* Convert unicode cmdline to ascii */ + cmdline = 0; + s2 = (u16 *)options; + + if (s2) { + while (*s2 && *s2 != '\n' && options_size < load_options_size) { + s2++; + options_size++; + } + + if (options_size) { + if (options_size > hdr->cmdline_size) + options_size = hdr->cmdline_size; + + options_size++; /* NUL termination */ + + status = low_alloc(options_size, 1, &cmdline); + if (status != EFI_SUCCESS) + goto fail; + + s1 = (u8 *)(unsigned long)cmdline; + s2 = (u16 *)options; + + for (i = 0; i < options_size - 1; i++) + *s1++ = *s2++; + + *s1 = '\0'; + } + } + + hdr->cmd_line_ptr = cmdline; + + hdr->ramdisk_image = 0; + hdr->ramdisk_size = 0; + + status = handle_ramdisks(image, hdr); + if (status != EFI_SUCCESS) + goto free_cmdline; + + setup_graphics(boot_params); + + /* Clear APM BIOS info */ + memset(bi, 0, sizeof(*bi)); + + memset(sdt, 0, sizeof(*sdt)); + + memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); + + size = sizeof(*mem_map) * 32; + +again: + size += sizeof(*mem_map); + _size = size; + status = low_alloc(size, 1, (unsigned long *)&mem_map); + if (status != EFI_SUCCESS) + goto free_cmdline; + + status = efi_call_phys5(sys_table->boottime->get_memory_map, &size, + mem_map, &key, &desc_size, &desc_version); + if (status == EFI_BUFFER_TOO_SMALL) { + low_free(_size, (unsigned long)mem_map); + goto again; + } + + if (status != EFI_SUCCESS) + goto free_mem_map; + + efi->efi_systab = (unsigned long)sys_table; + efi->efi_memdesc_size = desc_size; + efi->efi_memdesc_version = desc_version; + efi->efi_memmap = (unsigned long)mem_map; + efi->efi_memmap_size = size; + +#ifdef CONFIG_X86_64 + efi->efi_systab_hi = (unsigned long)sys_table >> 32; + efi->efi_memmap_hi = (unsigned long)mem_map >> 32; +#endif + + /* Might as well exit boot services now */ + status = efi_call_phys2(sys_table->boottime->exit_boot_services, + handle, key); + if (status != EFI_SUCCESS) + goto free_mem_map; + + /* Historic? */ + boot_params->alt_mem_k = 32 * 1024; + + /* + * Convert the EFI memory map to E820. + */ + nr_entries = 0; + for (i = 0; i < size / desc_size; i++) { + efi_memory_desc_t *d; + unsigned int e820_type = 0; + unsigned long m = (unsigned long)mem_map; + + d = (efi_memory_desc_t *)(m + (i * desc_size)); + switch (d->type) { + case EFI_RESERVED_TYPE: + case EFI_RUNTIME_SERVICES_CODE: + case EFI_RUNTIME_SERVICES_DATA: + case EFI_MEMORY_MAPPED_IO: + case EFI_MEMORY_MAPPED_IO_PORT_SPACE: + case EFI_PAL_CODE: + e820_type = E820_RESERVED; + break; + + case EFI_UNUSABLE_MEMORY: + e820_type = E820_UNUSABLE; + break; + + case EFI_ACPI_RECLAIM_MEMORY: + e820_type = E820_ACPI; + break; + + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + e820_type = E820_RAM; + break; + + case EFI_ACPI_MEMORY_NVS: + e820_type = E820_NVS; + break; + + default: + continue; + } + + /* Merge adjacent mappings */ + if (prev && prev->type == e820_type && + (prev->addr + prev->size) == d->phys_addr) + prev->size += d->num_pages << 12; + else { + e820_map->addr = d->phys_addr; + e820_map->size = d->num_pages << 12; + e820_map->type = e820_type; + prev = e820_map++; + nr_entries++; + } + } + + boot_params->e820_entries = nr_entries; + + return EFI_SUCCESS; + +free_mem_map: + low_free(_size, (unsigned long)mem_map); +free_cmdline: + if (options_size) + low_free(options_size, hdr->cmd_line_ptr); +fail: + return status; +} + +/* + * On success we return a pointer to a boot_params structure, and NULL + * on failure. + */ +struct boot_params *efi_main(void *handle, efi_system_table_t *_table) +{ + struct boot_params *boot_params; + unsigned long start, nr_pages; + struct desc_ptr *gdt, *idt; + efi_loaded_image_t *image; + struct setup_header *hdr; + efi_status_t status; + efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; + struct desc_struct *desc; + + sys_table = _table; + + /* Check if we were booted by the EFI firmware */ + if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + goto fail; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + handle, &proto, (void *)&image); + if (status != EFI_SUCCESS) + goto fail; + + status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); + if (status != EFI_SUCCESS) + goto fail; + + memset(boot_params, 0x0, 0x4000); + + /* Copy first two sectors to boot_params */ + memcpy(boot_params, image->image_base, 1024); + + hdr = &boot_params->hdr; + + /* + * The EFI firmware loader could have placed the kernel image + * anywhere in memory, but the kernel has various restrictions + * on the max physical address it can run at. Attempt to move + * the kernel to boot_params.pref_address, or as low as + * possible. + */ + start = hdr->pref_address; + nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + + status = efi_call_phys4(sys_table->boottime->allocate_pages, + EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, + nr_pages, &start); + if (status != EFI_SUCCESS) { + status = low_alloc(hdr->init_size, hdr->kernel_alignment, + &start); + if (status != EFI_SUCCESS) + goto fail; + } + + hdr->code32_start = (__u32)start; + hdr->pref_address = (__u64)(unsigned long)image->image_base; + + memcpy((void *)start, image->image_base, image->image_size); + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, sizeof(*gdt), + (void **)&gdt); + if (status != EFI_SUCCESS) + goto fail; + + gdt->size = 0x800; + status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address); + if (status != EFI_SUCCESS) + goto fail; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, sizeof(*idt), + (void **)&idt); + if (status != EFI_SUCCESS) + goto fail; + + idt->size = 0; + idt->address = 0; + + status = make_boot_params(boot_params, image, handle); + if (status != EFI_SUCCESS) + goto fail; + + memset((char *)gdt->address, 0x0, gdt->size); + desc = (struct desc_struct *)gdt->address; + + /* The first GDT is a dummy and the second is unused. */ + desc += 2; + + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + + desc++; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + +#ifdef CONFIG_X86_64 + /* Task segment value */ + desc++; + desc->limit0 = 0x0000; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_TSS; + desc->s = 0; + desc->dpl = 0; + desc->p = 1; + desc->limit = 0x0; + desc->avl = 0; + desc->l = 0; + desc->d = 0; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; +#endif /* CONFIG_X86_64 */ + + asm volatile ("lidt %0" : : "m" (*idt)); + asm volatile ("lgdt %0" : : "m" (*gdt)); + + asm volatile("cli"); + + return boot_params; +fail: + return NULL; +} diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h new file mode 100644 index 0000000..3925166 --- /dev/null +++ b/arch/x86/boot/compressed/eboot.h @@ -0,0 +1,61 @@ +#ifndef BOOT_COMPRESSED_EBOOT_H +#define BOOT_COMPRESSED_EBOOT_H + +#define SEG_TYPE_DATA (0 << 3) +#define SEG_TYPE_READ_WRITE (1 << 1) +#define SEG_TYPE_CODE (1 << 3) +#define SEG_TYPE_EXEC_READ (1 << 1) +#define SEG_TYPE_TSS ((1 << 3) | (1 << 0)) +#define SEG_OP_SIZE_32BIT (1 << 0) +#define SEG_GRANULARITY_4KB (1 << 0) + +#define DESC_TYPE_CODE_DATA (1 << 0) + +#define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT) +#define EFI_READ_CHUNK_SIZE (1024 * 1024) + +#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 +#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 +#define PIXEL_BIT_MASK 2 +#define PIXEL_BLT_ONLY 3 +#define PIXEL_FORMAT_MAX 4 + +struct efi_pixel_bitmask { + u32 red_mask; + u32 green_mask; + u32 blue_mask; + u32 reserved_mask; +}; + +struct efi_graphics_output_mode_info { + u32 version; + u32 horizontal_resolution; + u32 vertical_resolution; + int pixel_format; + struct efi_pixel_bitmask pixel_information; + u32 pixels_per_scan_line; +} __packed; + +struct efi_graphics_output_protocol_mode { + u32 max_mode; + u32 mode; + unsigned long info; + unsigned long size_of_info; + u64 frame_buffer_base; + unsigned long frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol { + void *query_mode; + unsigned long set_mode; + unsigned long blt; + struct efi_graphics_output_protocol_mode *mode; +}; + +struct efi_uga_draw_protocol { + void *get_mode; + void *set_mode; + void *blt; +}; + +#endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S new file mode 100644 index 0000000..a53440e --- /dev/null +++ b/arch/x86/boot/compressed/efi_stub_32.S @@ -0,0 +1,86 @@ +/* + * EFI call stub for IA32. + * + * This stub allows us to make EFI calls in physical mode with interrupts + * turned off. Note that this implementation is different from the one in + * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical + * mode at this point. + */ + +#include <linux/linkage.h> +#include <asm/page_types.h> + +/* + * efi_call_phys(void *, ...) is a function with variable parameters. + * All the callers of this function assure that all the parameters are 4-bytes. + */ + +/* + * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save. + * So we'd better save all of them at the beginning of this function and restore + * at the end no matter how many we use, because we can not assure EFI runtime + * service functions will comply with gcc calling convention, too. + */ + +.text +ENTRY(efi_call_phys) + /* + * 0. The function can only be called in Linux kernel. So CS has been + * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found + * the values of these registers are the same. And, the corresponding + * GDT entries are identical. So I will do nothing about segment reg + * and GDT, but change GDT base register in prelog and epilog. + */ + + /* + * 1. Because we haven't been relocated by this point we need to + * use relative addressing. + */ + call 1f +1: popl %edx + subl $1b, %edx + + /* + * 2. Now on the top of stack is the return + * address in the caller of efi_call_phys(), then parameter 1, + * parameter 2, ..., param n. To make things easy, we save the return + * address of efi_call_phys in a global variable. + */ + popl %ecx + movl %ecx, saved_return_addr(%edx) + /* get the function pointer into ECX*/ + popl %ecx + movl %ecx, efi_rt_function_ptr(%edx) + + /* + * 3. Call the physical function. + */ + call *%ecx + + /* + * 4. Balance the stack. And because EAX contain the return value, + * we'd better not clobber it. We need to calculate our address + * again because %ecx and %edx are not preserved across EFI function + * calls. + */ + call 1f +1: popl %edx + subl $1b, %edx + + movl efi_rt_function_ptr(%edx), %ecx + pushl %ecx + + /* + * 10. Push the saved return address onto the stack and return. + */ + movl saved_return_addr(%edx), %ecx + pushl %ecx + ret +ENDPROC(efi_call_phys) +.previous + +.data +saved_return_addr: + .long 0 +efi_rt_function_ptr: + .long 0 diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S new file mode 100644 index 0000000..cedc60d --- /dev/null +++ b/arch/x86/boot/compressed/efi_stub_64.S @@ -0,0 +1 @@ +#include "../../platform/efi/efi_stub_64.S" diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 67a655a..a055993 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -32,6 +32,28 @@ __HEAD ENTRY(startup_32) +#ifdef CONFIG_EFI_STUB + /* + * We don't need the return address, so set up the stack so + * efi_main() can find its arugments. + */ + add $0x4, %esp + + call efi_main + cmpl $0, %eax + je preferred_addr + movl %eax, %esi + call 1f +1: + popl %eax + subl $1b, %eax + subl BP_pref_address(%esi), %eax + add BP_code32_start(%esi), %eax + leal preferred_addr(%eax), %eax + jmp *%eax + +preferred_addr: +#endif cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 35af09d..558d76c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -199,6 +199,26 @@ ENTRY(startup_64) * an identity mapped page table being provied that maps our * entire text+data+bss and hopefully all of memory. */ +#ifdef CONFIG_EFI_STUB + pushq %rsi + mov %rcx, %rdi + mov %rdx, %rsi + call efi_main + popq %rsi + cmpq $0,%rax + je preferred_addr + movq %rax,%rsi + call 1f +1: + popq %rax + subq $1b, %rax + subq BP_pref_address(%rsi), %rax + add BP_code32_start(%esi), %eax + leaq preferred_addr(%rax), %rax + jmp *%rax + +preferred_addr: +#endif /* Setup data segments. */ xorl %eax, %eax diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 19b3e69..ffb9c5c 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,2 +1,11 @@ #include "misc.h" + +int memcmp(const void *s1, const void *s2, size_t len) +{ + u8 diff; + asm("repe; cmpsb; setnz %0" + : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + return diff; +} + #include "../string.c" diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index bdb4d45..f1bbeeb 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -45,6 +45,11 @@ SYSSEG = 0x1000 /* historical load address >> 4 */ .global bootsect_start bootsect_start: +#ifdef CONFIG_EFI_STUB + # "MZ", MS-DOS header + .byte 0x4d + .byte 0x5a +#endif # Normalize the start address ljmp $BOOTSEG, $start2 @@ -79,6 +84,14 @@ bs_die: # invoke the BIOS reset code... ljmp $0xf000,$0xfff0 +#ifdef CONFIG_EFI_STUB + .org 0x3c + # + # Offset to the PE header. + # + .long pe_header +#endif /* CONFIG_EFI_STUB */ + .section ".bsdata", "a" bugger_off_msg: .ascii "Direct booting from floppy is no longer supported.\r\n" @@ -87,6 +100,141 @@ bugger_off_msg: .ascii "Remove disk and press any key to reboot . . .\r\n" .byte 0 +#ifdef CONFIG_EFI_STUB +pe_header: + .ascii "PE" + .word 0 + +coff_header: +#ifdef CONFIG_X86_32 + .word 0x14c # i386 +#else + .word 0x8664 # x86-64 +#endif + .word 2 # nr_sections + .long 0 # TimeDateStamp + .long 0 # PointerToSymbolTable + .long 1 # NumberOfSymbols + .word section_table - optional_header # SizeOfOptionalHeader +#ifdef CONFIG_X86_32 + .word 0x306 # Characteristics. + # IMAGE_FILE_32BIT_MACHINE | + # IMAGE_FILE_DEBUG_STRIPPED | + # IMAGE_FILE_EXECUTABLE_IMAGE | + # IMAGE_FILE_LINE_NUMS_STRIPPED +#else + .word 0x206 # Characteristics + # IMAGE_FILE_DEBUG_STRIPPED | + # IMAGE_FILE_EXECUTABLE_IMAGE | + # IMAGE_FILE_LINE_NUMS_STRIPPED +#endif + +optional_header: +#ifdef CONFIG_X86_32 + .word 0x10b # PE32 format +#else + .word 0x20b # PE32+ format +#endif + .byte 0x02 # MajorLinkerVersion + .byte 0x14 # MinorLinkerVersion + + # Filled in by build.c + .long 0 # SizeOfCode + + .long 0 # SizeOfInitializedData + .long 0 # SizeOfUninitializedData + + # Filled in by build.c + .long 0x0000 # AddressOfEntryPoint + + .long 0x0000 # BaseOfCode +#ifdef CONFIG_X86_32 + .long 0 # data +#endif + +extra_header_fields: +#ifdef CONFIG_X86_32 + .long 0 # ImageBase +#else + .quad 0 # ImageBase +#endif + .long 0x1000 # SectionAlignment + .long 0x200 # FileAlignment + .word 0 # MajorOperatingSystemVersion + .word 0 # MinorOperatingSystemVersion + .word 0 # MajorImageVersion + .word 0 # MinorImageVersion + .word 0 # MajorSubsystemVersion + .word 0 # MinorSubsystemVersion + .long 0 # Win32VersionValue + + # + # The size of the bzImage is written in tools/build.c + # + .long 0 # SizeOfImage + + .long 0x200 # SizeOfHeaders + .long 0 # CheckSum + .word 0xa # Subsystem (EFI application) + .word 0 # DllCharacteristics +#ifdef CONFIG_X86_32 + .long 0 # SizeOfStackReserve + .long 0 # SizeOfStackCommit + .long 0 # SizeOfHeapReserve + .long 0 # SizeOfHeapCommit +#else + .quad 0 # SizeOfStackReserve + .quad 0 # SizeOfStackCommit + .quad 0 # SizeOfHeapReserve + .quad 0 # SizeOfHeapCommit +#endif + .long 0 # LoaderFlags + .long 0x1 # NumberOfRvaAndSizes + + .quad 0 # ExportTable + .quad 0 # ImportTable + .quad 0 # ResourceTable + .quad 0 # ExceptionTable + .quad 0 # CertificationTable + .quad 0 # BaseRelocationTable + + # Section table +section_table: + .ascii ".text" + .byte 0 + .byte 0 + .byte 0 + .long 0 + .long 0x0 # startup_{32,64} + .long 0 # Size of initialized data + # on disk + .long 0x0 # startup_{32,64} + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0x60500020 # Characteristics (section flags) + + # + # The EFI application loader requires a relocation section + # because EFI applications are relocatable and not having + # this section seems to confuse it. But since we don't need + # the loader to fixup any relocs for us just fill it with a + # single dummy reloc. + # + .ascii ".reloc" + .byte 0 + .byte 0 + .long reloc_end - reloc_start + .long reloc_start + .long reloc_end - reloc_start # SizeOfRawData + .long reloc_start # PointerToRawData + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0x42100040 # Characteristics (section flags) +#endif /* CONFIG_EFI_STUB */ # Kernel attributes; used by setup. This is part 1 of the # header, from the old boot sector. @@ -318,3 +466,13 @@ die: setup_corrupt: .byte 7 .string "No setup signature found...\n" + + .data +dummy: .long 0 + + .section .reloc +reloc_start: + .long dummy - reloc_start + .long 10 + .word 0 +reloc_end: diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 3cbc405..574dedf 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -111,3 +111,38 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas return result; } + +/** + * strlen - Find the length of a string + * @s: The string to be sized + */ +size_t strlen(const char *s) +{ + const char *sc; + + for (sc = s; *sc != '\0'; ++sc) + /* nothing */; + return sc - s; +} + +/** + * strstr - Find the first substring in a %NUL terminated string + * @s1: The string to be searched + * @s2: The string to search for + */ +char *strstr(const char *s1, const char *s2) +{ + size_t l1, l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + l1 = strlen(s1); + while (l1 >= l2) { + l1--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index fdc60a0..4e9bd6b 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -135,6 +135,9 @@ static void usage(void) int main(int argc, char ** argv) { +#ifdef CONFIG_EFI_STUB + unsigned int file_sz, pe_header; +#endif unsigned int i, sz, setup_sectors; int c; u32 sys_size; @@ -194,6 +197,42 @@ int main(int argc, char ** argv) buf[0x1f6] = sys_size >> 16; buf[0x1f7] = sys_size >> 24; +#ifdef CONFIG_EFI_STUB + file_sz = sz + i + ((sys_size * 16) - sz); + + pe_header = *(unsigned int *)&buf[0x3c]; + + /* Size of code */ + *(unsigned int *)&buf[pe_header + 0x1c] = file_sz; + + /* Size of image */ + *(unsigned int *)&buf[pe_header + 0x50] = file_sz; + +#ifdef CONFIG_X86_32 + /* Address of entry point */ + *(unsigned int *)&buf[pe_header + 0x28] = i; + + /* .text size */ + *(unsigned int *)&buf[pe_header + 0xb0] = file_sz; + + /* .text size of initialised data */ + *(unsigned int *)&buf[pe_header + 0xb8] = file_sz; +#else + /* + * Address of entry point. startup_32 is at the beginning and + * the 64-bit entry point (startup_64) is always 512 bytes + * after. + */ + *(unsigned int *)&buf[pe_header + 0x28] = i + 512; + + /* .text size */ + *(unsigned int *)&buf[pe_header + 0xc0] = file_sz; + + /* .text size of initialised data */ + *(unsigned int *)&buf[pe_header + 0xc8] = file_sz; +#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_EFI_STUB */ + crc = partial_crc32(buf, i, crc); if (fwrite(buf, 1, i, stdout) != i) die("Writing setup failed"); diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 3537d4b..2b0b963 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -5,12 +5,14 @@ obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o +obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o +obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -20,12 +22,14 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o +serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o +serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S new file mode 100644 index 0000000..4e37677 --- /dev/null +++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S @@ -0,0 +1,638 @@ +/* + * Serpent Cipher 4-way parallel algorithm (i586/SSE2) + * + * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * Based on crypto/serpent.c by + * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no> + * 2003 Herbert Valerio Riedel <hvr@gnu.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "serpent-sse2-i586-asm_32.S" +.text + +#define arg_ctx 4 +#define arg_dst 8 +#define arg_src 12 +#define arg_xor 16 + +/********************************************************************** + 4-way SSE2 serpent + **********************************************************************/ +#define CTX %edx + +#define RA %xmm0 +#define RB %xmm1 +#define RC %xmm2 +#define RD %xmm3 +#define RE %xmm4 + +#define RT0 %xmm5 +#define RT1 %xmm6 + +#define RNOT %xmm7 + +#define get_key(i, j, t) \ + movd (4*(i)+(j))*4(CTX), t; \ + pshufd $0, t, t; + +#define K(x0, x1, x2, x3, x4, i) \ + get_key(i, 0, x4); \ + get_key(i, 1, RT0); \ + get_key(i, 2, RT1); \ + pxor x4, x0; \ + pxor RT0, x1; \ + pxor RT1, x2; \ + get_key(i, 3, x4); \ + pxor x4, x3; + +#define LK(x0, x1, x2, x3, x4, i) \ + movdqa x0, x4; \ + pslld $13, x0; \ + psrld $(32 - 13), x4; \ + por x4, x0; \ + pxor x0, x1; \ + movdqa x2, x4; \ + pslld $3, x2; \ + psrld $(32 - 3), x4; \ + por x4, x2; \ + pxor x2, x1; \ + movdqa x1, x4; \ + pslld $1, x1; \ + psrld $(32 - 1), x4; \ + por x4, x1; \ + movdqa x0, x4; \ + pslld $3, x4; \ + pxor x2, x3; \ + pxor x4, x3; \ + movdqa x3, x4; \ + pslld $7, x3; \ + psrld $(32 - 7), x4; \ + por x4, x3; \ + movdqa x1, x4; \ + pslld $7, x4; \ + pxor x1, x0; \ + pxor x3, x0; \ + pxor x3, x2; \ + pxor x4, x2; \ + movdqa x0, x4; \ + get_key(i, 1, RT0); \ + pxor RT0, x1; \ + get_key(i, 3, RT0); \ + pxor RT0, x3; \ + pslld $5, x0; \ + psrld $(32 - 5), x4; \ + por x4, x0; \ + movdqa x2, x4; \ + pslld $22, x2; \ + psrld $(32 - 22), x4; \ + por x4, x2; \ + get_key(i, 0, RT0); \ + pxor RT0, x0; \ + get_key(i, 2, RT0); \ + pxor RT0, x2; + +#define KL(x0, x1, x2, x3, x4, i) \ + K(x0, x1, x2, x3, x4, i); \ + movdqa x0, x4; \ + psrld $5, x0; \ + pslld $(32 - 5), x4; \ + por x4, x0; \ + movdqa x2, x4; \ + psrld $22, x2; \ + pslld $(32 - 22), x4; \ + por x4, x2; \ + pxor x3, x2; \ + pxor x3, x0; \ + movdqa x1, x4; \ + pslld $7, x4; \ + pxor x1, x0; \ + pxor x4, x2; \ + movdqa x1, x4; \ + psrld $1, x1; \ + pslld $(32 - 1), x4; \ + por x4, x1; \ + movdqa x3, x4; \ + psrld $7, x3; \ + pslld $(32 - 7), x4; \ + por x4, x3; \ + pxor x0, x1; \ + movdqa x0, x4; \ + pslld $3, x4; \ + pxor x4, x3; \ + movdqa x0, x4; \ + psrld $13, x0; \ + pslld $(32 - 13), x4; \ + por x4, x0; \ + pxor x2, x1; \ + pxor x2, x3; \ + movdqa x2, x4; \ + psrld $3, x2; \ + pslld $(32 - 3), x4; \ + por x4, x2; + +#define S0(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + por x0, x3; \ + pxor x4, x0; \ + pxor x2, x4; \ + pxor RNOT, x4; \ + pxor x1, x3; \ + pand x0, x1; \ + pxor x4, x1; \ + pxor x0, x2; \ + pxor x3, x0; \ + por x0, x4; \ + pxor x2, x0; \ + pand x1, x2; \ + pxor x2, x3; \ + pxor RNOT, x1; \ + pxor x4, x2; \ + pxor x2, x1; + +#define S1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x0, x1; \ + pxor x3, x0; \ + pxor RNOT, x3; \ + pand x1, x4; \ + por x1, x0; \ + pxor x2, x3; \ + pxor x3, x0; \ + pxor x3, x1; \ + pxor x4, x3; \ + por x4, x1; \ + pxor x2, x4; \ + pand x0, x2; \ + pxor x1, x2; \ + por x0, x1; \ + pxor RNOT, x0; \ + pxor x2, x0; \ + pxor x1, x4; + +#define S2(x0, x1, x2, x3, x4) \ + pxor RNOT, x3; \ + pxor x0, x1; \ + movdqa x0, x4; \ + pand x2, x0; \ + pxor x3, x0; \ + por x4, x3; \ + pxor x1, x2; \ + pxor x1, x3; \ + pand x0, x1; \ + pxor x2, x0; \ + pand x3, x2; \ + por x1, x3; \ + pxor RNOT, x0; \ + pxor x0, x3; \ + pxor x0, x4; \ + pxor x2, x0; \ + por x2, x1; + +#define S3(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x3, x1; \ + por x0, x3; \ + pand x0, x4; \ + pxor x2, x0; \ + pxor x1, x2; \ + pand x3, x1; \ + pxor x3, x2; \ + por x4, x0; \ + pxor x3, x4; \ + pxor x0, x1; \ + pand x3, x0; \ + pand x4, x3; \ + pxor x2, x3; \ + por x1, x4; \ + pand x1, x2; \ + pxor x3, x4; \ + pxor x3, x0; \ + pxor x2, x3; + +#define S4(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pand x0, x3; \ + pxor x4, x0; \ + pxor x2, x3; \ + por x4, x2; \ + pxor x1, x0; \ + pxor x3, x4; \ + por x0, x2; \ + pxor x1, x2; \ + pand x0, x1; \ + pxor x4, x1; \ + pand x2, x4; \ + pxor x3, x2; \ + pxor x0, x4; \ + por x1, x3; \ + pxor RNOT, x1; \ + pxor x0, x3; + +#define S5(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + por x0, x1; \ + pxor x1, x2; \ + pxor RNOT, x3; \ + pxor x0, x4; \ + pxor x2, x0; \ + pand x4, x1; \ + por x3, x4; \ + pxor x0, x4; \ + pand x3, x0; \ + pxor x3, x1; \ + pxor x2, x3; \ + pxor x1, x0; \ + pand x4, x2; \ + pxor x2, x1; \ + pand x0, x2; \ + pxor x2, x3; + +#define S6(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x0, x3; \ + pxor x2, x1; \ + pxor x0, x2; \ + pand x3, x0; \ + por x3, x1; \ + pxor RNOT, x4; \ + pxor x1, x0; \ + pxor x2, x1; \ + pxor x4, x3; \ + pxor x0, x4; \ + pand x0, x2; \ + pxor x1, x4; \ + pxor x3, x2; \ + pand x1, x3; \ + pxor x0, x3; \ + pxor x2, x1; + +#define S7(x0, x1, x2, x3, x4) \ + pxor RNOT, x1; \ + movdqa x1, x4; \ + pxor RNOT, x0; \ + pand x2, x1; \ + pxor x3, x1; \ + por x4, x3; \ + pxor x2, x4; \ + pxor x3, x2; \ + pxor x0, x3; \ + por x1, x0; \ + pand x0, x2; \ + pxor x4, x0; \ + pxor x3, x4; \ + pand x0, x3; \ + pxor x1, x4; \ + pxor x4, x2; \ + pxor x1, x3; \ + por x0, x4; \ + pxor x1, x4; + +#define SI0(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pxor x0, x1; \ + por x1, x3; \ + pxor x1, x4; \ + pxor RNOT, x0; \ + pxor x3, x2; \ + pxor x0, x3; \ + pand x1, x0; \ + pxor x2, x0; \ + pand x3, x2; \ + pxor x4, x3; \ + pxor x3, x2; \ + pxor x3, x1; \ + pand x0, x3; \ + pxor x0, x1; \ + pxor x2, x0; \ + pxor x3, x4; + +#define SI1(x0, x1, x2, x3, x4) \ + pxor x3, x1; \ + movdqa x0, x4; \ + pxor x2, x0; \ + pxor RNOT, x2; \ + por x1, x4; \ + pxor x3, x4; \ + pand x1, x3; \ + pxor x2, x1; \ + pand x4, x2; \ + pxor x1, x4; \ + por x3, x1; \ + pxor x0, x3; \ + pxor x0, x2; \ + por x4, x0; \ + pxor x4, x2; \ + pxor x0, x1; \ + pxor x1, x4; + +#define SI2(x0, x1, x2, x3, x4) \ + pxor x1, x2; \ + movdqa x3, x4; \ + pxor RNOT, x3; \ + por x2, x3; \ + pxor x4, x2; \ + pxor x0, x4; \ + pxor x1, x3; \ + por x2, x1; \ + pxor x0, x2; \ + pxor x4, x1; \ + por x3, x4; \ + pxor x3, x2; \ + pxor x2, x4; \ + pand x1, x2; \ + pxor x3, x2; \ + pxor x4, x3; \ + pxor x0, x4; + +#define SI3(x0, x1, x2, x3, x4) \ + pxor x1, x2; \ + movdqa x1, x4; \ + pand x2, x1; \ + pxor x0, x1; \ + por x4, x0; \ + pxor x3, x4; \ + pxor x3, x0; \ + por x1, x3; \ + pxor x2, x1; \ + pxor x3, x1; \ + pxor x2, x0; \ + pxor x3, x2; \ + pand x1, x3; \ + pxor x0, x1; \ + pand x2, x0; \ + pxor x3, x4; \ + pxor x0, x3; \ + pxor x1, x0; + +#define SI4(x0, x1, x2, x3, x4) \ + pxor x3, x2; \ + movdqa x0, x4; \ + pand x1, x0; \ + pxor x2, x0; \ + por x3, x2; \ + pxor RNOT, x4; \ + pxor x0, x1; \ + pxor x2, x0; \ + pand x4, x2; \ + pxor x0, x2; \ + por x4, x0; \ + pxor x3, x0; \ + pand x2, x3; \ + pxor x3, x4; \ + pxor x1, x3; \ + pand x0, x1; \ + pxor x1, x4; \ + pxor x3, x0; + +#define SI5(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + por x2, x1; \ + pxor x4, x2; \ + pxor x3, x1; \ + pand x4, x3; \ + pxor x3, x2; \ + por x0, x3; \ + pxor RNOT, x0; \ + pxor x2, x3; \ + por x0, x2; \ + pxor x1, x4; \ + pxor x4, x2; \ + pand x0, x4; \ + pxor x1, x0; \ + pxor x3, x1; \ + pand x2, x0; \ + pxor x3, x2; \ + pxor x2, x0; \ + pxor x4, x2; \ + pxor x3, x4; + +#define SI6(x0, x1, x2, x3, x4) \ + pxor x2, x0; \ + movdqa x0, x4; \ + pand x3, x0; \ + pxor x3, x2; \ + pxor x2, x0; \ + pxor x1, x3; \ + por x4, x2; \ + pxor x3, x2; \ + pand x0, x3; \ + pxor RNOT, x0; \ + pxor x1, x3; \ + pand x2, x1; \ + pxor x0, x4; \ + pxor x4, x3; \ + pxor x2, x4; \ + pxor x1, x0; \ + pxor x0, x2; + +#define SI7(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pand x0, x3; \ + pxor x2, x0; \ + por x4, x2; \ + pxor x1, x4; \ + pxor RNOT, x0; \ + por x3, x1; \ + pxor x0, x4; \ + pand x2, x0; \ + pxor x1, x0; \ + pand x2, x1; \ + pxor x2, x3; \ + pxor x3, x4; \ + pand x3, x2; \ + por x0, x3; \ + pxor x4, x1; \ + pxor x4, x3; \ + pand x0, x4; \ + pxor x2, x4; + +#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ + movdqa x2, t3; \ + movdqa x0, t1; \ + unpcklps x3, t3; \ + movdqa x0, t2; \ + unpcklps x1, t1; \ + unpckhps x1, t2; \ + movdqa t3, x1; \ + unpckhps x3, x2; \ + movdqa t1, x0; \ + movhlps t1, x1; \ + movdqa t2, t1; \ + movlhps t3, x0; \ + movlhps x2, t1; \ + movhlps t2, x2; \ + movdqa x2, x3; \ + movdqa t1, x2; + +#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ + movdqu (0*4*4)(in), x0; \ + movdqu (1*4*4)(in), x1; \ + movdqu (2*4*4)(in), x2; \ + movdqu (3*4*4)(in), x3; \ + \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + movdqu x0, (0*4*4)(out); \ + movdqu x1, (1*4*4)(out); \ + movdqu x2, (2*4*4)(out); \ + movdqu x3, (3*4*4)(out); + +#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + movdqu (0*4*4)(out), t0; \ + pxor t0, x0; \ + movdqu x0, (0*4*4)(out); \ + movdqu (1*4*4)(out), t0; \ + pxor t0, x1; \ + movdqu x1, (1*4*4)(out); \ + movdqu (2*4*4)(out), t0; \ + pxor t0, x2; \ + movdqu x2, (2*4*4)(out); \ + movdqu (3*4*4)(out), t0; \ + pxor t0, x3; \ + movdqu x3, (3*4*4)(out); + +.align 8 +.global __serpent_enc_blk_4way +.type __serpent_enc_blk_4way,@function; + +__serpent_enc_blk_4way: + /* input: + * arg_ctx(%esp): ctx, CTX + * arg_dst(%esp): dst + * arg_src(%esp): src + * arg_xor(%esp): bool, if true: xor output + */ + + pcmpeqd RNOT, RNOT; + + movl arg_ctx(%esp), CTX; + + movl arg_src(%esp), %eax; + read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); + + K(RA, RB, RC, RD, RE, 0); + S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1); + S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2); + S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3); + S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4); + S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5); + S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6); + S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7); + S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8); + S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9); + S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10); + S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11); + S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12); + S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13); + S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14); + S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15); + S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16); + S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17); + S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18); + S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19); + S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20); + S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21); + S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22); + S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23); + S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24); + S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25); + S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26); + S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27); + S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28); + S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29); + S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30); + S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31); + S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32); + + movl arg_dst(%esp), %eax; + + cmpb $0, arg_xor(%esp); + jnz __enc_xor4; + + write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); + + ret; + +__enc_xor4: + xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); + + ret; + +.align 8 +.global serpent_dec_blk_4way +.type serpent_dec_blk_4way,@function; + +serpent_dec_blk_4way: + /* input: + * arg_ctx(%esp): ctx, CTX + * arg_dst(%esp): dst + * arg_src(%esp): src + */ + + pcmpeqd RNOT, RNOT; + + movl arg_ctx(%esp), CTX; + + movl arg_src(%esp), %eax; + read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); + + K(RA, RB, RC, RD, RE, 32); + SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31); + SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30); + SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29); + SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28); + SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27); + SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26); + SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25); + SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24); + SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23); + SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22); + SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21); + SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20); + SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19); + SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18); + SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17); + SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16); + SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15); + SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14); + SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13); + SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12); + SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11); + SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10); + SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9); + SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8); + SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7); + SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6); + SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5); + SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4); + SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3); + SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2); + SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1); + SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0); + + movl arg_dst(%esp), %eax; + write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA); + + ret; diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S new file mode 100644 index 0000000..7f24a15 --- /dev/null +++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S @@ -0,0 +1,761 @@ +/* + * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2) + * + * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * Based on crypto/serpent.c by + * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no> + * 2003 Herbert Valerio Riedel <hvr@gnu.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "serpent-sse2-x86_64-asm_64.S" +.text + +#define CTX %rdi + +/********************************************************************** + 8-way SSE2 serpent + **********************************************************************/ +#define RA1 %xmm0 +#define RB1 %xmm1 +#define RC1 %xmm2 +#define RD1 %xmm3 +#define RE1 %xmm4 + +#define RA2 %xmm5 +#define RB2 %xmm6 +#define RC2 %xmm7 +#define RD2 %xmm8 +#define RE2 %xmm9 + +#define RNOT %xmm10 + +#define RK0 %xmm11 +#define RK1 %xmm12 +#define RK2 %xmm13 +#define RK3 %xmm14 + +#define S0_1(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + por x0, x3; \ + pxor x4, x0; \ + pxor x2, x4; \ + pxor RNOT, x4; \ + pxor x1, x3; \ + pand x0, x1; \ + pxor x4, x1; \ + pxor x0, x2; +#define S0_2(x0, x1, x2, x3, x4) \ + pxor x3, x0; \ + por x0, x4; \ + pxor x2, x0; \ + pand x1, x2; \ + pxor x2, x3; \ + pxor RNOT, x1; \ + pxor x4, x2; \ + pxor x2, x1; + +#define S1_1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x0, x1; \ + pxor x3, x0; \ + pxor RNOT, x3; \ + pand x1, x4; \ + por x1, x0; \ + pxor x2, x3; \ + pxor x3, x0; \ + pxor x3, x1; +#define S1_2(x0, x1, x2, x3, x4) \ + pxor x4, x3; \ + por x4, x1; \ + pxor x2, x4; \ + pand x0, x2; \ + pxor x1, x2; \ + por x0, x1; \ + pxor RNOT, x0; \ + pxor x2, x0; \ + pxor x1, x4; + +#define S2_1(x0, x1, x2, x3, x4) \ + pxor RNOT, x3; \ + pxor x0, x1; \ + movdqa x0, x4; \ + pand x2, x0; \ + pxor x3, x0; \ + por x4, x3; \ + pxor x1, x2; \ + pxor x1, x3; \ + pand x0, x1; +#define S2_2(x0, x1, x2, x3, x4) \ + pxor x2, x0; \ + pand x3, x2; \ + por x1, x3; \ + pxor RNOT, x0; \ + pxor x0, x3; \ + pxor x0, x4; \ + pxor x2, x0; \ + por x2, x1; + +#define S3_1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x3, x1; \ + por x0, x3; \ + pand x0, x4; \ + pxor x2, x0; \ + pxor x1, x2; \ + pand x3, x1; \ + pxor x3, x2; \ + por x4, x0; \ + pxor x3, x4; +#define S3_2(x0, x1, x2, x3, x4) \ + pxor x0, x1; \ + pand x3, x0; \ + pand x4, x3; \ + pxor x2, x3; \ + por x1, x4; \ + pand x1, x2; \ + pxor x3, x4; \ + pxor x3, x0; \ + pxor x2, x3; + +#define S4_1(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pand x0, x3; \ + pxor x4, x0; \ + pxor x2, x3; \ + por x4, x2; \ + pxor x1, x0; \ + pxor x3, x4; \ + por x0, x2; \ + pxor x1, x2; +#define S4_2(x0, x1, x2, x3, x4) \ + pand x0, x1; \ + pxor x4, x1; \ + pand x2, x4; \ + pxor x3, x2; \ + pxor x0, x4; \ + por x1, x3; \ + pxor RNOT, x1; \ + pxor x0, x3; + +#define S5_1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + por x0, x1; \ + pxor x1, x2; \ + pxor RNOT, x3; \ + pxor x0, x4; \ + pxor x2, x0; \ + pand x4, x1; \ + por x3, x4; \ + pxor x0, x4; +#define S5_2(x0, x1, x2, x3, x4) \ + pand x3, x0; \ + pxor x3, x1; \ + pxor x2, x3; \ + pxor x1, x0; \ + pand x4, x2; \ + pxor x2, x1; \ + pand x0, x2; \ + pxor x2, x3; + +#define S6_1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x0, x3; \ + pxor x2, x1; \ + pxor x0, x2; \ + pand x3, x0; \ + por x3, x1; \ + pxor RNOT, x4; \ + pxor x1, x0; \ + pxor x2, x1; +#define S6_2(x0, x1, x2, x3, x4) \ + pxor x4, x3; \ + pxor x0, x4; \ + pand x0, x2; \ + pxor x1, x4; \ + pxor x3, x2; \ + pand x1, x3; \ + pxor x0, x3; \ + pxor x2, x1; + +#define S7_1(x0, x1, x2, x3, x4) \ + pxor RNOT, x1; \ + movdqa x1, x4; \ + pxor RNOT, x0; \ + pand x2, x1; \ + pxor x3, x1; \ + por x4, x3; \ + pxor x2, x4; \ + pxor x3, x2; \ + pxor x0, x3; \ + por x1, x0; +#define S7_2(x0, x1, x2, x3, x4) \ + pand x0, x2; \ + pxor x4, x0; \ + pxor x3, x4; \ + pand x0, x3; \ + pxor x1, x4; \ + pxor x4, x2; \ + pxor x1, x3; \ + por x0, x4; \ + pxor x1, x4; + +#define SI0_1(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pxor x0, x1; \ + por x1, x3; \ + pxor x1, x4; \ + pxor RNOT, x0; \ + pxor x3, x2; \ + pxor x0, x3; \ + pand x1, x0; \ + pxor x2, x0; +#define SI0_2(x0, x1, x2, x3, x4) \ + pand x3, x2; \ + pxor x4, x3; \ + pxor x3, x2; \ + pxor x3, x1; \ + pand x0, x3; \ + pxor x0, x1; \ + pxor x2, x0; \ + pxor x3, x4; + +#define SI1_1(x0, x1, x2, x3, x4) \ + pxor x3, x1; \ + movdqa x0, x4; \ + pxor x2, x0; \ + pxor RNOT, x2; \ + por x1, x4; \ + pxor x3, x4; \ + pand x1, x3; \ + pxor x2, x1; \ + pand x4, x2; +#define SI1_2(x0, x1, x2, x3, x4) \ + pxor x1, x4; \ + por x3, x1; \ + pxor x0, x3; \ + pxor x0, x2; \ + por x4, x0; \ + pxor x4, x2; \ + pxor x0, x1; \ + pxor x1, x4; + +#define SI2_1(x0, x1, x2, x3, x4) \ + pxor x1, x2; \ + movdqa x3, x4; \ + pxor RNOT, x3; \ + por x2, x3; \ + pxor x4, x2; \ + pxor x0, x4; \ + pxor x1, x3; \ + por x2, x1; \ + pxor x0, x2; +#define SI2_2(x0, x1, x2, x3, x4) \ + pxor x4, x1; \ + por x3, x4; \ + pxor x3, x2; \ + pxor x2, x4; \ + pand x1, x2; \ + pxor x3, x2; \ + pxor x4, x3; \ + pxor x0, x4; + +#define SI3_1(x0, x1, x2, x3, x4) \ + pxor x1, x2; \ + movdqa x1, x4; \ + pand x2, x1; \ + pxor x0, x1; \ + por x4, x0; \ + pxor x3, x4; \ + pxor x3, x0; \ + por x1, x3; \ + pxor x2, x1; +#define SI3_2(x0, x1, x2, x3, x4) \ + pxor x3, x1; \ + pxor x2, x0; \ + pxor x3, x2; \ + pand x1, x3; \ + pxor x0, x1; \ + pand x2, x0; \ + pxor x3, x4; \ + pxor x0, x3; \ + pxor x1, x0; + +#define SI4_1(x0, x1, x2, x3, x4) \ + pxor x3, x2; \ + movdqa x0, x4; \ + pand x1, x0; \ + pxor x2, x0; \ + por x3, x2; \ + pxor RNOT, x4; \ + pxor x0, x1; \ + pxor x2, x0; \ + pand x4, x2; +#define SI4_2(x0, x1, x2, x3, x4) \ + pxor x0, x2; \ + por x4, x0; \ + pxor x3, x0; \ + pand x2, x3; \ + pxor x3, x4; \ + pxor x1, x3; \ + pand x0, x1; \ + pxor x1, x4; \ + pxor x3, x0; + +#define SI5_1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + por x2, x1; \ + pxor x4, x2; \ + pxor x3, x1; \ + pand x4, x3; \ + pxor x3, x2; \ + por x0, x3; \ + pxor RNOT, x0; \ + pxor x2, x3; \ + por x0, x2; +#define SI5_2(x0, x1, x2, x3, x4) \ + pxor x1, x4; \ + pxor x4, x2; \ + pand x0, x4; \ + pxor x1, x0; \ + pxor x3, x1; \ + pand x2, x0; \ + pxor x3, x2; \ + pxor x2, x0; \ + pxor x4, x2; \ + pxor x3, x4; + +#define SI6_1(x0, x1, x2, x3, x4) \ + pxor x2, x0; \ + movdqa x0, x4; \ + pand x3, x0; \ + pxor x3, x2; \ + pxor x2, x0; \ + pxor x1, x3; \ + por x4, x2; \ + pxor x3, x2; \ + pand x0, x3; +#define SI6_2(x0, x1, x2, x3, x4) \ + pxor RNOT, x0; \ + pxor x1, x3; \ + pand x2, x1; \ + pxor x0, x4; \ + pxor x4, x3; \ + pxor x2, x4; \ + pxor x1, x0; \ + pxor x0, x2; + +#define SI7_1(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pand x0, x3; \ + pxor x2, x0; \ + por x4, x2; \ + pxor x1, x4; \ + pxor RNOT, x0; \ + por x3, x1; \ + pxor x0, x4; \ + pand x2, x0; \ + pxor x1, x0; +#define SI7_2(x0, x1, x2, x3, x4) \ + pand x2, x1; \ + pxor x2, x3; \ + pxor x3, x4; \ + pand x3, x2; \ + por x0, x3; \ + pxor x4, x1; \ + pxor x4, x3; \ + pand x0, x4; \ + pxor x2, x4; + +#define get_key(i, j, t) \ + movd (4*(i)+(j))*4(CTX), t; \ + pshufd $0, t, t; + +#define K2(x0, x1, x2, x3, x4, i) \ + get_key(i, 0, RK0); \ + get_key(i, 1, RK1); \ + get_key(i, 2, RK2); \ + get_key(i, 3, RK3); \ + pxor RK0, x0 ## 1; \ + pxor RK1, x1 ## 1; \ + pxor RK2, x2 ## 1; \ + pxor RK3, x3 ## 1; \ + pxor RK0, x0 ## 2; \ + pxor RK1, x1 ## 2; \ + pxor RK2, x2 ## 2; \ + pxor RK3, x3 ## 2; + +#define LK2(x0, x1, x2, x3, x4, i) \ + movdqa x0 ## 1, x4 ## 1; \ + pslld $13, x0 ## 1; \ + psrld $(32 - 13), x4 ## 1; \ + por x4 ## 1, x0 ## 1; \ + pxor x0 ## 1, x1 ## 1; \ + movdqa x2 ## 1, x4 ## 1; \ + pslld $3, x2 ## 1; \ + psrld $(32 - 3), x4 ## 1; \ + por x4 ## 1, x2 ## 1; \ + pxor x2 ## 1, x1 ## 1; \ + movdqa x0 ## 2, x4 ## 2; \ + pslld $13, x0 ## 2; \ + psrld $(32 - 13), x4 ## 2; \ + por x4 ## 2, x0 ## 2; \ + pxor x0 ## 2, x1 ## 2; \ + movdqa x2 ## 2, x4 ## 2; \ + pslld $3, x2 ## 2; \ + psrld $(32 - 3), x4 ## 2; \ + por x4 ## 2, x2 ## 2; \ + pxor x2 ## 2, x1 ## 2; \ + movdqa x1 ## 1, x4 ## 1; \ + pslld $1, x1 ## 1; \ + psrld $(32 - 1), x4 ## 1; \ + por x4 ## 1, x1 ## 1; \ + movdqa x0 ## 1, x4 ## 1; \ + pslld $3, x4 ## 1; \ + pxor x2 ## 1, x3 ## 1; \ + pxor x4 ## 1, x3 ## 1; \ + movdqa x3 ## 1, x4 ## 1; \ + get_key(i, 1, RK1); \ + movdqa x1 ## 2, x4 ## 2; \ + pslld $1, x1 ## 2; \ + psrld $(32 - 1), x4 ## 2; \ + por x4 ## 2, x1 ## 2; \ + movdqa x0 ## 2, x4 ## 2; \ + pslld $3, x4 ## 2; \ + pxor x2 ## 2, x3 ## 2; \ + pxor x4 ## 2, x3 ## 2; \ + movdqa x3 ## 2, x4 ## 2; \ + get_key(i, 3, RK3); \ + pslld $7, x3 ## 1; \ + psrld $(32 - 7), x4 ## 1; \ + por x4 ## 1, x3 ## 1; \ + movdqa x1 ## 1, x4 ## 1; \ + pslld $7, x4 ## 1; \ + pxor x1 ## 1, x0 ## 1; \ + pxor x3 ## 1, x0 ## 1; \ + pxor x3 ## 1, x2 ## 1; \ + pxor x4 ## 1, x2 ## 1; \ + get_key(i, 0, RK0); \ + pslld $7, x3 ## 2; \ + psrld $(32 - 7), x4 ## 2; \ + por x4 ## 2, x3 ## 2; \ + movdqa x1 ## 2, x4 ## 2; \ + pslld $7, x4 ## 2; \ + pxor x1 ## 2, x0 ## 2; \ + pxor x3 ## 2, x0 ## 2; \ + pxor x3 ## 2, x2 ## 2; \ + pxor x4 ## 2, x2 ## 2; \ + get_key(i, 2, RK2); \ + pxor RK1, x1 ## 1; \ + pxor RK3, x3 ## 1; \ + movdqa x0 ## 1, x4 ## 1; \ + pslld $5, x0 ## 1; \ + psrld $(32 - 5), x4 ## 1; \ + por x4 ## 1, x0 ## 1; \ + movdqa x2 ## 1, x4 ## 1; \ + pslld $22, x2 ## 1; \ + psrld $(32 - 22), x4 ## 1; \ + por x4 ## 1, x2 ## 1; \ + pxor RK0, x0 ## 1; \ + pxor RK2, x2 ## 1; \ + pxor RK1, x1 ## 2; \ + pxor RK3, x3 ## 2; \ + movdqa x0 ## 2, x4 ## 2; \ + pslld $5, x0 ## 2; \ + psrld $(32 - 5), x4 ## 2; \ + por x4 ## 2, x0 ## 2; \ + movdqa x2 ## 2, x4 ## 2; \ + pslld $22, x2 ## 2; \ + psrld $(32 - 22), x4 ## 2; \ + por x4 ## 2, x2 ## 2; \ + pxor RK0, x0 ## 2; \ + pxor RK2, x2 ## 2; + +#define KL2(x0, x1, x2, x3, x4, i) \ + pxor RK0, x0 ## 1; \ + pxor RK2, x2 ## 1; \ + movdqa x0 ## 1, x4 ## 1; \ + psrld $5, x0 ## 1; \ + pslld $(32 - 5), x4 ## 1; \ + por x4 ## 1, x0 ## 1; \ + pxor RK3, x3 ## 1; \ + pxor RK1, x1 ## 1; \ + movdqa x2 ## 1, x4 ## 1; \ + psrld $22, x2 ## 1; \ + pslld $(32 - 22), x4 ## 1; \ + por x4 ## 1, x2 ## 1; \ + pxor x3 ## 1, x2 ## 1; \ + pxor RK0, x0 ## 2; \ + pxor RK2, x2 ## 2; \ + movdqa x0 ## 2, x4 ## 2; \ + psrld $5, x0 ## 2; \ + pslld $(32 - 5), x4 ## 2; \ + por x4 ## 2, x0 ## 2; \ + pxor RK3, x3 ## 2; \ + pxor RK1, x1 ## 2; \ + movdqa x2 ## 2, x4 ## 2; \ + psrld $22, x2 ## 2; \ + pslld $(32 - 22), x4 ## 2; \ + por x4 ## 2, x2 ## 2; \ + pxor x3 ## 2, x2 ## 2; \ + pxor x3 ## 1, x0 ## 1; \ + movdqa x1 ## 1, x4 ## 1; \ + pslld $7, x4 ## 1; \ + pxor x1 ## 1, x0 ## 1; \ + pxor x4 ## 1, x2 ## 1; \ + movdqa x1 ## 1, x4 ## 1; \ + psrld $1, x1 ## 1; \ + pslld $(32 - 1), x4 ## 1; \ + por x4 ## 1, x1 ## 1; \ + pxor x3 ## 2, x0 ## 2; \ + movdqa x1 ## 2, x4 ## 2; \ + pslld $7, x4 ## 2; \ + pxor x1 ## 2, x0 ## 2; \ + pxor x4 ## 2, x2 ## 2; \ + movdqa x1 ## 2, x4 ## 2; \ + psrld $1, x1 ## 2; \ + pslld $(32 - 1), x4 ## 2; \ + por x4 ## 2, x1 ## 2; \ + movdqa x3 ## 1, x4 ## 1; \ + psrld $7, x3 ## 1; \ + pslld $(32 - 7), x4 ## 1; \ + por x4 ## 1, x3 ## 1; \ + pxor x0 ## 1, x1 ## 1; \ + movdqa x0 ## 1, x4 ## 1; \ + pslld $3, x4 ## 1; \ + pxor x4 ## 1, x3 ## 1; \ + movdqa x0 ## 1, x4 ## 1; \ + movdqa x3 ## 2, x4 ## 2; \ + psrld $7, x3 ## 2; \ + pslld $(32 - 7), x4 ## 2; \ + por x4 ## 2, x3 ## 2; \ + pxor x0 ## 2, x1 ## 2; \ + movdqa x0 ## 2, x4 ## 2; \ + pslld $3, x4 ## 2; \ + pxor x4 ## 2, x3 ## 2; \ + movdqa x0 ## 2, x4 ## 2; \ + psrld $13, x0 ## 1; \ + pslld $(32 - 13), x4 ## 1; \ + por x4 ## 1, x0 ## 1; \ + pxor x2 ## 1, x1 ## 1; \ + pxor x2 ## 1, x3 ## 1; \ + movdqa x2 ## 1, x4 ## 1; \ + psrld $3, x2 ## 1; \ + pslld $(32 - 3), x4 ## 1; \ + por x4 ## 1, x2 ## 1; \ + psrld $13, x0 ## 2; \ + pslld $(32 - 13), x4 ## 2; \ + por x4 ## 2, x0 ## 2; \ + pxor x2 ## 2, x1 ## 2; \ + pxor x2 ## 2, x3 ## 2; \ + movdqa x2 ## 2, x4 ## 2; \ + psrld $3, x2 ## 2; \ + pslld $(32 - 3), x4 ## 2; \ + por x4 ## 2, x2 ## 2; + +#define S(SBOX, x0, x1, x2, x3, x4) \ + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); + +#define SP(SBOX, x0, x1, x2, x3, x4, i) \ + get_key(i, 0, RK0); \ + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + get_key(i, 2, RK2); \ + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + get_key(i, 3, RK3); \ + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + get_key(i, 1, RK1); \ + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + +#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ + movdqa x2, t3; \ + movdqa x0, t1; \ + unpcklps x3, t3; \ + movdqa x0, t2; \ + unpcklps x1, t1; \ + unpckhps x1, t2; \ + movdqa t3, x1; \ + unpckhps x3, x2; \ + movdqa t1, x0; \ + movhlps t1, x1; \ + movdqa t2, t1; \ + movlhps t3, x0; \ + movlhps x2, t1; \ + movhlps t2, x2; \ + movdqa x2, x3; \ + movdqa t1, x2; + +#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ + movdqu (0*4*4)(in), x0; \ + movdqu (1*4*4)(in), x1; \ + movdqu (2*4*4)(in), x2; \ + movdqu (3*4*4)(in), x3; \ + \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + movdqu x0, (0*4*4)(out); \ + movdqu x1, (1*4*4)(out); \ + movdqu x2, (2*4*4)(out); \ + movdqu x3, (3*4*4)(out); + +#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + movdqu (0*4*4)(out), t0; \ + pxor t0, x0; \ + movdqu x0, (0*4*4)(out); \ + movdqu (1*4*4)(out), t0; \ + pxor t0, x1; \ + movdqu x1, (1*4*4)(out); \ + movdqu (2*4*4)(out), t0; \ + pxor t0, x2; \ + movdqu x2, (2*4*4)(out); \ + movdqu (3*4*4)(out), t0; \ + pxor t0, x3; \ + movdqu x3, (3*4*4)(out); + +.align 8 +.global __serpent_enc_blk_8way +.type __serpent_enc_blk_8way,@function; + +__serpent_enc_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + + pcmpeqd RNOT, RNOT; + + leaq (4*4*4)(%rdx), %rax; + read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + K2(RA, RB, RC, RD, RE, 0); + S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); + S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2); + S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3); + S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4); + S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5); + S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6); + S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7); + S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8); + S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9); + S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10); + S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11); + S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12); + S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13); + S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14); + S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15); + S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16); + S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17); + S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18); + S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19); + S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20); + S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21); + S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22); + S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23); + S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24); + S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25); + S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26); + S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27); + S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28); + S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29); + S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30); + S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); + S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); + + leaq (4*4*4)(%rsi), %rax; + + testb %cl, %cl; + jnz __enc_xor8; + + write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + ret; + +__enc_xor8: + xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + ret; + +.align 8 +.global serpent_dec_blk_8way +.type serpent_dec_blk_8way,@function; + +serpent_dec_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + pcmpeqd RNOT, RNOT; + + leaq (4*4*4)(%rdx), %rax; + read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + K2(RA, RB, RC, RD, RE, 32); + SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); + SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30); + SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29); + SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28); + SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27); + SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26); + SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25); + SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24); + SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23); + SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22); + SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21); + SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20); + SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19); + SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18); + SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17); + SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16); + SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15); + SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14); + SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13); + SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12); + SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11); + SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10); + SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9); + SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8); + SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7); + SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6); + SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5); + SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4); + SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3); + SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2); + SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); + S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); + + leaq (4*4*4)(%rsi), %rax; + write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); + write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); + + ret; diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c new file mode 100644 index 0000000..7955a9b --- /dev/null +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -0,0 +1,1070 @@ +/* + * Glue Code for SSE2 assembler versions of Serpent Cipher + * + * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * Glue code based on aesni-intel_glue.c by: + * Copyright (C) 2008, Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include <linux/module.h> +#include <linux/hardirq.h> +#include <linux/types.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <crypto/algapi.h> +#include <crypto/serpent.h> +#include <crypto/cryptd.h> +#include <crypto/b128ops.h> +#include <crypto/ctr.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> +#include <asm/i387.h> +#include <asm/serpent.h> +#include <crypto/scatterwalk.h> +#include <linux/workqueue.h> +#include <linux/spinlock.h> + +struct async_serpent_ctx { + struct cryptd_ablkcipher *cryptd_tfm; +}; + +static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + if (fpu_enabled) + return true; + + /* SSE2 is only used when chunk to be processed is large enough, so + * do not enable FPU until it is necessary. + */ + if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS) + return false; + + kernel_fpu_begin(); + return true; +} + +static inline void serpent_fpu_end(bool fpu_enabled) +{ + if (fpu_enabled) + kernel_fpu_end(); +} + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + bool enc) +{ + bool fpu_enabled = false; + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); + + /* Process multi-block batch */ + if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { + do { + if (enc) + serpent_enc_blk_xway(ctx, wdst, wsrc); + else + serpent_dec_blk_xway(ctx, wdst, wsrc); + + wsrc += bsize * SERPENT_PARALLEL_BLOCKS; + wdst += bsize * SERPENT_PARALLEL_BLOCKS; + nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; + } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (enc) + __serpent_encrypt(ctx, wdst, wsrc); + else + __serpent_decrypt(ctx, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + serpent_fpu_end(fpu_enabled); + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, true); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, false); +} + +static struct crypto_alg blk_ecb_alg = { + .cra_name = "__ecb-serpent-sse2", + .cra_driver_name = "__driver-ecb-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}; + +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 *iv = (u128 *)walk->iv; + + do { + u128_xor(dst, src, iv); + __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; + u128 last_iv; + int i; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process multi-block batch */ + if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { + do { + nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1); + src -= SERPENT_PARALLEL_BLOCKS - 1; + dst -= SERPENT_PARALLEL_BLOCKS - 1; + + for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) + ivs[i] = src[i]; + + serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + + for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) + u128_xor(dst + (i + 1), dst + (i + 1), ivs + i); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + for (;;) { + __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } + +done: + u128_xor(dst, dst, (u128 *)walk->iv); + *(u128 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk.nbytes)) { + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); + nbytes = __cbc_decrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + serpent_fpu_end(fpu_enabled); + return err; +} + +static struct crypto_alg blk_cbc_alg = { + .cra_name = "__cbc-serpent-sse2", + .cra_driver_name = "__driver-cbc-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}; + +static inline void u128_to_be128(be128 *dst, const u128 *src) +{ + dst->a = cpu_to_be64(src->a); + dst->b = cpu_to_be64(src->b); +} + +static inline void be128_to_u128(u128 *dst, const be128 *src) +{ + dst->a = be64_to_cpu(src->a); + dst->b = be64_to_cpu(src->b); +} + +static inline void u128_inc(u128 *i) +{ + i->b++; + if (!i->b) + i->a++; +} + +static void ctr_crypt_final(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 *ctrblk = walk->iv; + u8 keystream[SERPENT_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + __serpent_encrypt(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + + crypto_inc(ctrblk, SERPENT_BLOCK_SIZE); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ctrblk; + be128 ctrblocks[SERPENT_PARALLEL_BLOCKS]; + int i; + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + + /* Process multi-block batch */ + if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { + do { + /* create ctrblks for parallel encrypt */ + for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; + + u128_to_be128(&ctrblocks[i], &ctrblk); + u128_inc(&ctrblk); + } + + serpent_enc_blk_xway_xor(ctx, (u8 *)dst, + (u8 *)ctrblocks); + + src += SERPENT_PARALLEL_BLOCKS; + dst += SERPENT_PARALLEL_BLOCKS; + nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; + } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (dst != src) + *dst = *src; + + u128_to_be128(&ctrblocks[0], &ctrblk); + u128_inc(&ctrblk); + + __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); + u128_xor(dst, dst, (u128 *)ctrblocks); + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + u128_to_be128((be128 *)walk->iv, &ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) { + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + serpent_fpu_end(fpu_enabled); + + if (walk.nbytes) { + ctr_crypt_final(desc, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +static struct crypto_alg blk_ctr_alg = { + .cra_name = "__ctr-serpent-sse2", + .cra_driver_name = "__driver-ctr-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}; + +struct crypt_priv { + struct serpent_ctx *ctx; + bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = SERPENT_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { + serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __serpent_encrypt(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = SERPENT_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { + serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __serpent_decrypt(ctx->ctx, srcdst, srcdst); +} + +struct serpent_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct serpent_ctx serpent_ctx; +}; + +static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = __serpent_setkey(&ctx->serpent_ctx, key, keylen - + SERPENT_BLOCK_SIZE); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, key + keylen - + SERPENT_BLOCK_SIZE); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->serpent_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->serpent_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static void lrw_exit_tfm(struct crypto_tfm *tfm) +{ + struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +static struct crypto_alg blk_lrw_alg = { + .cra_name = "__lrw-serpent-sse2", + .cra_driver_name = "__driver-lrw-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list), + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = lrw_serpent_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}; + +struct serpent_xts_ctx { + struct serpent_ctx tweak_ctx; + struct serpent_ctx crypt_ctx; +}; + +static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static struct crypto_alg blk_xts_alg = { + .cra_name = "__xts-serpent-sse2", + .cra_driver_name = "__driver-xts-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = xts_serpent_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}; + +static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ablkcipher_setkey(child, key, key_len); + crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) + & CRYPTO_TFM_RES_MASK); + return err; +} + +static int __ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->encrypt( + &desc, req->dst, req->src, req->nbytes); +} + +static int ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_encrypt(cryptd_req); + } else { + return __ablk_encrypt(req); + } +} + +static int ablk_decrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_decrypt(cryptd_req); + } else { + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->decrypt( + &desc, req->dst, req->src, req->nbytes); + } +} + +static void ablk_exit(struct crypto_tfm *tfm) +{ + struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_free_ablkcipher(ctx->cryptd_tfm); +} + +static void ablk_init_common(struct crypto_tfm *tfm, + struct cryptd_ablkcipher *cryptd_tfm) +{ + struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->cryptd_tfm = cryptd_tfm; + tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + + crypto_ablkcipher_reqsize(&cryptd_tfm->base); +} + +static int ablk_ecb_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_ecb_alg = { + .cra_name = "ecb(serpent)", + .cra_driver_name = "ecb-serpent-sse2", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list), + .cra_init = ablk_ecb_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; + +static int ablk_cbc_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_cbc_alg = { + .cra_name = "cbc(serpent)", + .cra_driver_name = "cbc-serpent-sse2", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list), + .cra_init = ablk_cbc_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; + +static int ablk_ctr_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_ctr_alg = { + .cra_name = "ctr(serpent)", + .cra_driver_name = "ctr-serpent-sse2", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list), + .cra_init = ablk_ctr_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}; + +static int ablk_lrw_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_lrw_alg = { + .cra_name = "lrw(serpent)", + .cra_driver_name = "lrw-serpent-sse2", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list), + .cra_init = ablk_lrw_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; + +static int ablk_xts_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_xts_alg = { + .cra_name = "xts(serpent)", + .cra_driver_name = "xts-serpent-sse2", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list), + .cra_init = ablk_xts_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; + +static int __init serpent_sse2_init(void) +{ + int err; + + if (!cpu_has_xmm2) { + printk(KERN_INFO "SSE2 instructions are not detected.\n"); + return -ENODEV; + } + + err = crypto_register_alg(&blk_ecb_alg); + if (err) + goto blk_ecb_err; + err = crypto_register_alg(&blk_cbc_alg); + if (err) + goto blk_cbc_err; + err = crypto_register_alg(&blk_ctr_alg); + if (err) + goto blk_ctr_err; + err = crypto_register_alg(&ablk_ecb_alg); + if (err) + goto ablk_ecb_err; + err = crypto_register_alg(&ablk_cbc_alg); + if (err) + goto ablk_cbc_err; + err = crypto_register_alg(&ablk_ctr_alg); + if (err) + goto ablk_ctr_err; + err = crypto_register_alg(&blk_lrw_alg); + if (err) + goto blk_lrw_err; + err = crypto_register_alg(&ablk_lrw_alg); + if (err) + goto ablk_lrw_err; + err = crypto_register_alg(&blk_xts_alg); + if (err) + goto blk_xts_err; + err = crypto_register_alg(&ablk_xts_alg); + if (err) + goto ablk_xts_err; + return err; + + crypto_unregister_alg(&ablk_xts_alg); +ablk_xts_err: + crypto_unregister_alg(&blk_xts_alg); +blk_xts_err: + crypto_unregister_alg(&ablk_lrw_alg); +ablk_lrw_err: + crypto_unregister_alg(&blk_lrw_alg); +blk_lrw_err: + crypto_unregister_alg(&ablk_ctr_alg); +ablk_ctr_err: + crypto_unregister_alg(&ablk_cbc_alg); +ablk_cbc_err: + crypto_unregister_alg(&ablk_ecb_alg); +ablk_ecb_err: + crypto_unregister_alg(&blk_ctr_alg); +blk_ctr_err: + crypto_unregister_alg(&blk_cbc_alg); +blk_cbc_err: + crypto_unregister_alg(&blk_ecb_alg); +blk_ecb_err: + return err; +} + +static void __exit serpent_sse2_exit(void) +{ + crypto_unregister_alg(&ablk_xts_alg); + crypto_unregister_alg(&blk_xts_alg); + crypto_unregister_alg(&ablk_lrw_alg); + crypto_unregister_alg(&blk_lrw_alg); + crypto_unregister_alg(&ablk_ctr_alg); + crypto_unregister_alg(&ablk_cbc_alg); + crypto_unregister_alg(&ablk_ecb_alg); + crypto_unregister_alg(&blk_ctr_alg); + crypto_unregister_alg(&blk_cbc_alg); + crypto_unregister_alg(&blk_ecb_alg); +} + +module_init(serpent_sse2_init); +module_exit(serpent_sse2_exit); + +MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("serpent"); diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 5ede9c4..7fee8c1 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -32,6 +32,8 @@ #include <crypto/algapi.h> #include <crypto/twofish.h> #include <crypto/b128ops.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> /* regular block cipher functions from twofish_x86_64 module */ asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, @@ -432,6 +434,209 @@ static struct crypto_alg blk_ctr_alg = { }, }; +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = TF_BLOCK_SIZE; + struct twofish_ctx *ctx = priv; + int i; + + if (nbytes == 3 * bsize) { + twofish_enc_blk_3way(ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + twofish_enc_blk(ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = TF_BLOCK_SIZE; + struct twofish_ctx *ctx = priv; + int i; + + if (nbytes == 3 * bsize) { + twofish_dec_blk_3way(ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + twofish_dec_blk(ctx, srcdst, srcdst); +} + +struct twofish_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct twofish_ctx twofish_ctx; +}; + +static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = __twofish_setkey(&ctx->twofish_ctx, key, keylen - TF_BLOCK_SIZE, + &tfm->crt_flags); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[3]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &ctx->twofish_ctx, + .crypt_fn = encrypt_callback, + }; + + return lrw_crypt(desc, dst, src, nbytes, &req); +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[3]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &ctx->twofish_ctx, + .crypt_fn = decrypt_callback, + }; + + return lrw_crypt(desc, dst, src, nbytes, &req); +} + +static void lrw_exit_tfm(struct crypto_tfm *tfm) +{ + struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +static struct crypto_alg blk_lrw_alg = { + .cra_name = "lrw(twofish)", + .cra_driver_name = "lrw-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list), + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE, + .max_keysize = TF_MAX_KEY_SIZE + TF_BLOCK_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = lrw_twofish_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}; + +struct twofish_xts_ctx { + struct twofish_ctx tweak_ctx; + struct twofish_ctx crypt_ctx; +}; + +static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, + flags); +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[3]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), + .crypt_ctx = &ctx->crypt_ctx, + .crypt_fn = encrypt_callback, + }; + + return xts_crypt(desc, dst, src, nbytes, &req); +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[3]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), + .crypt_ctx = &ctx->crypt_ctx, + .crypt_fn = decrypt_callback, + }; + + return xts_crypt(desc, dst, src, nbytes, &req); +} + +static struct crypto_alg blk_xts_alg = { + .cra_name = "xts(twofish)", + .cra_driver_name = "xts-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE * 2, + .max_keysize = TF_MAX_KEY_SIZE * 2, + .ivsize = TF_BLOCK_SIZE, + .setkey = xts_twofish_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}; + int __init init(void) { int err; @@ -445,9 +650,20 @@ int __init init(void) err = crypto_register_alg(&blk_ctr_alg); if (err) goto ctr_err; + err = crypto_register_alg(&blk_lrw_alg); + if (err) + goto blk_lrw_err; + err = crypto_register_alg(&blk_xts_alg); + if (err) + goto blk_xts_err; return 0; + crypto_unregister_alg(&blk_xts_alg); +blk_xts_err: + crypto_unregister_alg(&blk_lrw_alg); +blk_lrw_err: + crypto_unregister_alg(&blk_ctr_alg); ctr_err: crypto_unregister_alg(&blk_cbc_alg); cbc_err: @@ -458,6 +674,8 @@ ecb_err: void __exit fini(void) { + crypto_unregister_alg(&blk_xts_alg); + crypto_unregister_alg(&blk_lrw_alg); crypto_unregister_alg(&blk_ctr_alg); crypto_unregister_alg(&blk_cbc_alg); crypto_unregister_alg(&blk_ecb_alg); diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index 52d0ccf..455646e 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,6 +3,7 @@ # obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o +obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o sysv-$(CONFIG_SYSVIPC) := ipc32.o obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 3e27456..e3e7340 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -14,6 +14,7 @@ #include <asm/segment.h> #include <asm/irqflags.h> #include <linux/linkage.h> +#include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -27,8 +28,6 @@ .section .entry.text, "ax" -#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) - .macro IA32_ARG_FIXUP noebp=0 movl %edi,%r8d .if \noebp @@ -191,7 +190,7 @@ sysexit_from_sys_call: movl %ebx,%edx /* 3rd arg: 1st syscall arg */ movl %eax,%esi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys @@ -208,12 +207,13 @@ sysexit_from_sys_call: TRACE_IRQS_ON sti movl %eax,%esi /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit - movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ + call __audit_syscall_exit + movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF @@ -447,9 +447,6 @@ ia32_badsys: movq $-ENOSYS,%rax jmp ia32_sysret -quiet_ni_syscall: - movq $-ENOSYS,%rax - ret CFI_ENDPROC .macro PTREGSCALL label, func, arg @@ -494,357 +491,3 @@ ia32_ptregs_common: jmp ia32_sysret /* misbalances the return cache */ CFI_ENDPROC END(ia32_ptregs_common) - - .section .rodata,"a" - .align 8 -ia32_sys_call_table: - .quad sys_restart_syscall - .quad sys_exit - .quad stub32_fork - .quad sys_read - .quad sys_write - .quad compat_sys_open /* 5 */ - .quad sys_close - .quad sys32_waitpid - .quad sys_creat - .quad sys_link - .quad sys_unlink /* 10 */ - .quad stub32_execve - .quad sys_chdir - .quad compat_sys_time - .quad sys_mknod - .quad sys_chmod /* 15 */ - .quad sys_lchown16 - .quad quiet_ni_syscall /* old break syscall holder */ - .quad sys_stat - .quad sys32_lseek - .quad sys_getpid /* 20 */ - .quad compat_sys_mount /* mount */ - .quad sys_oldumount /* old_umount */ - .quad sys_setuid16 - .quad sys_getuid16 - .quad compat_sys_stime /* stime */ /* 25 */ - .quad compat_sys_ptrace /* ptrace */ - .quad sys_alarm - .quad sys_fstat /* (old)fstat */ - .quad sys_pause - .quad compat_sys_utime /* 30 */ - .quad quiet_ni_syscall /* old stty syscall holder */ - .quad quiet_ni_syscall /* old gtty syscall holder */ - .quad sys_access - .quad sys_nice - .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ - .quad sys_sync - .quad sys32_kill - .quad sys_rename - .quad sys_mkdir - .quad sys_rmdir /* 40 */ - .quad sys_dup - .quad sys_pipe - .quad compat_sys_times - .quad quiet_ni_syscall /* old prof syscall holder */ - .quad sys_brk /* 45 */ - .quad sys_setgid16 - .quad sys_getgid16 - .quad sys_signal - .quad sys_geteuid16 - .quad sys_getegid16 /* 50 */ - .quad sys_acct - .quad sys_umount /* new_umount */ - .quad quiet_ni_syscall /* old lock syscall holder */ - .quad compat_sys_ioctl - .quad compat_sys_fcntl64 /* 55 */ - .quad quiet_ni_syscall /* old mpx syscall holder */ - .quad sys_setpgid - .quad quiet_ni_syscall /* old ulimit syscall holder */ - .quad sys_olduname - .quad sys_umask /* 60 */ - .quad sys_chroot - .quad compat_sys_ustat - .quad sys_dup2 - .quad sys_getppid - .quad sys_getpgrp /* 65 */ - .quad sys_setsid - .quad sys32_sigaction - .quad sys_sgetmask - .quad sys_ssetmask - .quad sys_setreuid16 /* 70 */ - .quad sys_setregid16 - .quad sys32_sigsuspend - .quad compat_sys_sigpending - .quad sys_sethostname - .quad compat_sys_setrlimit /* 75 */ - .quad compat_sys_old_getrlimit /* old_getrlimit */ - .quad compat_sys_getrusage - .quad compat_sys_gettimeofday - .quad compat_sys_settimeofday - .quad sys_getgroups16 /* 80 */ - .quad sys_setgroups16 - .quad compat_sys_old_select - .quad sys_symlink - .quad sys_lstat - .quad sys_readlink /* 85 */ - .quad sys_uselib - .quad sys_swapon - .quad sys_reboot - .quad compat_sys_old_readdir - .quad sys32_mmap /* 90 */ - .quad sys_munmap - .quad sys_truncate - .quad sys_ftruncate - .quad sys_fchmod - .quad sys_fchown16 /* 95 */ - .quad sys_getpriority - .quad sys_setpriority - .quad quiet_ni_syscall /* old profil syscall holder */ - .quad compat_sys_statfs - .quad compat_sys_fstatfs /* 100 */ - .quad sys_ioperm - .quad compat_sys_socketcall - .quad sys_syslog - .quad compat_sys_setitimer - .quad compat_sys_getitimer /* 105 */ - .quad compat_sys_newstat - .quad compat_sys_newlstat - .quad compat_sys_newfstat - .quad sys_uname - .quad stub32_iopl /* 110 */ - .quad sys_vhangup - .quad quiet_ni_syscall /* old "idle" system call */ - .quad sys32_vm86_warning /* vm86old */ - .quad compat_sys_wait4 - .quad sys_swapoff /* 115 */ - .quad compat_sys_sysinfo - .quad sys32_ipc - .quad sys_fsync - .quad stub32_sigreturn - .quad stub32_clone /* 120 */ - .quad sys_setdomainname - .quad sys_newuname - .quad sys_modify_ldt - .quad compat_sys_adjtimex - .quad sys32_mprotect /* 125 */ - .quad compat_sys_sigprocmask - .quad quiet_ni_syscall /* create_module */ - .quad sys_init_module - .quad sys_delete_module - .quad quiet_ni_syscall /* 130 get_kernel_syms */ - .quad sys32_quotactl - .quad sys_getpgid - .quad sys_fchdir - .quad quiet_ni_syscall /* bdflush */ - .quad sys_sysfs /* 135 */ - .quad sys_personality - .quad quiet_ni_syscall /* for afs_syscall */ - .quad sys_setfsuid16 - .quad sys_setfsgid16 - .quad sys_llseek /* 140 */ - .quad compat_sys_getdents - .quad compat_sys_select - .quad sys_flock - .quad sys_msync - .quad compat_sys_readv /* 145 */ - .quad compat_sys_writev - .quad sys_getsid - .quad sys_fdatasync - .quad compat_sys_sysctl /* sysctl */ - .quad sys_mlock /* 150 */ - .quad sys_munlock - .quad sys_mlockall - .quad sys_munlockall - .quad sys_sched_setparam - .quad sys_sched_getparam /* 155 */ - .quad sys_sched_setscheduler - .quad sys_sched_getscheduler - .quad sys_sched_yield - .quad sys_sched_get_priority_max - .quad sys_sched_get_priority_min /* 160 */ - .quad sys32_sched_rr_get_interval - .quad compat_sys_nanosleep - .quad sys_mremap - .quad sys_setresuid16 - .quad sys_getresuid16 /* 165 */ - .quad sys32_vm86_warning /* vm86 */ - .quad quiet_ni_syscall /* query_module */ - .quad sys_poll - .quad quiet_ni_syscall /* old nfsservctl */ - .quad sys_setresgid16 /* 170 */ - .quad sys_getresgid16 - .quad sys_prctl - .quad stub32_rt_sigreturn - .quad sys32_rt_sigaction - .quad sys32_rt_sigprocmask /* 175 */ - .quad sys32_rt_sigpending - .quad compat_sys_rt_sigtimedwait - .quad sys32_rt_sigqueueinfo - .quad sys_rt_sigsuspend - .quad sys32_pread /* 180 */ - .quad sys32_pwrite - .quad sys_chown16 - .quad sys_getcwd - .quad sys_capget - .quad sys_capset - .quad stub32_sigaltstack - .quad sys32_sendfile - .quad quiet_ni_syscall /* streams1 */ - .quad quiet_ni_syscall /* streams2 */ - .quad stub32_vfork /* 190 */ - .quad compat_sys_getrlimit - .quad sys_mmap_pgoff - .quad sys32_truncate64 - .quad sys32_ftruncate64 - .quad sys32_stat64 /* 195 */ - .quad sys32_lstat64 - .quad sys32_fstat64 - .quad sys_lchown - .quad sys_getuid - .quad sys_getgid /* 200 */ - .quad sys_geteuid - .quad sys_getegid - .quad sys_setreuid - .quad sys_setregid - .quad sys_getgroups /* 205 */ - .quad sys_setgroups - .quad sys_fchown - .quad sys_setresuid - .quad sys_getresuid - .quad sys_setresgid /* 210 */ - .quad sys_getresgid - .quad sys_chown - .quad sys_setuid - .quad sys_setgid - .quad sys_setfsuid /* 215 */ - .quad sys_setfsgid - .quad sys_pivot_root - .quad sys_mincore - .quad sys_madvise - .quad compat_sys_getdents64 /* 220 getdents64 */ - .quad compat_sys_fcntl64 - .quad quiet_ni_syscall /* tux */ - .quad quiet_ni_syscall /* security */ - .quad sys_gettid - .quad sys32_readahead /* 225 */ - .quad sys_setxattr - .quad sys_lsetxattr - .quad sys_fsetxattr - .quad sys_getxattr - .quad sys_lgetxattr /* 230 */ - .quad sys_fgetxattr - .quad sys_listxattr - .quad sys_llistxattr - .quad sys_flistxattr - .quad sys_removexattr /* 235 */ - .quad sys_lremovexattr - .quad sys_fremovexattr - .quad sys_tkill - .quad sys_sendfile64 - .quad compat_sys_futex /* 240 */ - .quad compat_sys_sched_setaffinity - .quad compat_sys_sched_getaffinity - .quad sys_set_thread_area - .quad sys_get_thread_area - .quad compat_sys_io_setup /* 245 */ - .quad sys_io_destroy - .quad compat_sys_io_getevents - .quad compat_sys_io_submit - .quad sys_io_cancel - .quad sys32_fadvise64 /* 250 */ - .quad quiet_ni_syscall /* free_huge_pages */ - .quad sys_exit_group - .quad sys32_lookup_dcookie - .quad sys_epoll_create - .quad sys_epoll_ctl /* 255 */ - .quad sys_epoll_wait - .quad sys_remap_file_pages - .quad sys_set_tid_address - .quad compat_sys_timer_create - .quad compat_sys_timer_settime /* 260 */ - .quad compat_sys_timer_gettime - .quad sys_timer_getoverrun - .quad sys_timer_delete - .quad compat_sys_clock_settime - .quad compat_sys_clock_gettime /* 265 */ - .quad compat_sys_clock_getres - .quad compat_sys_clock_nanosleep - .quad compat_sys_statfs64 - .quad compat_sys_fstatfs64 - .quad sys_tgkill /* 270 */ - .quad compat_sys_utimes - .quad sys32_fadvise64_64 - .quad quiet_ni_syscall /* sys_vserver */ - .quad sys_mbind - .quad compat_sys_get_mempolicy /* 275 */ - .quad sys_set_mempolicy - .quad compat_sys_mq_open - .quad sys_mq_unlink - .quad compat_sys_mq_timedsend - .quad compat_sys_mq_timedreceive /* 280 */ - .quad compat_sys_mq_notify - .quad compat_sys_mq_getsetattr - .quad compat_sys_kexec_load /* reserved for kexec */ - .quad compat_sys_waitid - .quad quiet_ni_syscall /* 285: sys_altroot */ - .quad sys_add_key - .quad sys_request_key - .quad sys_keyctl - .quad sys_ioprio_set - .quad sys_ioprio_get /* 290 */ - .quad sys_inotify_init - .quad sys_inotify_add_watch - .quad sys_inotify_rm_watch - .quad sys_migrate_pages - .quad compat_sys_openat /* 295 */ - .quad sys_mkdirat - .quad sys_mknodat - .quad sys_fchownat - .quad compat_sys_futimesat - .quad sys32_fstatat /* 300 */ - .quad sys_unlinkat - .quad sys_renameat - .quad sys_linkat - .quad sys_symlinkat - .quad sys_readlinkat /* 305 */ - .quad sys_fchmodat - .quad sys_faccessat - .quad compat_sys_pselect6 - .quad compat_sys_ppoll - .quad sys_unshare /* 310 */ - .quad compat_sys_set_robust_list - .quad compat_sys_get_robust_list - .quad sys_splice - .quad sys32_sync_file_range - .quad sys_tee /* 315 */ - .quad compat_sys_vmsplice - .quad compat_sys_move_pages - .quad sys_getcpu - .quad sys_epoll_pwait - .quad compat_sys_utimensat /* 320 */ - .quad compat_sys_signalfd - .quad sys_timerfd_create - .quad sys_eventfd - .quad sys32_fallocate - .quad compat_sys_timerfd_settime /* 325 */ - .quad compat_sys_timerfd_gettime - .quad compat_sys_signalfd4 - .quad sys_eventfd2 - .quad sys_epoll_create1 - .quad sys_dup3 /* 330 */ - .quad sys_pipe2 - .quad sys_inotify_init1 - .quad compat_sys_preadv - .quad compat_sys_pwritev - .quad compat_sys_rt_tgsigqueueinfo /* 335 */ - .quad sys_perf_event_open - .quad compat_sys_recvmmsg - .quad sys_fanotify_init - .quad sys32_fanotify_mark - .quad sys_prlimit64 /* 340 */ - .quad sys_name_to_handle_at - .quad compat_sys_open_by_handle_at - .quad compat_sys_clock_adjtime - .quad sys_syncfs - .quad compat_sys_sendmmsg /* 345 */ - .quad sys_setns - .quad compat_sys_process_vm_readv - .quad compat_sys_process_vm_writev -ia32_syscall_end: diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c new file mode 100644 index 0000000..51ecd5b --- /dev/null +++ b/arch/x86/ia32/nosyscall.c @@ -0,0 +1,7 @@ +#include <linux/kernel.h> +#include <linux/errno.h> + +long compat_ni_syscall(void) +{ + return -ENOSYS; +} diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c new file mode 100644 index 0000000..4754ba0 --- /dev/null +++ b/arch/x86/ia32/syscall_ia32.c @@ -0,0 +1,25 @@ +/* System call table for ia32 emulation. */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> +#include <asm/asm-offsets.h> + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ; +#include <asm/syscalls_32.h> +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = compat, + +typedef void (*sys_call_ptr_t)(void); + +extern void compat_ni_syscall(void); + +const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall, +#include <asm/syscalls_32.h> +}; diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 6fa90a8..b57e6a4 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -19,7 +19,8 @@ header-y += processor-flags.h header-y += ptrace-abi.h header-y += sigcontext32.h header-y += ucontext.h -header-y += unistd_32.h -header-y += unistd_64.h header-y += vm86.h header-y += vsyscall.h + +genhdr-y += unistd_32.h +genhdr-y += unistd_64.h diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 8e41071..49ad773 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_AMD_NB_H #define _ASM_X86_AMD_NB_H +#include <linux/ioport.h> #include <linux/pci.h> struct amd_nb_bus_dev_range { @@ -13,6 +14,7 @@ extern const struct pci_device_id amd_nb_misc_ids[]; extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; extern bool early_is_amd_nb(u32 value); +extern struct resource *amd_get_mmconfig_range(struct resource *res); extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); extern int amd_numa_init(void); diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 24098aa..fa13f0e 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -82,7 +82,7 @@ static inline void atomic64_set(atomic64_t *v, long long i) * * Atomically reads the value of @v and returns it. */ -static inline long long atomic64_read(atomic64_t *v) +static inline long long atomic64_read(const atomic64_t *v) { long long r; asm volatile(ATOMIC64_ALTERNATIVE(read) diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index e020d88..2f90c51 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -64,6 +64,8 @@ struct setup_header { __u32 payload_offset; __u32 payload_length; __u64 setup_data; + __u64 pref_address; + __u32 init_size; } __attribute__((packed)); struct sys_desc_table { diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index f3444f7..17c5d4b 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -197,7 +197,10 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ #define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */ #define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 078ad0c..b903d5e 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump); extern void hw_breakpoint_restore(void); +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(int, debug_stack_usage); +static inline void debug_stack_usage_inc(void) +{ + __get_cpu_var(debug_stack_usage)++; +} +static inline void debug_stack_usage_dec(void) +{ + __get_cpu_var(debug_stack_usage)--; +} +int is_debug_stack(unsigned long addr); +void debug_stack_set_zero(void); +void debug_stack_reset(void); +#else /* !X86_64 */ +static inline int is_debug_stack(unsigned long addr) { return 0; } +static inline void debug_stack_set_zero(void) { } +static inline void debug_stack_reset(void) { } +static inline void debug_stack_usage_inc(void) { } +static inline void debug_stack_usage_dec(void) { } +#endif /* X86_64 */ + + #endif /* __KERNEL__ */ #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 41935fa..e95822d 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; +extern struct desc_ptr nmi_idt_descr; +extern gate_desc nmi_idt_table[]; struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; @@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit = (limit >> 16) & 0xf; } +#ifdef CONFIG_X86_64 +static inline void set_nmi_gate(int gate, void *addr) +{ + gate_desc s; + + pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); + write_idt_entry(nmi_idt_table, gate, &s); +} +#endif + static inline void _set_gate(int gate, unsigned type, void *addr, unsigned dpl, unsigned ist, unsigned seg) { diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 7093e4a..844f735 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -3,6 +3,8 @@ #ifdef CONFIG_X86_32 +#define EFI_LOADER_SIGNATURE "EL32" + extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_phys0(f) efi_call_phys(f) @@ -37,6 +39,8 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #else /* !CONFIG_X86_32 */ +#define EFI_LOADER_SIGNATURE "EL64" + extern u64 efi_call0(void *fp); extern u64 efi_call1(void *fp, u64 arg1); extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 460c74e..4da3c0c 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -117,7 +117,7 @@ enum fixed_addresses { #endif FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ -#ifdef CONFIG_X86_MRST +#ifdef CONFIG_X86_INTEL_MID FIX_LNW_VRTC, #endif __end_of_permanent_fixed_addresses, diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h index 976f6ec..b0d5716 100644 --- a/arch/x86/include/asm/ia32_unistd.h +++ b/arch/x86/include/asm/ia32_unistd.h @@ -2,17 +2,10 @@ #define _ASM_X86_IA32_UNISTD_H /* - * This file contains the system call numbers of the ia32 port, + * This file contains the system call numbers of the ia32 compat ABI, * this is for the kernel only. - * Only add syscalls here where some part of the kernel needs to know - * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK */ - -#define __NR_ia32_restart_syscall 0 -#define __NR_ia32_exit 1 -#define __NR_ia32_read 3 -#define __NR_ia32_write 4 -#define __NR_ia32_sigreturn 119 -#define __NR_ia32_rt_sigreturn 173 +#define __SYSCALL_ia32_NR(x) (x) +#include <asm/unistd_32_ia32.h> #endif /* _ASM_X86_IA32_UNISTD_H */ diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 8dbe353..adcc0ae 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -5,6 +5,8 @@ extern void __init early_ioremap_page_table_range_init(void); #endif +extern void __init zone_sizes_init(void); + extern unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 345c99c..dffc38e 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -5,6 +5,7 @@ extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_pass_through; +extern int iommu_group_mf; /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index a026507..ab4092e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -181,6 +181,7 @@ struct x86_emulate_ops { int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); + int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata); void (*halt)(struct x86_emulate_ctxt *ctxt); void (*wbinvd)(struct x86_emulate_ctxt *ctxt); int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); @@ -364,6 +365,7 @@ enum x86_intercept { #endif int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); +bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_FAILED -1 #define EMULATION_OK 0 #define EMULATION_RESTART 1 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b4973f4..52d6640 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -16,10 +16,12 @@ #include <linux/mmu_notifier.h> #include <linux/tracepoint.h> #include <linux/cpumask.h> +#include <linux/irq_work.h> #include <linux/kvm.h> #include <linux/kvm_para.h> #include <linux/kvm_types.h> +#include <linux/perf_event.h> #include <asm/pvclock-abi.h> #include <asm/desc.h> @@ -31,6 +33,8 @@ #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 +#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) + #define KVM_MMIO_SIZE 16 #define KVM_PIO_PAGE_OFFSET 1 @@ -228,7 +232,7 @@ struct kvm_mmu_page { * One bit set per slot which has memory * in this shadow page. */ - DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); + DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM); bool unsync; int root_count; /* Currently serving as active root */ unsigned int unsync_children; @@ -239,14 +243,9 @@ struct kvm_mmu_page { int clear_spte_count; #endif - struct rcu_head rcu; -}; + int write_flooding_count; -struct kvm_pv_mmu_op_buffer { - void *ptr; - unsigned len; - unsigned processed; - char buf[512] __aligned(sizeof(long)); + struct rcu_head rcu; }; struct kvm_pio_request { @@ -294,6 +293,37 @@ struct kvm_mmu { u64 pdptrs[4]; /* pae */ }; +enum pmc_type { + KVM_PMC_GP = 0, + KVM_PMC_FIXED, +}; + +struct kvm_pmc { + enum pmc_type type; + u8 idx; + u64 counter; + u64 eventsel; + struct perf_event *perf_event; + struct kvm_vcpu *vcpu; +}; + +struct kvm_pmu { + unsigned nr_arch_gp_counters; + unsigned nr_arch_fixed_counters; + unsigned available_event_types; + u64 fixed_ctr_ctrl; + u64 global_ctrl; + u64 global_status; + u64 global_ovf_ctrl; + u64 counter_bitmask[2]; + u64 global_ctrl_mask; + u8 version; + struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC]; + struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED]; + struct irq_work irq_work; + u64 reprogram_pmi; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -345,19 +375,10 @@ struct kvm_vcpu_arch { */ struct kvm_mmu *walk_mmu; - /* only needed in kvm_pv_mmu_op() path, but it's hot so - * put it here to avoid allocation */ - struct kvm_pv_mmu_op_buffer mmu_op_buffer; - struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_page_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; - gfn_t last_pt_write_gfn; - int last_pt_write_count; - u64 *last_pte_updated; - gfn_t last_pte_gfn; - struct fpu guest_fpu; u64 xcr0; @@ -436,6 +457,8 @@ struct kvm_vcpu_arch { unsigned access; gfn_t mmio_gfn; + struct kvm_pmu pmu; + /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; @@ -444,6 +467,9 @@ struct kvm_vcpu_arch { cpumask_var_t wbinvd_dirty_mask; + unsigned long last_retry_eip; + unsigned long last_retry_addr; + struct { bool halted; gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; @@ -459,7 +485,6 @@ struct kvm_arch { unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; unsigned int indirect_shadow_pages; - atomic_t invlpg_counter; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. @@ -660,6 +685,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, + struct kvm_memory_slot *slot); void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); @@ -668,8 +695,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); -int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, - gpa_t addr, unsigned long *ret); u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; @@ -692,6 +717,7 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) +#define EMULTYPE_RETRY (1 << 3) int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, void *insn, int insn_len); @@ -734,6 +760,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +bool kvm_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); @@ -754,13 +781,14 @@ int fx_init(struct kvm_vcpu *vcpu); void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes, - bool guest_initiated); + const u8 *new, int bytes); +int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, @@ -782,6 +810,11 @@ void kvm_disable_tdp(void); int complete_pio(struct kvm_vcpu *vcpu); bool kvm_check_iopl(struct kvm_vcpu *vcpu); +static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + return gpa; +} + static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); @@ -894,4 +927,17 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +int kvm_is_in_guest(void); + +void kvm_pmu_init(struct kvm_vcpu *vcpu); +void kvm_pmu_destroy(struct kvm_vcpu *vcpu); +void kvm_pmu_reset(struct kvm_vcpu *vcpu); +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); +void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); +void kvm_deliver_pmi(struct kvm_vcpu *vcpu); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6add827..6aefb14 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {} void mce_setup(struct mce *m); void mce_log(struct mce *m); -DECLARE_PER_CPU(struct sys_device, mce_sysdev); +extern struct device *mce_device[CONFIG_NR_CPUS]; /* * Maximum banks number. diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 93f7909..0a0a954 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -67,7 +67,7 @@ extern struct console early_mrst_console; extern void mrst_early_console_init(void); extern struct console early_hsu_console; -extern void hsu_early_console_init(void); +extern void hsu_early_console_init(const char *); extern void intel_scu_devices_create(void); extern void intel_scu_devices_destroy(void); diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index d498943..df75d07 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -112,19 +112,28 @@ static inline void x86_teardown_msi_irq(unsigned int irq) { x86_msi.teardown_msi_irq(irq); } +static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq) +{ + x86_msi.restore_msi_irqs(dev, irq); +} #define arch_setup_msi_irqs x86_setup_msi_irqs #define arch_teardown_msi_irqs x86_teardown_msi_irqs #define arch_teardown_msi_irq x86_teardown_msi_irq +#define arch_restore_msi_irqs x86_restore_msi_irqs /* implemented in arch/x86/kernel/apic/io_apic. */ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void native_teardown_msi_irq(unsigned int irq); +void native_restore_msi_irqs(struct pci_dev *dev, int irq); /* default to the implementation in drivers/lib/msi.c */ #define HAVE_DEFAULT_MSI_TEARDOWN_IRQS +#define HAVE_DEFAULT_MSI_RESTORE_IRQS void default_teardown_msi_irqs(struct pci_dev *dev); +void default_restore_msi_irqs(struct pci_dev *dev, int irq); #else #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #define default_teardown_msi_irqs NULL +#define default_restore_msi_irqs NULL #endif #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index e381978..b3a5317 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -44,8 +44,6 @@ enum pci_bf_sort_state { /* pci-i386.c */ -extern unsigned int pcibios_max_latency; - void pcibios_resource_survey(void); void pcibios_set_cache_line_size(void); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 529bf07e..7a11910 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -414,22 +414,6 @@ do { \ #define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) -#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define irqsafe_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) -#define irqsafe_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define irqsafe_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) -#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) -#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) - #ifndef CONFIG_M386 #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) @@ -445,9 +429,6 @@ do { \ #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #endif /* !CONFIG_M386 */ #ifdef CONFIG_X86_CMPXCHG64 @@ -464,7 +445,6 @@ do { \ #define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double -#define irqsafe_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #endif /* CONFIG_X86_CMPXCHG64 */ /* @@ -492,13 +472,6 @@ do { \ #define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - /* * Pretty complex macro to generate cmpxchg16 instruction. The instruction * is not supported on early AMD64 processors so we must be able to emulate @@ -521,7 +494,6 @@ do { \ #define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double #define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double -#define irqsafe_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double #endif diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h new file mode 100644 index 0000000..d3ef63f --- /dev/null +++ b/arch/x86/include/asm/serpent.h @@ -0,0 +1,63 @@ +#ifndef ASM_X86_SERPENT_H +#define ASM_X86_SERPENT_H + +#include <linux/crypto.h> +#include <crypto/serpent.h> + +#ifdef CONFIG_X86_32 + +#define SERPENT_PARALLEL_BLOCKS 4 + +asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + serpent_dec_blk_4way(ctx, dst, src); +} + +#else + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + serpent_dec_blk_8way(ctx, dst, src); +} + +#endif + +#endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 9756551..d0f19f9 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -47,7 +47,7 @@ extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern void setup_default_timer_irq(void); -#ifdef CONFIG_X86_MRST +#ifdef CONFIG_X86_INTEL_MID extern void x86_mrst_early_setup(void); #else static inline void x86_mrst_early_setup(void) { } diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 73b11bc..0434c40 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -225,5 +225,11 @@ extern int hard_smp_processor_id(void); #endif /* CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_DEBUG_NMI_SELFTEST +extern void nmi_selftest(void); +#else +#define nmi_selftest() do { } while (0) +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_SMP_H */ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index c4a348f..d962e56 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/err.h> +#include <asm/asm-offsets.h> /* For NR_syscalls */ extern const unsigned long sys_call_table[]; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 185b719..bc817cd 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,8 +40,8 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif - int sig_on_uaccess_error:1; - int uaccess_err:1; /* uaccess failed */ + unsigned int sig_on_uaccess_error:1; + unsigned int uaccess_err:1; /* uaccess failed */ }; #define INIT_THREAD_INFO(tsk) \ @@ -91,7 +91,6 @@ struct thread_info { #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ -#define TIF_FREEZE 23 /* is freezing for suspend */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ @@ -113,7 +112,6 @@ struct thread_info { #define _TIF_FORK (1 << TIF_FORK) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) -#define _TIF_FREEZE (1 << TIF_FREEZE) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 800f77c..b9676ae 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -172,7 +172,7 @@ static inline void arch_fix_phys_package_id(int num, u32 slot) } struct pci_bus; -void x86_pci_root_bus_res_quirks(struct pci_bus *b); +void x86_pci_root_bus_resources(int bus, struct list_head *resources); #ifdef CONFIG_SMP #define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 2a58ed3..21f77b8 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -1,13 +1,60 @@ +#ifndef _ASM_X86_UNISTD_H +#define _ASM_X86_UNISTD_H 1 + #ifdef __KERNEL__ # ifdef CONFIG_X86_32 -# include "unistd_32.h" + +# include <asm/unistd_32.h> +# define __ARCH_WANT_IPC_PARSE_VERSION +# define __ARCH_WANT_STAT64 +# define __ARCH_WANT_SYS_IPC +# define __ARCH_WANT_SYS_OLD_MMAP +# define __ARCH_WANT_SYS_OLD_SELECT + # else -# include "unistd_64.h" + +# include <asm/unistd_64.h> +# define __ARCH_WANT_COMPAT_SYS_TIME + # endif + +# define __ARCH_WANT_OLD_READDIR +# define __ARCH_WANT_OLD_STAT +# define __ARCH_WANT_SYS_ALARM +# define __ARCH_WANT_SYS_FADVISE64 +# define __ARCH_WANT_SYS_GETHOSTNAME +# define __ARCH_WANT_SYS_GETPGRP +# define __ARCH_WANT_SYS_LLSEEK +# define __ARCH_WANT_SYS_NICE +# define __ARCH_WANT_SYS_OLDUMOUNT +# define __ARCH_WANT_SYS_OLD_GETRLIMIT +# define __ARCH_WANT_SYS_OLD_UNAME +# define __ARCH_WANT_SYS_PAUSE +# define __ARCH_WANT_SYS_RT_SIGACTION +# define __ARCH_WANT_SYS_RT_SIGSUSPEND +# define __ARCH_WANT_SYS_SGETMASK +# define __ARCH_WANT_SYS_SIGNAL +# define __ARCH_WANT_SYS_SIGPENDING +# define __ARCH_WANT_SYS_SIGPROCMASK +# define __ARCH_WANT_SYS_SOCKETCALL +# define __ARCH_WANT_SYS_TIME +# define __ARCH_WANT_SYS_UTIME +# define __ARCH_WANT_SYS_WAITPID + +/* + * "Conditional" syscalls + * + * What we want is __attribute__((weak,alias("sys_ni_syscall"))), + * but it doesn't work on all toolchains, so we just do it by hand + */ +# define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") + #else # ifdef __i386__ -# include "unistd_32.h" +# include <asm/unistd_32.h> # else -# include "unistd_64.h" +# include <asm/unistd_64.h> # endif #endif + +#endif /* _ASM_X86_UNISTD_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h deleted file mode 100644 index 599c77d..0000000 --- a/arch/x86/include/asm/unistd_32.h +++ /dev/null @@ -1,401 +0,0 @@ -#ifndef _ASM_X86_UNISTD_32_H -#define _ASM_X86_UNISTD_32_H - -/* - * This file contains the system call numbers. - */ - -#define __NR_restart_syscall 0 -#define __NR_exit 1 -#define __NR_fork 2 -#define __NR_read 3 -#define __NR_write 4 -#define __NR_open 5 -#define __NR_close 6 -#define __NR_waitpid 7 -#define __NR_creat 8 -#define __NR_link 9 -#define __NR_unlink 10 -#define __NR_execve 11 -#define __NR_chdir 12 -#define __NR_time 13 -#define __NR_mknod 14 -#define __NR_chmod 15 -#define __NR_lchown 16 -#define __NR_break 17 -#define __NR_oldstat 18 -#define __NR_lseek 19 -#define __NR_getpid 20 -#define __NR_mount 21 -#define __NR_umount 22 -#define __NR_setuid 23 -#define __NR_getuid 24 -#define __NR_stime 25 -#define __NR_ptrace 26 -#define __NR_alarm 27 -#define __NR_oldfstat 28 -#define __NR_pause 29 -#define __NR_utime 30 -#define __NR_stty 31 -#define __NR_gtty 32 -#define __NR_access 33 -#define __NR_nice 34 -#define __NR_ftime 35 -#define __NR_sync 36 -#define __NR_kill 37 -#define __NR_rename 38 -#define __NR_mkdir 39 -#define __NR_rmdir 40 -#define __NR_dup 41 -#define __NR_pipe 42 -#define __NR_times 43 -#define __NR_prof 44 -#define __NR_brk 45 -#define __NR_setgid 46 -#define __NR_getgid 47 -#define __NR_signal 48 -#define __NR_geteuid 49 -#define __NR_getegid 50 -#define __NR_acct 51 -#define __NR_umount2 52 -#define __NR_lock 53 -#define __NR_ioctl 54 -#define __NR_fcntl 55 -#define __NR_mpx 56 -#define __NR_setpgid 57 -#define __NR_ulimit 58 -#define __NR_oldolduname 59 -#define __NR_umask 60 -#define __NR_chroot 61 -#define __NR_ustat 62 -#define __NR_dup2 63 -#define __NR_getppid 64 -#define __NR_getpgrp 65 -#define __NR_setsid 66 -#define __NR_sigaction 67 -#define __NR_sgetmask 68 -#define __NR_ssetmask 69 -#define __NR_setreuid 70 -#define __NR_setregid 71 -#define __NR_sigsuspend 72 -#define __NR_sigpending 73 -#define __NR_sethostname 74 -#define __NR_setrlimit 75 -#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */ -#define __NR_getrusage 77 -#define __NR_gettimeofday 78 -#define __NR_settimeofday 79 -#define __NR_getgroups 80 -#define __NR_setgroups 81 -#define __NR_select 82 -#define __NR_symlink 83 -#define __NR_oldlstat 84 -#define __NR_readlink 85 -#define __NR_uselib 86 -#define __NR_swapon 87 -#define __NR_reboot 88 -#define __NR_readdir 89 -#define __NR_mmap 90 -#define __NR_munmap 91 -#define __NR_truncate 92 -#define __NR_ftruncate 93 -#define __NR_fchmod 94 -#define __NR_fchown 95 -#define __NR_getpriority 96 -#define __NR_setpriority 97 -#define __NR_profil 98 -#define __NR_statfs 99 -#define __NR_fstatfs 100 -#define __NR_ioperm 101 -#define __NR_socketcall 102 -#define __NR_syslog 103 -#define __NR_setitimer 104 -#define __NR_getitimer 105 -#define __NR_stat 106 -#define __NR_lstat 107 -#define __NR_fstat 108 -#define __NR_olduname 109 -#define __NR_iopl 110 -#define __NR_vhangup 111 -#define __NR_idle 112 -#define __NR_vm86old 113 -#define __NR_wait4 114 -#define __NR_swapoff 115 -#define __NR_sysinfo 116 -#define __NR_ipc 117 -#define __NR_fsync 118 -#define __NR_sigreturn 119 -#define __NR_clone 120 -#define __NR_setdomainname 121 -#define __NR_uname 122 -#define __NR_modify_ldt 123 -#define __NR_adjtimex 124 -#define __NR_mprotect 125 -#define __NR_sigprocmask 126 -#define __NR_create_module 127 -#define __NR_init_module 128 -#define __NR_delete_module 129 -#define __NR_get_kernel_syms 130 -#define __NR_quotactl 131 -#define __NR_getpgid 132 -#define __NR_fchdir 133 -#define __NR_bdflush 134 -#define __NR_sysfs 135 -#define __NR_personality 136 -#define __NR_afs_syscall 137 /* Syscall for Andrew File System */ -#define __NR_setfsuid 138 -#define __NR_setfsgid 139 -#define __NR__llseek 140 -#define __NR_getdents 141 -#define __NR__newselect 142 -#define __NR_flock 143 -#define __NR_msync 144 -#define __NR_readv 145 -#define __NR_writev 146 -#define __NR_getsid 147 -#define __NR_fdatasync 148 -#define __NR__sysctl 149 -#define __NR_mlock 150 -#define __NR_munlock 151 -#define __NR_mlockall 152 -#define __NR_munlockall 153 -#define __NR_sched_setparam 154 -#define __NR_sched_getparam 155 -#define __NR_sched_setscheduler 156 -#define __NR_sched_getscheduler 157 -#define __NR_sched_yield 158 -#define __NR_sched_get_priority_max 159 -#define __NR_sched_get_priority_min 160 -#define __NR_sched_rr_get_interval 161 -#define __NR_nanosleep 162 -#define __NR_mremap 163 -#define __NR_setresuid 164 -#define __NR_getresuid 165 -#define __NR_vm86 166 -#define __NR_query_module 167 -#define __NR_poll 168 -#define __NR_nfsservctl 169 -#define __NR_setresgid 170 -#define __NR_getresgid 171 -#define __NR_prctl 172 -#define __NR_rt_sigreturn 173 -#define __NR_rt_sigaction 174 -#define __NR_rt_sigprocmask 175 -#define __NR_rt_sigpending 176 -#define __NR_rt_sigtimedwait 177 -#define __NR_rt_sigqueueinfo 178 -#define __NR_rt_sigsuspend 179 -#define __NR_pread64 180 -#define __NR_pwrite64 181 -#define __NR_chown 182 -#define __NR_getcwd 183 -#define __NR_capget 184 -#define __NR_capset 185 -#define __NR_sigaltstack 186 -#define __NR_sendfile 187 -#define __NR_getpmsg 188 /* some people actually want streams */ -#define __NR_putpmsg 189 /* some people actually want streams */ -#define __NR_vfork 190 -#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */ -#define __NR_mmap2 192 -#define __NR_truncate64 193 -#define __NR_ftruncate64 194 -#define __NR_stat64 195 -#define __NR_lstat64 196 -#define __NR_fstat64 197 -#define __NR_lchown32 198 -#define __NR_getuid32 199 -#define __NR_getgid32 200 -#define __NR_geteuid32 201 -#define __NR_getegid32 202 -#define __NR_setreuid32 203 -#define __NR_setregid32 204 -#define __NR_getgroups32 205 -#define __NR_setgroups32 206 -#define __NR_fchown32 207 -#define __NR_setresuid32 208 -#define __NR_getresuid32 209 -#define __NR_setresgid32 210 -#define __NR_getresgid32 211 -#define __NR_chown32 212 -#define __NR_setuid32 213 -#define __NR_setgid32 214 -#define __NR_setfsuid32 215 -#define __NR_setfsgid32 216 -#define __NR_pivot_root 217 -#define __NR_mincore 218 -#define __NR_madvise 219 -#define __NR_madvise1 219 /* delete when C lib stub is removed */ -#define __NR_getdents64 220 -#define __NR_fcntl64 221 -/* 223 is unused */ -#define __NR_gettid 224 -#define __NR_readahead 225 -#define __NR_setxattr 226 -#define __NR_lsetxattr 227 -#define __NR_fsetxattr 228 -#define __NR_getxattr 229 -#define __NR_lgetxattr 230 -#define __NR_fgetxattr 231 -#define __NR_listxattr 232 -#define __NR_llistxattr 233 -#define __NR_flistxattr 234 -#define __NR_removexattr 235 -#define __NR_lremovexattr 236 -#define __NR_fremovexattr 237 -#define __NR_tkill 238 -#define __NR_sendfile64 239 -#define __NR_futex 240 -#define __NR_sched_setaffinity 241 -#define __NR_sched_getaffinity 242 -#define __NR_set_thread_area 243 -#define __NR_get_thread_area 244 -#define __NR_io_setup 245 -#define __NR_io_destroy 246 -#define __NR_io_getevents 247 -#define __NR_io_submit 248 -#define __NR_io_cancel 249 -#define __NR_fadvise64 250 -/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */ -#define __NR_exit_group 252 -#define __NR_lookup_dcookie 253 -#define __NR_epoll_create 254 -#define __NR_epoll_ctl 255 -#define __NR_epoll_wait 256 -#define __NR_remap_file_pages 257 -#define __NR_set_tid_address 258 -#define __NR_timer_create 259 -#define __NR_timer_settime (__NR_timer_create+1) -#define __NR_timer_gettime (__NR_timer_create+2) -#define __NR_timer_getoverrun (__NR_timer_create+3) -#define __NR_timer_delete (__NR_timer_create+4) -#define __NR_clock_settime (__NR_timer_create+5) -#define __NR_clock_gettime (__NR_timer_create+6) -#define __NR_clock_getres (__NR_timer_create+7) -#define __NR_clock_nanosleep (__NR_timer_create+8) -#define __NR_statfs64 268 -#define __NR_fstatfs64 269 -#define __NR_tgkill 270 -#define __NR_utimes 271 -#define __NR_fadvise64_64 272 -#define __NR_vserver 273 -#define __NR_mbind 274 -#define __NR_get_mempolicy 275 -#define __NR_set_mempolicy 276 -#define __NR_mq_open 277 -#define __NR_mq_unlink (__NR_mq_open+1) -#define __NR_mq_timedsend (__NR_mq_open+2) -#define __NR_mq_timedreceive (__NR_mq_open+3) -#define __NR_mq_notify (__NR_mq_open+4) -#define __NR_mq_getsetattr (__NR_mq_open+5) -#define __NR_kexec_load 283 -#define __NR_waitid 284 -/* #define __NR_sys_setaltroot 285 */ -#define __NR_add_key 286 -#define __NR_request_key 287 -#define __NR_keyctl 288 -#define __NR_ioprio_set 289 -#define __NR_ioprio_get 290 -#define __NR_inotify_init 291 -#define __NR_inotify_add_watch 292 -#define __NR_inotify_rm_watch 293 -#define __NR_migrate_pages 294 -#define __NR_openat 295 -#define __NR_mkdirat 296 -#define __NR_mknodat 297 -#define __NR_fchownat 298 -#define __NR_futimesat 299 -#define __NR_fstatat64 300 -#define __NR_unlinkat 301 -#define __NR_renameat 302 -#define __NR_linkat 303 -#define __NR_symlinkat 304 -#define __NR_readlinkat 305 -#define __NR_fchmodat 306 -#define __NR_faccessat 307 -#define __NR_pselect6 308 -#define __NR_ppoll 309 -#define __NR_unshare 310 -#define __NR_set_robust_list 311 -#define __NR_get_robust_list 312 -#define __NR_splice 313 -#define __NR_sync_file_range 314 -#define __NR_tee 315 -#define __NR_vmsplice 316 -#define __NR_move_pages 317 -#define __NR_getcpu 318 -#define __NR_epoll_pwait 319 -#define __NR_utimensat 320 -#define __NR_signalfd 321 -#define __NR_timerfd_create 322 -#define __NR_eventfd 323 -#define __NR_fallocate 324 -#define __NR_timerfd_settime 325 -#define __NR_timerfd_gettime 326 -#define __NR_signalfd4 327 -#define __NR_eventfd2 328 -#define __NR_epoll_create1 329 -#define __NR_dup3 330 -#define __NR_pipe2 331 -#define __NR_inotify_init1 332 -#define __NR_preadv 333 -#define __NR_pwritev 334 -#define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_event_open 336 -#define __NR_recvmmsg 337 -#define __NR_fanotify_init 338 -#define __NR_fanotify_mark 339 -#define __NR_prlimit64 340 -#define __NR_name_to_handle_at 341 -#define __NR_open_by_handle_at 342 -#define __NR_clock_adjtime 343 -#define __NR_syncfs 344 -#define __NR_sendmmsg 345 -#define __NR_setns 346 -#define __NR_process_vm_readv 347 -#define __NR_process_vm_writev 348 - -#ifdef __KERNEL__ - -#define NR_syscalls 349 - -#define __ARCH_WANT_IPC_PARSE_VERSION -#define __ARCH_WANT_OLD_READDIR -#define __ARCH_WANT_OLD_STAT -#define __ARCH_WANT_STAT64 -#define __ARCH_WANT_SYS_ALARM -#define __ARCH_WANT_SYS_GETHOSTNAME -#define __ARCH_WANT_SYS_IPC -#define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK -#define __ARCH_WANT_SYS_SIGNAL -#define __ARCH_WANT_SYS_TIME -#define __ARCH_WANT_SYS_UTIME -#define __ARCH_WANT_SYS_WAITPID -#define __ARCH_WANT_SYS_SOCKETCALL -#define __ARCH_WANT_SYS_FADVISE64 -#define __ARCH_WANT_SYS_GETPGRP -#define __ARCH_WANT_SYS_LLSEEK -#define __ARCH_WANT_SYS_NICE -#define __ARCH_WANT_SYS_OLD_GETRLIMIT -#define __ARCH_WANT_SYS_OLD_UNAME -#define __ARCH_WANT_SYS_OLD_MMAP -#define __ARCH_WANT_SYS_OLD_SELECT -#define __ARCH_WANT_SYS_OLDUMOUNT -#define __ARCH_WANT_SYS_SIGPENDING -#define __ARCH_WANT_SYS_SIGPROCMASK -#define __ARCH_WANT_SYS_RT_SIGACTION -#define __ARCH_WANT_SYS_RT_SIGSUSPEND - -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#ifndef cond_syscall -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") -#endif - -#endif /* __KERNEL__ */ -#endif /* _ASM_X86_UNISTD_32_H */ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h deleted file mode 100644 index 0431f19..0000000 --- a/arch/x86/include/asm/unistd_64.h +++ /dev/null @@ -1,732 +0,0 @@ -#ifndef _ASM_X86_UNISTD_64_H -#define _ASM_X86_UNISTD_64_H - -#ifndef __SYSCALL -#define __SYSCALL(a, b) -#endif - -/* - * This file contains the system call numbers. - * - * Note: holes are not allowed. - */ - -/* at least 8 syscall per cacheline */ -#define __NR_read 0 -__SYSCALL(__NR_read, sys_read) -#define __NR_write 1 -__SYSCALL(__NR_write, sys_write) -#define __NR_open 2 -__SYSCALL(__NR_open, sys_open) -#define __NR_close 3 -__SYSCALL(__NR_close, sys_close) -#define __NR_stat 4 -__SYSCALL(__NR_stat, sys_newstat) -#define __NR_fstat 5 -__SYSCALL(__NR_fstat, sys_newfstat) -#define __NR_lstat 6 -__SYSCALL(__NR_lstat, sys_newlstat) -#define __NR_poll 7 -__SYSCALL(__NR_poll, sys_poll) - -#define __NR_lseek 8 -__SYSCALL(__NR_lseek, sys_lseek) -#define __NR_mmap 9 -__SYSCALL(__NR_mmap, sys_mmap) -#define __NR_mprotect 10 -__SYSCALL(__NR_mprotect, sys_mprotect) -#define __NR_munmap 11 -__SYSCALL(__NR_munmap, sys_munmap) -#define __NR_brk 12 -__SYSCALL(__NR_brk, sys_brk) -#define __NR_rt_sigaction 13 -__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction) -#define __NR_rt_sigprocmask 14 -__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask) -#define __NR_rt_sigreturn 15 -__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn) - -#define __NR_ioctl 16 -__SYSCALL(__NR_ioctl, sys_ioctl) -#define __NR_pread64 17 -__SYSCALL(__NR_pread64, sys_pread64) -#define __NR_pwrite64 18 -__SYSCALL(__NR_pwrite64, sys_pwrite64) -#define __NR_readv 19 -__SYSCALL(__NR_readv, sys_readv) -#define __NR_writev 20 -__SYSCALL(__NR_writev, sys_writev) -#define __NR_access 21 -__SYSCALL(__NR_access, sys_access) -#define __NR_pipe 22 -__SYSCALL(__NR_pipe, sys_pipe) -#define __NR_select 23 -__SYSCALL(__NR_select, sys_select) - -#define __NR_sched_yield 24 -__SYSCALL(__NR_sched_yield, sys_sched_yield) -#define __NR_mremap 25 -__SYSCALL(__NR_mremap, sys_mremap) -#define __NR_msync 26 -__SYSCALL(__NR_msync, sys_msync) -#define __NR_mincore 27 -__SYSCALL(__NR_mincore, sys_mincore) -#define __NR_madvise 28 -__SYSCALL(__NR_madvise, sys_madvise) -#define __NR_shmget 29 -__SYSCALL(__NR_shmget, sys_shmget) -#define __NR_shmat 30 -__SYSCALL(__NR_shmat, sys_shmat) -#define __NR_shmctl 31 -__SYSCALL(__NR_shmctl, sys_shmctl) - -#define __NR_dup 32 -__SYSCALL(__NR_dup, sys_dup) -#define __NR_dup2 33 -__SYSCALL(__NR_dup2, sys_dup2) -#define __NR_pause 34 -__SYSCALL(__NR_pause, sys_pause) -#define __NR_nanosleep 35 -__SYSCALL(__NR_nanosleep, sys_nanosleep) -#define __NR_getitimer 36 -__SYSCALL(__NR_getitimer, sys_getitimer) -#define __NR_alarm 37 -__SYSCALL(__NR_alarm, sys_alarm) -#define __NR_setitimer 38 -__SYSCALL(__NR_setitimer, sys_setitimer) -#define __NR_getpid 39 -__SYSCALL(__NR_getpid, sys_getpid) - -#define __NR_sendfile 40 -__SYSCALL(__NR_sendfile, sys_sendfile64) -#define __NR_socket 41 -__SYSCALL(__NR_socket, sys_socket) -#define __NR_connect 42 -__SYSCALL(__NR_connect, sys_connect) -#define __NR_accept 43 -__SYSCALL(__NR_accept, sys_accept) -#define __NR_sendto 44 -__SYSCALL(__NR_sendto, sys_sendto) -#define __NR_recvfrom 45 -__SYSCALL(__NR_recvfrom, sys_recvfrom) -#define __NR_sendmsg 46 -__SYSCALL(__NR_sendmsg, sys_sendmsg) -#define __NR_recvmsg 47 -__SYSCALL(__NR_recvmsg, sys_recvmsg) - -#define __NR_shutdown 48 -__SYSCALL(__NR_shutdown, sys_shutdown) -#define __NR_bind 49 -__SYSCALL(__NR_bind, sys_bind) -#define __NR_listen 50 -__SYSCALL(__NR_listen, sys_listen) -#define __NR_getsockname 51 -__SYSCALL(__NR_getsockname, sys_getsockname) -#define __NR_getpeername 52 -__SYSCALL(__NR_getpeername, sys_getpeername) -#define __NR_socketpair 53 -__SYSCALL(__NR_socketpair, sys_socketpair) -#define __NR_setsockopt 54 -__SYSCALL(__NR_setsockopt, sys_setsockopt) -#define __NR_getsockopt 55 -__SYSCALL(__NR_getsockopt, sys_getsockopt) - -#define __NR_clone 56 -__SYSCALL(__NR_clone, stub_clone) -#define __NR_fork 57 -__SYSCALL(__NR_fork, stub_fork) -#define __NR_vfork 58 -__SYSCALL(__NR_vfork, stub_vfork) -#define __NR_execve 59 -__SYSCALL(__NR_execve, stub_execve) -#define __NR_exit 60 -__SYSCALL(__NR_exit, sys_exit) -#define __NR_wait4 61 -__SYSCALL(__NR_wait4, sys_wait4) -#define __NR_kill 62 -__SYSCALL(__NR_kill, sys_kill) -#define __NR_uname 63 -__SYSCALL(__NR_uname, sys_newuname) - -#define __NR_semget 64 -__SYSCALL(__NR_semget, sys_semget) -#define __NR_semop 65 -__SYSCALL(__NR_semop, sys_semop) -#define __NR_semctl 66 -__SYSCALL(__NR_semctl, sys_semctl) -#define __NR_shmdt 67 -__SYSCALL(__NR_shmdt, sys_shmdt) -#define __NR_msgget 68 -__SYSCALL(__NR_msgget, sys_msgget) -#define __NR_msgsnd 69 -__SYSCALL(__NR_msgsnd, sys_msgsnd) -#define __NR_msgrcv 70 -__SYSCALL(__NR_msgrcv, sys_msgrcv) -#define __NR_msgctl 71 -__SYSCALL(__NR_msgctl, sys_msgctl) - -#define __NR_fcntl 72 -__SYSCALL(__NR_fcntl, sys_fcntl) -#define __NR_flock 73 -__SYSCALL(__NR_flock, sys_flock) -#define __NR_fsync 74 -__SYSCALL(__NR_fsync, sys_fsync) -#define __NR_fdatasync 75 -__SYSCALL(__NR_fdatasync, sys_fdatasync) -#define __NR_truncate 76 -__SYSCALL(__NR_truncate, sys_truncate) -#define __NR_ftruncate 77 -__SYSCALL(__NR_ftruncate, sys_ftruncate) -#define __NR_getdents 78 -__SYSCALL(__NR_getdents, sys_getdents) -#define __NR_getcwd 79 -__SYSCALL(__NR_getcwd, sys_getcwd) - -#define __NR_chdir 80 -__SYSCALL(__NR_chdir, sys_chdir) -#define __NR_fchdir 81 -__SYSCALL(__NR_fchdir, sys_fchdir) -#define __NR_rename 82 -__SYSCALL(__NR_rename, sys_rename) -#define __NR_mkdir 83 -__SYSCALL(__NR_mkdir, sys_mkdir) -#define __NR_rmdir 84 -__SYSCALL(__NR_rmdir, sys_rmdir) -#define __NR_creat 85 -__SYSCALL(__NR_creat, sys_creat) -#define __NR_link 86 -__SYSCALL(__NR_link, sys_link) -#define __NR_unlink 87 -__SYSCALL(__NR_unlink, sys_unlink) - -#define __NR_symlink 88 -__SYSCALL(__NR_symlink, sys_symlink) -#define __NR_readlink 89 -__SYSCALL(__NR_readlink, sys_readlink) -#define __NR_chmod 90 -__SYSCALL(__NR_chmod, sys_chmod) -#define __NR_fchmod 91 -__SYSCALL(__NR_fchmod, sys_fchmod) -#define __NR_chown 92 -__SYSCALL(__NR_chown, sys_chown) -#define __NR_fchown 93 -__SYSCALL(__NR_fchown, sys_fchown) -#define __NR_lchown 94 -__SYSCALL(__NR_lchown, sys_lchown) -#define __NR_umask 95 -__SYSCALL(__NR_umask, sys_umask) - -#define __NR_gettimeofday 96 -__SYSCALL(__NR_gettimeofday, sys_gettimeofday) -#define __NR_getrlimit 97 -__SYSCALL(__NR_getrlimit, sys_getrlimit) -#define __NR_getrusage 98 -__SYSCALL(__NR_getrusage, sys_getrusage) -#define __NR_sysinfo 99 -__SYSCALL(__NR_sysinfo, sys_sysinfo) -#define __NR_times 100 -__SYSCALL(__NR_times, sys_times) -#define __NR_ptrace 101 -__SYSCALL(__NR_ptrace, sys_ptrace) -#define __NR_getuid 102 -__SYSCALL(__NR_getuid, sys_getuid) -#define __NR_syslog 103 -__SYSCALL(__NR_syslog, sys_syslog) - -/* at the very end the stuff that never runs during the benchmarks */ -#define __NR_getgid 104 -__SYSCALL(__NR_getgid, sys_getgid) -#define __NR_setuid 105 -__SYSCALL(__NR_setuid, sys_setuid) -#define __NR_setgid 106 -__SYSCALL(__NR_setgid, sys_setgid) -#define __NR_geteuid 107 -__SYSCALL(__NR_geteuid, sys_geteuid) -#define __NR_getegid 108 -__SYSCALL(__NR_getegid, sys_getegid) -#define __NR_setpgid 109 -__SYSCALL(__NR_setpgid, sys_setpgid) -#define __NR_getppid 110 -__SYSCALL(__NR_getppid, sys_getppid) -#define __NR_getpgrp 111 -__SYSCALL(__NR_getpgrp, sys_getpgrp) - -#define __NR_setsid 112 -__SYSCALL(__NR_setsid, sys_setsid) -#define __NR_setreuid 113 -__SYSCALL(__NR_setreuid, sys_setreuid) -#define __NR_setregid 114 -__SYSCALL(__NR_setregid, sys_setregid) -#define __NR_getgroups 115 -__SYSCALL(__NR_getgroups, sys_getgroups) -#define __NR_setgroups 116 -__SYSCALL(__NR_setgroups, sys_setgroups) -#define __NR_setresuid 117 -__SYSCALL(__NR_setresuid, sys_setresuid) -#define __NR_getresuid 118 -__SYSCALL(__NR_getresuid, sys_getresuid) -#define __NR_setresgid 119 -__SYSCALL(__NR_setresgid, sys_setresgid) - -#define __NR_getresgid 120 -__SYSCALL(__NR_getresgid, sys_getresgid) -#define __NR_getpgid 121 -__SYSCALL(__NR_getpgid, sys_getpgid) -#define __NR_setfsuid 122 -__SYSCALL(__NR_setfsuid, sys_setfsuid) -#define __NR_setfsgid 123 -__SYSCALL(__NR_setfsgid, sys_setfsgid) -#define __NR_getsid 124 -__SYSCALL(__NR_getsid, sys_getsid) -#define __NR_capget 125 -__SYSCALL(__NR_capget, sys_capget) -#define __NR_capset 126 -__SYSCALL(__NR_capset, sys_capset) - -#define __NR_rt_sigpending 127 -__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending) -#define __NR_rt_sigtimedwait 128 -__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait) -#define __NR_rt_sigqueueinfo 129 -__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo) -#define __NR_rt_sigsuspend 130 -__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend) -#define __NR_sigaltstack 131 -__SYSCALL(__NR_sigaltstack, stub_sigaltstack) -#define __NR_utime 132 -__SYSCALL(__NR_utime, sys_utime) -#define __NR_mknod 133 -__SYSCALL(__NR_mknod, sys_mknod) - -/* Only needed for a.out */ -#define __NR_uselib 134 -__SYSCALL(__NR_uselib, sys_ni_syscall) -#define __NR_personality 135 -__SYSCALL(__NR_personality, sys_personality) - -#define __NR_ustat 136 -__SYSCALL(__NR_ustat, sys_ustat) -#define __NR_statfs 137 -__SYSCALL(__NR_statfs, sys_statfs) -#define __NR_fstatfs 138 -__SYSCALL(__NR_fstatfs, sys_fstatfs) -#define __NR_sysfs 139 -__SYSCALL(__NR_sysfs, sys_sysfs) - -#define __NR_getpriority 140 -__SYSCALL(__NR_getpriority, sys_getpriority) -#define __NR_setpriority 141 -__SYSCALL(__NR_setpriority, sys_setpriority) -#define __NR_sched_setparam 142 -__SYSCALL(__NR_sched_setparam, sys_sched_setparam) -#define __NR_sched_getparam 143 -__SYSCALL(__NR_sched_getparam, sys_sched_getparam) -#define __NR_sched_setscheduler 144 -__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler) -#define __NR_sched_getscheduler 145 -__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler) -#define __NR_sched_get_priority_max 146 -__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) -#define __NR_sched_get_priority_min 147 -__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) -#define __NR_sched_rr_get_interval 148 -__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval) - -#define __NR_mlock 149 -__SYSCALL(__NR_mlock, sys_mlock) -#define __NR_munlock 150 -__SYSCALL(__NR_munlock, sys_munlock) -#define __NR_mlockall 151 -__SYSCALL(__NR_mlockall, sys_mlockall) -#define __NR_munlockall 152 -__SYSCALL(__NR_munlockall, sys_munlockall) - -#define __NR_vhangup 153 -__SYSCALL(__NR_vhangup, sys_vhangup) - -#define __NR_modify_ldt 154 -__SYSCALL(__NR_modify_ldt, sys_modify_ldt) - -#define __NR_pivot_root 155 -__SYSCALL(__NR_pivot_root, sys_pivot_root) - -#define __NR__sysctl 156 -__SYSCALL(__NR__sysctl, sys_sysctl) - -#define __NR_prctl 157 -__SYSCALL(__NR_prctl, sys_prctl) -#define __NR_arch_prctl 158 -__SYSCALL(__NR_arch_prctl, sys_arch_prctl) - -#define __NR_adjtimex 159 -__SYSCALL(__NR_adjtimex, sys_adjtimex) - -#define __NR_setrlimit 160 -__SYSCALL(__NR_setrlimit, sys_setrlimit) - -#define __NR_chroot 161 -__SYSCALL(__NR_chroot, sys_chroot) - -#define __NR_sync 162 -__SYSCALL(__NR_sync, sys_sync) - -#define __NR_acct 163 -__SYSCALL(__NR_acct, sys_acct) - -#define __NR_settimeofday 164 -__SYSCALL(__NR_settimeofday, sys_settimeofday) - -#define __NR_mount 165 -__SYSCALL(__NR_mount, sys_mount) -#define __NR_umount2 166 -__SYSCALL(__NR_umount2, sys_umount) - -#define __NR_swapon 167 -__SYSCALL(__NR_swapon, sys_swapon) -#define __NR_swapoff 168 -__SYSCALL(__NR_swapoff, sys_swapoff) - -#define __NR_reboot 169 -__SYSCALL(__NR_reboot, sys_reboot) - -#define __NR_sethostname 170 -__SYSCALL(__NR_sethostname, sys_sethostname) -#define __NR_setdomainname 171 -__SYSCALL(__NR_setdomainname, sys_setdomainname) - -#define __NR_iopl 172 -__SYSCALL(__NR_iopl, stub_iopl) -#define __NR_ioperm 173 -__SYSCALL(__NR_ioperm, sys_ioperm) - -#define __NR_create_module 174 -__SYSCALL(__NR_create_module, sys_ni_syscall) -#define __NR_init_module 175 -__SYSCALL(__NR_init_module, sys_init_module) -#define __NR_delete_module 176 -__SYSCALL(__NR_delete_module, sys_delete_module) -#define __NR_get_kernel_syms 177 -__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall) -#define __NR_query_module 178 -__SYSCALL(__NR_query_module, sys_ni_syscall) - -#define __NR_quotactl 179 -__SYSCALL(__NR_quotactl, sys_quotactl) - -#define __NR_nfsservctl 180 -__SYSCALL(__NR_nfsservctl, sys_ni_syscall) - -/* reserved for LiS/STREAMS */ -#define __NR_getpmsg 181 -__SYSCALL(__NR_getpmsg, sys_ni_syscall) -#define __NR_putpmsg 182 -__SYSCALL(__NR_putpmsg, sys_ni_syscall) - -/* reserved for AFS */ -#define __NR_afs_syscall 183 -__SYSCALL(__NR_afs_syscall, sys_ni_syscall) - -/* reserved for tux */ -#define __NR_tuxcall 184 -__SYSCALL(__NR_tuxcall, sys_ni_syscall) - -#define __NR_security 185 -__SYSCALL(__NR_security, sys_ni_syscall) - -#define __NR_gettid 186 -__SYSCALL(__NR_gettid, sys_gettid) - -#define __NR_readahead 187 -__SYSCALL(__NR_readahead, sys_readahead) -#define __NR_setxattr 188 -__SYSCALL(__NR_setxattr, sys_setxattr) -#define __NR_lsetxattr 189 -__SYSCALL(__NR_lsetxattr, sys_lsetxattr) -#define __NR_fsetxattr 190 -__SYSCALL(__NR_fsetxattr, sys_fsetxattr) -#define __NR_getxattr 191 -__SYSCALL(__NR_getxattr, sys_getxattr) -#define __NR_lgetxattr 192 -__SYSCALL(__NR_lgetxattr, sys_lgetxattr) -#define __NR_fgetxattr 193 -__SYSCALL(__NR_fgetxattr, sys_fgetxattr) -#define __NR_listxattr 194 -__SYSCALL(__NR_listxattr, sys_listxattr) -#define __NR_llistxattr 195 -__SYSCALL(__NR_llistxattr, sys_llistxattr) -#define __NR_flistxattr 196 -__SYSCALL(__NR_flistxattr, sys_flistxattr) -#define __NR_removexattr 197 -__SYSCALL(__NR_removexattr, sys_removexattr) -#define __NR_lremovexattr 198 -__SYSCALL(__NR_lremovexattr, sys_lremovexattr) -#define __NR_fremovexattr 199 -__SYSCALL(__NR_fremovexattr, sys_fremovexattr) -#define __NR_tkill 200 -__SYSCALL(__NR_tkill, sys_tkill) -#define __NR_time 201 -__SYSCALL(__NR_time, sys_time) -#define __NR_futex 202 -__SYSCALL(__NR_futex, sys_futex) -#define __NR_sched_setaffinity 203 -__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity) -#define __NR_sched_getaffinity 204 -__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity) -#define __NR_set_thread_area 205 -__SYSCALL(__NR_set_thread_area, sys_ni_syscall) /* use arch_prctl */ -#define __NR_io_setup 206 -__SYSCALL(__NR_io_setup, sys_io_setup) -#define __NR_io_destroy 207 -__SYSCALL(__NR_io_destroy, sys_io_destroy) -#define __NR_io_getevents 208 -__SYSCALL(__NR_io_getevents, sys_io_getevents) -#define __NR_io_submit 209 -__SYSCALL(__NR_io_submit, sys_io_submit) -#define __NR_io_cancel 210 -__SYSCALL(__NR_io_cancel, sys_io_cancel) -#define __NR_get_thread_area 211 -__SYSCALL(__NR_get_thread_area, sys_ni_syscall) /* use arch_prctl */ -#define __NR_lookup_dcookie 212 -__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie) -#define __NR_epoll_create 213 -__SYSCALL(__NR_epoll_create, sys_epoll_create) -#define __NR_epoll_ctl_old 214 -__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall) -#define __NR_epoll_wait_old 215 -__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall) -#define __NR_remap_file_pages 216 -__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) -#define __NR_getdents64 217 -__SYSCALL(__NR_getdents64, sys_getdents64) -#define __NR_set_tid_address 218 -__SYSCALL(__NR_set_tid_address, sys_set_tid_address) -#define __NR_restart_syscall 219 -__SYSCALL(__NR_restart_syscall, sys_restart_syscall) -#define __NR_semtimedop 220 -__SYSCALL(__NR_semtimedop, sys_semtimedop) -#define __NR_fadvise64 221 -__SYSCALL(__NR_fadvise64, sys_fadvise64) -#define __NR_timer_create 222 -__SYSCALL(__NR_timer_create, sys_timer_create) -#define __NR_timer_settime 223 -__SYSCALL(__NR_timer_settime, sys_timer_settime) -#define __NR_timer_gettime 224 -__SYSCALL(__NR_timer_gettime, sys_timer_gettime) -#define __NR_timer_getoverrun 225 -__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) -#define __NR_timer_delete 226 -__SYSCALL(__NR_timer_delete, sys_timer_delete) -#define __NR_clock_settime 227 -__SYSCALL(__NR_clock_settime, sys_clock_settime) -#define __NR_clock_gettime 228 -__SYSCALL(__NR_clock_gettime, sys_clock_gettime) -#define __NR_clock_getres 229 -__SYSCALL(__NR_clock_getres, sys_clock_getres) -#define __NR_clock_nanosleep 230 -__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep) -#define __NR_exit_group 231 -__SYSCALL(__NR_exit_group, sys_exit_group) -#define __NR_epoll_wait 232 -__SYSCALL(__NR_epoll_wait, sys_epoll_wait) -#define __NR_epoll_ctl 233 -__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl) -#define __NR_tgkill 234 -__SYSCALL(__NR_tgkill, sys_tgkill) -#define __NR_utimes 235 -__SYSCALL(__NR_utimes, sys_utimes) -#define __NR_vserver 236 -__SYSCALL(__NR_vserver, sys_ni_syscall) -#define __NR_mbind 237 -__SYSCALL(__NR_mbind, sys_mbind) -#define __NR_set_mempolicy 238 -__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) -#define __NR_get_mempolicy 239 -__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) -#define __NR_mq_open 240 -__SYSCALL(__NR_mq_open, sys_mq_open) -#define __NR_mq_unlink 241 -__SYSCALL(__NR_mq_unlink, sys_mq_unlink) -#define __NR_mq_timedsend 242 -__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend) -#define __NR_mq_timedreceive 243 -__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive) -#define __NR_mq_notify 244 -__SYSCALL(__NR_mq_notify, sys_mq_notify) -#define __NR_mq_getsetattr 245 -__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) -#define __NR_kexec_load 246 -__SYSCALL(__NR_kexec_load, sys_kexec_load) -#define __NR_waitid 247 -__SYSCALL(__NR_waitid, sys_waitid) -#define __NR_add_key 248 -__SYSCALL(__NR_add_key, sys_add_key) -#define __NR_request_key 249 -__SYSCALL(__NR_request_key, sys_request_key) -#define __NR_keyctl 250 -__SYSCALL(__NR_keyctl, sys_keyctl) -#define __NR_ioprio_set 251 -__SYSCALL(__NR_ioprio_set, sys_ioprio_set) -#define __NR_ioprio_get 252 -__SYSCALL(__NR_ioprio_get, sys_ioprio_get) -#define __NR_inotify_init 253 -__SYSCALL(__NR_inotify_init, sys_inotify_init) -#define __NR_inotify_add_watch 254 -__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) -#define __NR_inotify_rm_watch 255 -__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) -#define __NR_migrate_pages 256 -__SYSCALL(__NR_migrate_pages, sys_migrate_pages) -#define __NR_openat 257 -__SYSCALL(__NR_openat, sys_openat) -#define __NR_mkdirat 258 -__SYSCALL(__NR_mkdirat, sys_mkdirat) -#define __NR_mknodat 259 -__SYSCALL(__NR_mknodat, sys_mknodat) -#define __NR_fchownat 260 -__SYSCALL(__NR_fchownat, sys_fchownat) -#define __NR_futimesat 261 -__SYSCALL(__NR_futimesat, sys_futimesat) -#define __NR_newfstatat 262 -__SYSCALL(__NR_newfstatat, sys_newfstatat) -#define __NR_unlinkat 263 -__SYSCALL(__NR_unlinkat, sys_unlinkat) -#define __NR_renameat 264 -__SYSCALL(__NR_renameat, sys_renameat) -#define __NR_linkat 265 -__SYSCALL(__NR_linkat, sys_linkat) -#define __NR_symlinkat 266 -__SYSCALL(__NR_symlinkat, sys_symlinkat) -#define __NR_readlinkat 267 -__SYSCALL(__NR_readlinkat, sys_readlinkat) -#define __NR_fchmodat 268 -__SYSCALL(__NR_fchmodat, sys_fchmodat) -#define __NR_faccessat 269 -__SYSCALL(__NR_faccessat, sys_faccessat) -#define __NR_pselect6 270 -__SYSCALL(__NR_pselect6, sys_pselect6) -#define __NR_ppoll 271 -__SYSCALL(__NR_ppoll, sys_ppoll) -#define __NR_unshare 272 -__SYSCALL(__NR_unshare, sys_unshare) -#define __NR_set_robust_list 273 -__SYSCALL(__NR_set_robust_list, sys_set_robust_list) -#define __NR_get_robust_list 274 -__SYSCALL(__NR_get_robust_list, sys_get_robust_list) -#define __NR_splice 275 -__SYSCALL(__NR_splice, sys_splice) -#define __NR_tee 276 -__SYSCALL(__NR_tee, sys_tee) -#define __NR_sync_file_range 277 -__SYSCALL(__NR_sync_file_range, sys_sync_file_range) -#define __NR_vmsplice 278 -__SYSCALL(__NR_vmsplice, sys_vmsplice) -#define __NR_move_pages 279 -__SYSCALL(__NR_move_pages, sys_move_pages) -#define __NR_utimensat 280 -__SYSCALL(__NR_utimensat, sys_utimensat) -#define __NR_epoll_pwait 281 -__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait) -#define __NR_signalfd 282 -__SYSCALL(__NR_signalfd, sys_signalfd) -#define __NR_timerfd_create 283 -__SYSCALL(__NR_timerfd_create, sys_timerfd_create) -#define __NR_eventfd 284 -__SYSCALL(__NR_eventfd, sys_eventfd) -#define __NR_fallocate 285 -__SYSCALL(__NR_fallocate, sys_fallocate) -#define __NR_timerfd_settime 286 -__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) -#define __NR_timerfd_gettime 287 -__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) -#define __NR_accept4 288 -__SYSCALL(__NR_accept4, sys_accept4) -#define __NR_signalfd4 289 -__SYSCALL(__NR_signalfd4, sys_signalfd4) -#define __NR_eventfd2 290 -__SYSCALL(__NR_eventfd2, sys_eventfd2) -#define __NR_epoll_create1 291 -__SYSCALL(__NR_epoll_create1, sys_epoll_create1) -#define __NR_dup3 292 -__SYSCALL(__NR_dup3, sys_dup3) -#define __NR_pipe2 293 -__SYSCALL(__NR_pipe2, sys_pipe2) -#define __NR_inotify_init1 294 -__SYSCALL(__NR_inotify_init1, sys_inotify_init1) -#define __NR_preadv 295 -__SYSCALL(__NR_preadv, sys_preadv) -#define __NR_pwritev 296 -__SYSCALL(__NR_pwritev, sys_pwritev) -#define __NR_rt_tgsigqueueinfo 297 -__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) -#define __NR_perf_event_open 298 -__SYSCALL(__NR_perf_event_open, sys_perf_event_open) -#define __NR_recvmmsg 299 -__SYSCALL(__NR_recvmmsg, sys_recvmmsg) -#define __NR_fanotify_init 300 -__SYSCALL(__NR_fanotify_init, sys_fanotify_init) -#define __NR_fanotify_mark 301 -__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) -#define __NR_prlimit64 302 -__SYSCALL(__NR_prlimit64, sys_prlimit64) -#define __NR_name_to_handle_at 303 -__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) -#define __NR_open_by_handle_at 304 -__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) -#define __NR_clock_adjtime 305 -__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) -#define __NR_syncfs 306 -__SYSCALL(__NR_syncfs, sys_syncfs) -#define __NR_sendmmsg 307 -__SYSCALL(__NR_sendmmsg, sys_sendmmsg) -#define __NR_setns 308 -__SYSCALL(__NR_setns, sys_setns) -#define __NR_getcpu 309 -__SYSCALL(__NR_getcpu, sys_getcpu) -#define __NR_process_vm_readv 310 -__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv) -#define __NR_process_vm_writev 311 -__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev) - -#ifndef __NO_STUBS -#define __ARCH_WANT_OLD_READDIR -#define __ARCH_WANT_OLD_STAT -#define __ARCH_WANT_SYS_ALARM -#define __ARCH_WANT_SYS_GETHOSTNAME -#define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK -#define __ARCH_WANT_SYS_SIGNAL -#define __ARCH_WANT_SYS_UTIME -#define __ARCH_WANT_SYS_WAITPID -#define __ARCH_WANT_SYS_SOCKETCALL -#define __ARCH_WANT_SYS_FADVISE64 -#define __ARCH_WANT_SYS_GETPGRP -#define __ARCH_WANT_SYS_LLSEEK -#define __ARCH_WANT_SYS_NICE -#define __ARCH_WANT_SYS_OLD_GETRLIMIT -#define __ARCH_WANT_SYS_OLD_UNAME -#define __ARCH_WANT_SYS_OLDUMOUNT -#define __ARCH_WANT_SYS_SIGPENDING -#define __ARCH_WANT_SYS_SIGPROCMASK -#define __ARCH_WANT_SYS_RT_SIGACTION -#define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_TIME -#define __ARCH_WANT_COMPAT_SYS_TIME -#endif /* __NO_STUBS */ - -#ifdef __KERNEL__ - -#ifndef COMPILE_OFFSETS -#include <asm/asm-offsets.h> -#define NR_syscalls (__NR_syscall_max + 1) -#endif - -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_UNISTD_64_H */ diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 8e862aa..becf47b 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -65,7 +65,7 @@ * UV2: Bit 19 selects between * (0): 10 microsecond timebase and * (1): 80 microseconds - * we're using 655us, similar to UV1: 65 units of 10us + * we're using 560us, similar to UV1: 65 units of 10us */ #define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL) #define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL) @@ -167,6 +167,7 @@ #define FLUSH_RETRY_TIMEOUT 2 #define FLUSH_GIVEUP 3 #define FLUSH_COMPLETE 4 +#define FLUSH_RETRY_BUSYBUG 5 /* * tuning the action when the numalink network is extremely delayed @@ -235,10 +236,10 @@ struct bau_msg_payload { /* - * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) + * UV1 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) * see table 4.2.3.0.1 in broacast_assist spec. */ -struct bau_msg_header { +struct uv1_bau_msg_header { unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ unsigned int base_dest_nasid:15; /* nasid of the first bit */ @@ -318,19 +319,87 @@ struct bau_msg_header { }; /* + * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) + * see figure 9-2 of harp_sys.pdf + */ +struct uv2_bau_msg_header { + unsigned int base_dest_nasid:15; /* nasid of the first bit */ + /* bits 14:0 */ /* in uvhub map */ + unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */ + /* bits 19:15 */ + unsigned int rsvd_1:1; /* must be zero */ + /* bit 20 */ + /* Address bits 59:21 */ + /* bits 25:2 of address (44:21) are payload */ + /* these next 24 bits become bytes 12-14 of msg */ + /* bits 28:21 land in byte 12 */ + unsigned int replied_to:1; /* sent as 0 by the source to + byte 12 */ + /* bit 21 */ + unsigned int msg_type:3; /* software type of the + message */ + /* bits 24:22 */ + unsigned int canceled:1; /* message canceled, resource + is to be freed*/ + /* bit 25 */ + unsigned int payload_1:3; /* not currently used */ + /* bits 28:26 */ + + /* bits 36:29 land in byte 13 */ + unsigned int payload_2a:3; /* not currently used */ + unsigned int payload_2b:5; /* not currently used */ + /* bits 36:29 */ + + /* bits 44:37 land in byte 14 */ + unsigned int payload_3:8; /* not currently used */ + /* bits 44:37 */ + + unsigned int rsvd_2:7; /* reserved */ + /* bits 51:45 */ + unsigned int swack_flag:1; /* software acknowledge flag */ + /* bit 52 */ + unsigned int rsvd_3a:3; /* must be zero */ + unsigned int rsvd_3b:8; /* must be zero */ + unsigned int rsvd_3c:8; /* must be zero */ + unsigned int rsvd_3d:3; /* must be zero */ + /* bits 74:53 */ + unsigned int fairness:3; /* usually zero */ + /* bits 77:75 */ + + unsigned int sequence:16; /* message sequence number */ + /* bits 93:78 Suppl_A */ + unsigned int chaining:1; /* next descriptor is part of + this activation*/ + /* bit 94 */ + unsigned int multilevel:1; /* multi-level multicast + format */ + /* bit 95 */ + unsigned int rsvd_4:24; /* ordered / source node / + source subnode / aging + must be zero */ + /* bits 119:96 */ + unsigned int command:8; /* message type */ + /* bits 127:120 */ +}; + +/* * The activation descriptor: * The format of the message to send, plus all accompanying control * Should be 64 bytes */ struct bau_desc { - struct pnmask distribution; + struct pnmask distribution; /* * message template, consisting of header and payload: */ - struct bau_msg_header header; - struct bau_msg_payload payload; + union bau_msg_header { + struct uv1_bau_msg_header uv1_hdr; + struct uv2_bau_msg_header uv2_hdr; + } header; + + struct bau_msg_payload payload; }; -/* +/* UV1: * -payload-- ---------header------ * bytes 0-11 bits 41-56 bits 58-81 * A B (2) C (3) @@ -340,6 +409,16 @@ struct bau_desc { * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) * ------------payload queue----------- */ +/* UV2: + * -payload-- ---------header------ + * bytes 0-11 bits 70-78 bits 21-44 + * A B (2) C (3) + * + * A/B/C are moved to: + * A C B + * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) + * ------------payload queue----------- + */ /* * The payload queue on the destination side is an array of these. @@ -385,7 +464,6 @@ struct bau_pq_entry { struct msg_desc { struct bau_pq_entry *msg; int msg_slot; - int swack_slot; struct bau_pq_entry *queue_first; struct bau_pq_entry *queue_last; }; @@ -405,6 +483,7 @@ struct ptc_stats { requests */ unsigned long s_stimeout; /* source side timeouts */ unsigned long s_dtimeout; /* destination side timeouts */ + unsigned long s_strongnacks; /* number of strong nack's */ unsigned long s_time; /* time spent in sending side */ unsigned long s_retriesok; /* successful retries */ unsigned long s_ntargcpu; /* total number of cpu's @@ -439,6 +518,9 @@ struct ptc_stats { unsigned long s_retry_messages; /* retry broadcasts */ unsigned long s_bau_reenabled; /* for bau enable/disable */ unsigned long s_bau_disabled; /* for bau enable/disable */ + unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ + unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ + unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ @@ -511,9 +593,12 @@ struct bau_control { short osnode; short uvhub_cpu; short uvhub; + short uvhub_version; short cpus_in_socket; short cpus_in_uvhub; short partition_base_pnode; + short using_desc; /* an index, like uvhub_cpu */ + unsigned int inuse_map; unsigned short message_number; unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; @@ -531,6 +616,7 @@ struct bau_control { int cong_response_us; int cong_reps; int cong_period; + unsigned long clocks_per_100_usec; cycles_t period_time; long period_requests; struct hub_and_pnode *thp; @@ -591,6 +677,11 @@ static inline void write_mmr_sw_ack(unsigned long mr) uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); } +static inline void write_gmmr_sw_ack(int pnode, unsigned long mr) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); +} + static inline unsigned long read_mmr_sw_ack(void) { return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1ac860a..517d476 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -179,6 +179,7 @@ struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); void (*teardown_msi_irq)(unsigned int irq); void (*teardown_msi_irqs)(struct pci_dev *dev); + void (*restore_msi_irqs)(struct pci_dev *dev, int irq); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8baca3c..5369059 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o +obj-y += syscall_$(BITS).o +obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o @@ -80,6 +81,7 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o +obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 013c181..be16854 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -119,6 +119,37 @@ bool __init early_is_amd_nb(u32 device) return false; } +struct resource *amd_get_mmconfig_range(struct resource *res) +{ + u32 address; + u64 base, msr; + unsigned segn_busn_bits; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return NULL; + + /* assume all cpus from fam10h have mmconfig */ + if (boot_cpu_data.x86 < 0x10) + return NULL; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, msr); + + /* mmconfig is not enabled */ + if (!(msr & FAM10H_MMIO_CONF_ENABLE)) + return NULL; + + base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT); + + segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + res->flags = IORESOURCE_MEM; + res->start = base; + res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1; + return res; +} + int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 9d59bba..79b05b8 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -769,7 +769,12 @@ void __init uv_system_init(void) for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) uv_possible_blades += hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); - printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); + + /* uv_num_possible_blades() is really the hub count */ + printk(KERN_INFO "UV: Found %d blades, %d hubs\n", + is_uv1_hub() ? uv_num_possible_blades() : + (uv_num_possible_blades() + 1) / 2, + uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kzalloc(bytes, GFP_KERNEL); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index a46bd38..f76623c 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -383,21 +383,21 @@ static int ignore_sys_suspend; static int ignore_normal_resume; static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; -static int debug __read_mostly; -static int smp __read_mostly; +static bool debug __read_mostly; +static bool smp __read_mostly; static int apm_disabled = -1; #ifdef CONFIG_SMP -static int power_off; +static bool power_off; #else -static int power_off = 1; +static bool power_off = 1; #endif -static int realmode_power_off; +static bool realmode_power_off; #ifdef CONFIG_APM_ALLOW_INTS -static int allow_ints = 1; +static bool allow_ints = 1; #else -static int allow_ints; +static bool allow_ints; #endif -static int broken_psr; +static bool broken_psr; static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 4f13faf..68de2dc 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -67,4 +67,6 @@ void common(void) { OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_pref_address, boot_params, hdr.pref_address); + OFFSET(BP_code32_start, boot_params, hdr.code32_start); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 395a10e..85d98ab 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -3,6 +3,11 @@ #include <linux/lguest.h> #include "../../../drivers/lguest/lg.h" +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include <asm/syscalls_32.h> +}; + /* workaround for a warning with -Wmissing-prototypes */ void foo(void); @@ -76,4 +81,7 @@ void foo(void) OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); #endif + BLANK(); + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(NR_syscalls, sizeof(syscalls)); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72a119..834e897 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,11 +1,12 @@ #include <asm/ia32.h> -#define __NO_STUBS 1 -#undef __SYSCALL -#undef _ASM_X86_UNISTD_64_H -#define __SYSCALL(nr, sym) [nr] = 1, -static char syscalls[] = { -#include <asm/unistd.h> +#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +static char syscalls_64[] = { +#include <asm/syscalls_64.h> +}; +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls_ia32[] = { +#include <asm/syscalls_32.h> }; int main(void) @@ -72,7 +73,11 @@ int main(void) OFFSET(TSS_ist, tss_struct, x86_tss.ist); BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); + DEFINE(NR_syscalls, sizeof(syscalls_64)); + + DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 850f296..d43cad7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1021,6 +1021,8 @@ __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) nmi_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); @@ -1085,6 +1087,26 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); + +int is_debug_stack(unsigned long addr) +{ + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); +} + +void debug_stack_set_zero(void) +{ + load_idt((const struct desc_ptr *)&nmi_idt_descr); +} + +void debug_stack_reset(void) +{ + load_idt((const struct desc_ptr *)&idt_descr); +} + #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; @@ -1212,6 +1234,8 @@ void __cpuinit cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index a3b0811..6b45e5e 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) #include <linux/kobject.h> #include <linux/sysfs.h> - -extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ +#include <linux/cpu.h> /* pointer to kobject for cpuX/cache */ static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); @@ -1073,9 +1072,9 @@ err_out: static DECLARE_BITMAP(cache_dev_map, NR_CPUS); /* Add/Remove cache interface for CPU device */ -static int __cpuinit cache_add_dev(struct sys_device * sys_dev) +static int __cpuinit cache_add_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i, j; struct _index_kobject *this_object; struct _cpuid4_info *this_leaf; @@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), &ktype_percpu_entry, - &sys_dev->kobj, "%s", "cache"); + &dev->kobj, "%s", "cache"); if (retval < 0) { cpuid4_cache_sysfs_exit(cpu); return retval; @@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) return 0; } -static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) +static void __cpuinit cache_remove_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i; if (per_cpu(ici_cpuid4_info, cpu) == NULL) @@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - cache_add_dev(sys_dev); + cache_add_dev(dev); break; case CPU_DEAD: case CPU_DEAD_FROZEN: - cache_remove_dev(sys_dev); + cache_remove_dev(dev); break; } return NOTIFY_OK; @@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void) for_each_online_cpu(i) { int err; - struct sys_device *sys_dev = get_cpu_sysdev(i); + struct device *dev = get_cpu_device(i); - err = cache_add_dev(sys_dev); + err = cache_add_dev(dev); if (err) return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fefcc69..ed44c8a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,4 +1,4 @@ -#include <linux/sysdev.h> +#include <linux/device.h> #include <asm/mce.h> enum severity_level { @@ -17,7 +17,7 @@ enum severity_level { struct mce_bank { u64 ctl; /* subevents to enable */ unsigned char init; /* initialise bank? */ - struct sysdev_attribute attr; /* sysdev attribute */ + struct device_attribute attr; /* device attribute */ char attrname[ATTR_LEN]; /* attribute name */ }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index cbe82b5..5a11ae2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -19,7 +19,7 @@ #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/string.h> -#include <linux/sysdev.h> +#include <linux/device.h> #include <linux/syscore_ops.h> #include <linux/delay.h> #include <linux/ctype.h> @@ -1818,7 +1818,7 @@ static struct syscore_ops mce_syscore_ops = { }; /* - * mce_sysdev: Sysfs support + * mce_device: Sysfs support */ static void mce_cpu_restart(void *data) @@ -1854,27 +1854,28 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct sysdev_class mce_sysdev_class = { +static struct bus_type mce_subsys = { .name = "machinecheck", + .dev_name = "machinecheck", }; -DEFINE_PER_CPU(struct sys_device, mce_sysdev); +struct device *mce_device[CONFIG_NR_CPUS]; __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) { return container_of(attr, struct mce_bank, attr); } -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t show_bank(struct device *s, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_bank(struct device *s, struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1889,14 +1890,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t -show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +show_trigger(struct device *s, struct device_attribute *attr, char *buf) { strcpy(buf, mce_helper); strcat(buf, "\n"); return strlen(mce_helper) + 1; } -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, const char *buf, size_t siz) { char *p; @@ -1911,8 +1912,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return strlen(mce_helper) + !!p; } -static ssize_t set_ignore_ce(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_ignore_ce(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1935,8 +1936,8 @@ static ssize_t set_ignore_ce(struct sys_device *s, return size; } -static ssize_t set_cmci_disabled(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_cmci_disabled(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1958,108 +1959,117 @@ static ssize_t set_cmci_disabled(struct sys_device *s, return size; } -static ssize_t store_int_with_restart(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t store_int_with_restart(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { - ssize_t ret = sysdev_store_int(s, attr, buf, size); + ssize_t ret = device_store_int(s, attr, buf, size); mce_restart(); return ret; } -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); +static DEVICE_INT_ATTR(tolerant, 0644, tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); +static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); -static struct sysdev_ext_attribute attr_check_interval = { - _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, - store_int_with_restart), +static struct dev_ext_attribute dev_attr_check_interval = { + __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), &check_interval }; -static struct sysdev_ext_attribute attr_ignore_ce = { - _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), +static struct dev_ext_attribute dev_attr_ignore_ce = { + __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), &mce_ignore_ce }; -static struct sysdev_ext_attribute attr_cmci_disabled = { - _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), +static struct dev_ext_attribute dev_attr_cmci_disabled = { + __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), &mce_cmci_disabled }; -static struct sysdev_attribute *mce_sysdev_attrs[] = { - &attr_tolerant.attr, - &attr_check_interval.attr, - &attr_trigger, - &attr_monarch_timeout.attr, - &attr_dont_log_ce.attr, - &attr_ignore_ce.attr, - &attr_cmci_disabled.attr, +static struct device_attribute *mce_device_attrs[] = { + &dev_attr_tolerant.attr, + &dev_attr_check_interval.attr, + &dev_attr_trigger, + &dev_attr_monarch_timeout.attr, + &dev_attr_dont_log_ce.attr, + &dev_attr_ignore_ce.attr, + &dev_attr_cmci_disabled.attr, NULL }; -static cpumask_var_t mce_sysdev_initialized; +static cpumask_var_t mce_device_initialized; -/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_sysdev_create(unsigned int cpu) +static void mce_device_release(struct device *dev) { - struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); + kfree(dev); +} + +/* Per cpu device init. All of the cpus still share the same ctrl bank: */ +static __cpuinit int mce_device_create(unsigned int cpu) +{ + struct device *dev; int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&sysdev->kobj, 0, sizeof(struct kobject)); - sysdev->id = cpu; - sysdev->cls = &mce_sysdev_class; + dev = kzalloc(sizeof *dev, GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->id = cpu; + dev->bus = &mce_subsys; + dev->release = &mce_device_release; - err = sysdev_register(sysdev); + err = device_register(dev); if (err) return err; - for (i = 0; mce_sysdev_attrs[i]; i++) { - err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) { + err = device_create_file(dev, mce_device_attrs[i]); if (err) goto error; } for (j = 0; j < banks; j++) { - err = sysdev_create_file(sysdev, &mce_banks[j].attr); + err = device_create_file(dev, &mce_banks[j].attr); if (err) goto error2; } - cpumask_set_cpu(cpu, mce_sysdev_initialized); + cpumask_set_cpu(cpu, mce_device_initialized); + mce_device[cpu] = dev; return 0; error2: while (--j >= 0) - sysdev_remove_file(sysdev, &mce_banks[j].attr); + device_remove_file(dev, &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); + device_remove_file(dev, mce_device_attrs[i]); - sysdev_unregister(sysdev); + device_unregister(dev); return err; } -static __cpuinit void mce_sysdev_remove(unsigned int cpu) +static __cpuinit void mce_device_remove(unsigned int cpu) { - struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); + struct device *dev = mce_device[cpu]; int i; - if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) + if (!cpumask_test_cpu(cpu, mce_device_initialized)) return; - for (i = 0; mce_sysdev_attrs[i]; i++) - sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) + device_remove_file(dev, mce_device_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(sysdev, &mce_banks[i].attr); + device_remove_file(dev, &mce_banks[i].attr); - sysdev_unregister(sysdev); - cpumask_clear_cpu(cpu, mce_sysdev_initialized); + device_unregister(dev); + cpumask_clear_cpu(cpu, mce_device_initialized); + mce_device[cpu] = NULL; } /* Make sure there are no machine checks on offlined CPUs. */ @@ -2109,7 +2119,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - mce_sysdev_create(cpu); + mce_device_create(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; @@ -2117,7 +2127,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); - mce_sysdev_remove(cpu); + mce_device_remove(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -2151,7 +2161,7 @@ static __init void mce_init_banks(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; - struct sysdev_attribute *a = &b->attr; + struct device_attribute *a = &b->attr; sysfs_attr_init(&a->attr); a->attr.name = b->attrname; @@ -2171,16 +2181,16 @@ static __init int mcheck_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); mce_init_banks(); - err = sysdev_class_register(&mce_sysdev_class); + err = subsys_system_register(&mce_subsys, NULL); if (err) return err; for_each_online_cpu(i) { - err = mce_sysdev_create(i); + err = mce_device_create(i); if (err) return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 1d76872..786e76a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -17,7 +17,6 @@ #include <linux/notifier.h> #include <linux/kobject.h> #include <linux/percpu.h> -#include <linux/sysdev.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/sysfs.h> @@ -524,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; + struct device *dev = mce_device[cpu]; char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -544,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj, - b->kobj, name); + err = sysfs_create_link(&dev->kobj, b->kobj, name); if (err) goto out; @@ -566,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj); + b->kobj = kobject_create_and_add(name, &dev->kobj); if (!b->kobj) goto out_free; @@ -586,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj, - b->kobj, name); + dev = mce_device[i]; + if (dev) + err = sysfs_create_link(&dev->kobj,b->kobj, name); if (err) goto out; @@ -650,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu, static void threshold_remove_bank(unsigned int cpu, int bank) { struct threshold_bank *b; + struct device *dev; char name[32]; int i = 0; @@ -664,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name); + sysfs_remove_link(&mce_device[cpu]->kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -676,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name); + dev = mce_device[i]; + if (dev) + sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 39c6089..67bb17a 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -19,7 +19,6 @@ #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/export.h> -#include <linux/sysdev.h> #include <linux/types.h> #include <linux/init.h> #include <linux/smp.h> @@ -69,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0); static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, \ - therm_throt_sysdev_show_##_name, \ +#define define_therm_throt_device_one_ro(_name) \ + static DEVICE_ATTR(_name, 0444, \ + therm_throt_device_show_##_name, \ NULL) \ -#define define_therm_throt_sysdev_show_func(event, name) \ +#define define_therm_throt_device_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##event##_##name( \ - struct sys_device *dev, \ - struct sysdev_attribute *attr, \ +static ssize_t therm_throt_device_show_##event##_##name( \ + struct device *dev, \ + struct device_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ @@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \ return ret; \ } -define_therm_throt_sysdev_show_func(core_throttle, count); -define_therm_throt_sysdev_one_ro(core_throttle_count); +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(core_power_limit, count); -define_therm_throt_sysdev_one_ro(core_power_limit_count); +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); -define_therm_throt_sysdev_show_func(package_throttle, count); -define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); -define_therm_throt_sysdev_show_func(package_power_limit, count); -define_therm_throt_sysdev_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_core_throttle_count.attr, + &dev_attr_core_throttle_count.attr, NULL }; @@ -223,36 +222,36 @@ static int thresh_event_valid(int event) #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, +static __cpuinit int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) { int err; struct cpuinfo_x86 *c = &cpu_data(cpu); - err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); + err = sysfs_create_group(&dev->kobj, &thermal_attr_group); if (err) return err; if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_core_power_limit_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_throttle_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_power_limit_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); } return err; } -static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) +static __cpuinit void thermal_throttle_remove_dev(struct device *dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); + sysfs_remove_group(&dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; int err = 0; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&therm_cpu_lock); - err = thermal_throttle_add_dev(sys_dev, cpu); + err = thermal_throttle_add_dev(dev, cpu); mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; @@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, case CPU_DEAD: case CPU_DEAD_FROZEN: mutex_lock(&therm_cpu_lock); - thermal_throttle_remove_dev(sys_dev); + thermal_throttle_remove_dev(dev); mutex_unlock(&therm_cpu_lock); break; } @@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void) #endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); + err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); WARN_ON(err); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 212a6a4..a524353 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = .notifier_call = cpuid_class_cpu_callback, }; -static char *cpuid_devnode(struct device *dev, mode_t *mode) +static char *cpuid_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 8071e2f..62d61e9 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -19,6 +19,7 @@ #include <linux/acpi.h> #include <linux/firmware-map.h> #include <linux/memblock.h> +#include <linux/sort.h> #include <asm/e820.h> #include <asm/proto.h> @@ -227,22 +228,38 @@ void __init e820_print_map(char *who) * ____________________33__ * ______________________4_ */ +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; + +static int __init cpcompare(const void *a, const void *b) +{ + struct change_member * const *app = a, * const *bpp = b; + const struct change_member *ap = *app, *bp = *bpp; + + /* + * Inputs are pointers to two elements of change_point[]. If their + * addresses are unequal, their difference dominates. If the addresses + * are equal, then consider one that represents the end of its region + * to be greater than one that does not. + */ + if (ap->addr != bp->addr) + return ap->addr > bp->addr ? 1 : -1; + + return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); +} int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map) { - struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ - }; static struct change_member change_point_list[2*E820_X_MAX] __initdata; static struct change_member *change_point[2*E820_X_MAX] __initdata; static struct e820entry *overlap_list[E820_X_MAX] __initdata; static struct e820entry new_bios[E820_X_MAX] __initdata; - struct change_member *change_tmp; unsigned long current_type, last_type; unsigned long long last_addr; - int chgidx, still_changing; + int chgidx; int overlap_entries; int new_bios_entry; int old_nr, new_nr, chg_nr; @@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, chg_nr = chgidx; /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i = 1; i < chg_nr; i++) { - unsigned long long curaddr, lastaddr; - unsigned long long curpbaddr, lastpbaddr; - - curaddr = change_point[i]->addr; - lastaddr = change_point[i - 1]->addr; - curpbaddr = change_point[i]->pbios->addr; - lastpbaddr = change_point[i - 1]->pbios->addr; - - /* - * swap entries, when: - * - * curaddr > lastaddr or - * curaddr == lastaddr and curaddr == curpbaddr and - * lastaddr != lastpbaddr - */ - if (curaddr < lastaddr || - (curaddr == lastaddr && curaddr == curpbaddr && - lastaddr != lastpbaddr)) { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing = 1; - } - } - } + sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); /* create a new bios memory map, removing overlaps */ overlap_entries = 0; /* number of entries in the overlap table */ @@ -714,7 +703,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) } #endif -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_ACPI /** * Mark ACPI NVS memory region, so that we can save/restore it during * hibernation and the subsequent resume. @@ -727,7 +716,7 @@ static int __init e820_mark_nvs_memory(void) struct e820entry *ei = &e820.map[i]; if (ei->type == E820_NVS) - suspend_nvs_register(ei->addr, ei->size); + acpi_nvs_register(ei->addr, ei->size); } return 0; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index cd28a35..9b9f18b 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -240,14 +240,14 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif -#ifdef CONFIG_EARLY_PRINTK_MRST +#ifdef CONFIG_EARLY_PRINTK_INTEL_MID if (!strncmp(buf, "mrst", 4)) { mrst_early_console_init(); early_console_register(&early_mrst_console, keep); } if (!strncmp(buf, "hsu", 3)) { - hsu_early_console_init(); + hsu_early_console_init(buf + 3); early_console_register(&early_hsu_console, keep); } #endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 22d0e21..79d97e6 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -42,6 +42,7 @@ */ #include <linux/linkage.h> +#include <linux/err.h> #include <asm/thread_info.h> #include <asm/irqflags.h> #include <asm/errno.h> @@ -81,8 +82,6 @@ * enough to patch inline, increasing performance. */ -#define nr_syscalls ((syscall_table_size)/4) - #ifdef CONFIG_PREEMPT #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else @@ -423,7 +422,7 @@ sysenter_past_esp: testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz sysenter_audit sysenter_do_call: - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) @@ -455,7 +454,7 @@ sysenter_audit: movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -466,11 +465,10 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx @@ -504,7 +502,7 @@ ENTRY(system_call) # system call tracing in operation / emulation testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) @@ -654,7 +652,7 @@ syscall_trace_entry: movl %esp, %eax call syscall_trace_enter /* What it returned is what we'll actually use. */ - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jnae syscall_call jmp syscall_exit END(syscall_trace_entry) @@ -694,29 +692,28 @@ END(syscall_badsys) * System calls that need a pt_regs pointer. */ #define PTREGSCALL0(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL1(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%edx; \ movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL2(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%ecx; \ movl (PT_ECX+4)(%esp),%edx; \ movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL3(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ CFI_STARTPROC; \ leal 4(%esp),%eax; \ pushl_cfi %eax; \ @@ -741,8 +738,7 @@ PTREGSCALL2(vm86) PTREGSCALL1(vm86old) /* Clone is an oddball. The 4th arg is in %edi */ - ALIGN; -ptregs_clone: +ENTRY(ptregs_clone) CFI_STARTPROC leal 4(%esp),%eax pushl_cfi %eax @@ -1213,11 +1209,6 @@ return_to_handler: jmp *%ecx #endif -.section .rodata,"a" -#include "syscall_table_32.S" - -syscall_table_size=(.-sys_call_table) - /* * Some functions should be protected against kprobes */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb..3fe8239 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include <asm/paravirt.h> #include <asm/ftrace.h> #include <asm/percpu.h> +#include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -548,7 +549,7 @@ badsys: #ifdef CONFIG_AUDITSYSCALL /* * Fast path for syscall audit without full syscall trace. - * We just call audit_syscall_entry() directly, and then + * We just call __audit_syscall_entry() directly, and then * jump back to the normal fast path. */ auditsys: @@ -558,22 +559,21 @@ auditsys: movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ movq %rax,%rsi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath /* - * Return fast path for syscall audit. Call audit_syscall_exit() + * Return fast path for syscall audit. Call __audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $0,%rsi /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ @@ -1480,62 +1480,214 @@ ENTRY(error_exit) CFI_ENDPROC END(error_exit) +/* + * Test if a given stack is an NMI stack or not. + */ + .macro test_in_nmi reg stack nmi_ret normal_ret + cmpq %\reg, \stack + ja \normal_ret + subq $EXCEPTION_STKSZ, %\reg + cmpq %\reg, \stack + jb \normal_ret + jmp \nmi_ret + .endm /* runs on exception stack */ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. + * This means that we can have nested NMIs where the next + * NMI is using the top of the stack of the previous NMI. We + * can't let it execute because the nested NMI will corrupt the + * stack of the previous NMI. NMI handlers are not re-entrant + * anyway. + * + * To handle this case we do the following: + * Check the a special location on the stack that contains + * a variable that is set when NMIs are executing. + * The interrupted task's stack is also checked to see if it + * is an NMI stack. + * If the variable is not set and the stack is not the NMI + * stack then: + * o Set the special variable on the stack + * o Copy the interrupt frame into a "saved" location on the stack + * o Copy the interrupt frame into a "copy" location on the stack + * o Continue processing the NMI + * If the variable is set or the previous stack is the NMI stack: + * o Modify the "copy" location to jump to the repeate_nmi + * o return back to the first NMI + * + * Now on exit of the first NMI, we first clear the stack variable + * The NMI stack will tell any nested NMIs at that point that it is + * nested. Then we pop the stack normally with iret, and if there was + * a nested NMI that updated the copy interrupt stack frame, a + * jump will be made to the repeat_nmi code that will handle the second + * NMI. + */ + + /* Use %rdx as out temp variable throughout */ + pushq_cfi %rdx + + /* + * Check the special variable on the stack to see if NMIs are + * executing. + */ + cmp $1, -8(%rsp) + je nested_nmi + + /* + * Now test if the previous stack was an NMI stack. + * We need the double check. We check the NMI stack to satisfy the + * race when the first NMI clears the variable before returning. + * We check the variable because the first NMI could be in a + * breakpoint routine using a breakpoint stack. + */ + lea 6*8(%rsp), %rdx + test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + +nested_nmi: + /* + * Do nothing if we interrupted the fixup in repeat_nmi. + * It's about to repeat the NMI handler, so we are fine + * with ignoring this one. + */ + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out + +1: + /* Set up the interrupted NMIs stack to jump to repeat_nmi */ + leaq -6*8(%rsp), %rdx + movq %rdx, %rsp + CFI_ADJUST_CFA_OFFSET 6*8 + pushq_cfi $__KERNEL_DS + pushq_cfi %rdx + pushfq_cfi + pushq_cfi $__KERNEL_CS + pushq_cfi $repeat_nmi + + /* Put stack back */ + addq $(11*8), %rsp + CFI_ADJUST_CFA_OFFSET -11*8 + +nested_nmi_out: + popq_cfi %rdx + + /* No need to check faults here */ + INTERRUPT_RETURN + +first_nmi: + /* + * Because nested NMIs will use the pushed location that we + * stored in rdx, we must keep that space available. + * Here's what our stack frame will look like: + * +-------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +-------------------------+ + * | temp storage for rdx | + * +-------------------------+ + * | NMI executing variable | + * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ + * | copied SS | + * | copied Return RSP | + * | copied RFLAGS | + * | copied CS | + * | copied RIP | + * +-------------------------+ + * | pt_regs | + * +-------------------------+ + * + * The saved RIP is used to fix up the copied RIP that a nested + * NMI may zero out. The original stack frame and the temp storage + * is also used by nested NMIs and can not be trusted on exit. + */ + /* Set the NMI executing variable on the stack. */ + pushq_cfi $1 + + /* Copy the stack frame to the Saved frame */ + .rept 5 + pushq_cfi 6*8(%rsp) + .endr + + /* Make another copy, this one may be modified by nested NMIs */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + /* Do not pop rdx, nested NMIs will corrupt it */ + movq 11*8(%rsp), %rdx + + /* + * Everything below this point can be preempted by a nested + * NMI if the first NMI took an exception. Repeated NMIs + * caused by an exception and nested NMI will start here, and + * can still be preempted by another NMI. + */ +restart_nmi: + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + /* + * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit + * as we should not be calling schedule in NMI context. + * Even with normal interrupts enabled. An NMI should not be + * setting NEED_RESCHED or anything that normal interrupts and + * exceptions might do. + */ call save_paranoid DEFAULT_FRAME 0 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp,%rdi movq $-1,%rsi call do_nmi -#ifdef CONFIG_TRACE_IRQFLAGS - /* paranoidexit; without TRACE_IRQS_OFF */ - /* ebx: no swapgs flag */ - DISABLE_INTERRUPTS(CLBR_NONE) testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore - testl $3,CS(%rsp) - jnz nmi_userspace nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 + /* Clear the NMI executing stack variable */ + movq $0, 10*8(%rsp) jmp irq_return -nmi_userspace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz nmi_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz nmi_schedule - movl %ebx,%edx /* arg3: thread flags */ - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - jmp nmi_userspace -nmi_schedule: - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - jmp nmi_userspace - CFI_ENDPROC -#else - jmp paranoid_exit CFI_ENDPROC -#endif END(nmi) + /* + * If an NMI hit an iret because of an exception or breakpoint, + * it can lose its NMI context, and a nested NMI may come in. + * In that case, the nested NMI will change the preempted NMI's + * stack to jump to here when it does the final iret. + */ +repeat_nmi: + INTR_FRAME + /* Update the stack variable to say we are still in NMI */ + movq $1, 5*8(%rsp) + + /* copy the saved stack back to copy stack */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + jmp restart_nmi + CFI_ENDPROC +end_repeat_nmi: + ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e11e394..40f4eb3 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -417,6 +417,10 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 + .align L1_CACHE_BYTES +ENTRY(nmi_idt_table) + .skip IDT_ENTRIES * 16 + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 07b0a56..ad0de0c 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -2,7 +2,6 @@ #include <linux/clockchips.h> #include <linux/interrupt.h> #include <linux/export.h> -#include <linux/sysdev.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/i8253.h> diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 7209070..40fc861 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); #ifdef CONFIG_DEBUG_STACKOVERFLOW + +int sysctl_panic_on_stackoverflow __read_mostly; + /* Debugging check for stack overflow: is there less than 1KB free? */ static int check_stack_overflow(void) { @@ -43,6 +46,8 @@ static void print_stack_overflow(void) { printk(KERN_WARNING "low stack detected by irq handler\n"); dump_stack(); + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); } #else diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 69bca46..d04d3ecd 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); +int sysctl_panic_on_stackoverflow; + /* * Probabilistic stack overflow check: * @@ -36,18 +38,39 @@ EXPORT_PER_CPU_SYMBOL(irq_regs); static inline void stack_overflow_check(struct pt_regs *regs) { #ifdef CONFIG_DEBUG_STACKOVERFLOW +#define STACK_TOP_MARGIN 128 + struct orig_ist *oist; + u64 irq_stack_top, irq_stack_bottom; + u64 estack_top, estack_bottom; u64 curbase = (u64)task_stack_page(current); if (user_mode_vm(regs)) return; - WARN_ONCE(regs->sp >= curbase && - regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128, + if (regs->sp >= curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + STACK_TOP_MARGIN && + regs->sp <= curbase + THREAD_SIZE) + return; + + irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) + + STACK_TOP_MARGIN; + irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr); + if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) + return; + + oist = &__get_cpu_var(orig_ist); + estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; + estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; + if (regs->sp >= estack_top && regs->sp <= estack_bottom) + return; + + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", + current->comm, curbase, regs->sp, + irq_stack_top, irq_stack_bottom, + estack_top, estack_bottom); - "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", - current->comm, curbase, regs->sp); + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); #endif } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index b3300e6..313fb5c 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -9,7 +9,7 @@ #include <linux/kprobes.h> #include <linux/init.h> #include <linux/kernel_stat.h> -#include <linux/sysdev.h> +#include <linux/device.h> #include <linux/bitops.h> #include <linux/acpi.h> #include <linux/io.h> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a9c2116..f0c6fd6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -39,8 +39,6 @@ #include <asm/desc.h> #include <asm/tlbflush.h> -#define MMU_QUEUE_SIZE 1024 - static int kvmapf = 1; static int parse_no_kvmapf(char *arg) @@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -struct kvm_para_state { - u8 mmu_queue[MMU_QUEUE_SIZE]; - int mmu_queue_len; -}; - -static DEFINE_PER_CPU(struct kvm_para_state, para_state); static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; -static struct kvm_para_state *kvm_para_state(void) -{ - return &per_cpu(para_state, raw_smp_processor_id()); -} - /* * No need for any "IO delay" on KVM */ @@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) } } -static void kvm_mmu_op(void *buffer, unsigned len) -{ - int r; - unsigned long a1, a2; - - do { - a1 = __pa(buffer); - a2 = 0; /* on i386 __pa() always returns <4G */ - r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2); - buffer += r; - len -= r; - } while (len); -} - -static void mmu_queue_flush(struct kvm_para_state *state) -{ - if (state->mmu_queue_len) { - kvm_mmu_op(state->mmu_queue, state->mmu_queue_len); - state->mmu_queue_len = 0; - } -} - -static void kvm_deferred_mmu_op(void *buffer, int len) -{ - struct kvm_para_state *state = kvm_para_state(); - - if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { - kvm_mmu_op(buffer, len); - return; - } - if (state->mmu_queue_len + len > sizeof state->mmu_queue) - mmu_queue_flush(state); - memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len); - state->mmu_queue_len += len; -} - -static void kvm_mmu_write(void *dest, u64 val) -{ - __u64 pte_phys; - struct kvm_mmu_op_write_pte wpte; - -#ifdef CONFIG_HIGHPTE - struct page *page; - unsigned long dst = (unsigned long) dest; - - page = kmap_atomic_to_page(dest); - pte_phys = page_to_pfn(page); - pte_phys <<= PAGE_SHIFT; - pte_phys += (dst & ~(PAGE_MASK)); -#else - pte_phys = (unsigned long)__pa(dest); -#endif - wpte.header.op = KVM_MMU_OP_WRITE_PTE; - wpte.pte_val = val; - wpte.pte_phys = pte_phys; - - kvm_deferred_mmu_op(&wpte, sizeof wpte); -} - -/* - * We only need to hook operations that are MMU writes. We hook these so that - * we can use lazy MMU mode to batch these operations. We could probably - * improve the performance of the host code if we used some of the information - * here to simplify processing of batched writes. - */ -static void kvm_set_pte(pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd) -{ - kvm_mmu_write(pmdp, pmd_val(pmd)); -} - -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE -static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_pte_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - kvm_mmu_write(ptep, 0); -} - -static void kvm_pmd_clear(pmd_t *pmdp) -{ - kvm_mmu_write(pmdp, 0); -} -#endif - -static void kvm_set_pud(pud_t *pudp, pud_t pud) -{ - kvm_mmu_write(pudp, pud_val(pud)); -} - -#if PAGETABLE_LEVELS == 4 -static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd) -{ - kvm_mmu_write(pgdp, pgd_val(pgd)); -} -#endif -#endif /* PAGETABLE_LEVELS >= 3 */ - -static void kvm_flush_tlb(void) -{ - struct kvm_mmu_op_flush_tlb ftlb = { - .header.op = KVM_MMU_OP_FLUSH_TLB, - }; - - kvm_deferred_mmu_op(&ftlb, sizeof ftlb); -} - -static void kvm_release_pt(unsigned long pfn) -{ - struct kvm_mmu_op_release_pt rpt = { - .header.op = KVM_MMU_OP_RELEASE_PT, - .pt_phys = (u64)pfn << PAGE_SHIFT, - }; - - kvm_mmu_op(&rpt, sizeof rpt); -} - -static void kvm_enter_lazy_mmu(void) -{ - paravirt_enter_lazy_mmu(); -} - -static void kvm_leave_lazy_mmu(void) -{ - struct kvm_para_state *state = kvm_para_state(); - - mmu_queue_flush(state); - paravirt_leave_lazy_mmu(); -} - static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; @@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void) if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; - if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) { - pv_mmu_ops.set_pte = kvm_set_pte; - pv_mmu_ops.set_pte_at = kvm_set_pte_at; - pv_mmu_ops.set_pmd = kvm_set_pmd; -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; - pv_mmu_ops.pte_clear = kvm_pte_clear; - pv_mmu_ops.pmd_clear = kvm_pmd_clear; -#endif - pv_mmu_ops.set_pud = kvm_set_pud; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.set_pgd = kvm_set_pgd; -#endif -#endif - pv_mmu_ops.flush_tlb_user = kvm_flush_tlb; - pv_mmu_ops.release_pte = kvm_release_pt; - pv_mmu_ops.release_pmd = kvm_release_pt; - pv_mmu_ops.release_pud = kvm_release_pt; - - pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; - } #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9302e2d..fda91c3 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu) return err; } -static ssize_t reload_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t reload_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) { unsigned long val; @@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev, return ret; } -static ssize_t version_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); } -static ssize_t pf_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +static ssize_t pf_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); +static DEVICE_ATTR(reload, 0200, NULL, reload_store); +static DEVICE_ATTR(version, 0400, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); static struct attribute *mc_default_attrs[] = { - &attr_reload.attr, - &attr_version.attr, - &attr_processor_flags.attr, + &dev_attr_reload.attr, + &dev_attr_version.attr, + &dev_attr_processor_flags.attr, NULL }; @@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu) return ustate; } -static int mc_sysdev_add(struct sys_device *sys_dev) +static int mc_device_add(struct device *dev, struct subsys_interface *sif) { - int err, cpu = sys_dev->id; + int err, cpu = dev->id; if (!cpu_online(cpu)) return 0; pr_debug("CPU%d added\n", cpu); - err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + err = sysfs_create_group(&dev->kobj, &mc_attr_group); if (err) return err; if (microcode_init_cpu(cpu) == UCODE_ERROR) { - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); return -EINVAL; } return err; } -static int mc_sysdev_remove(struct sys_device *sys_dev) +static int mc_device_remove(struct device *dev, struct subsys_interface *sif) { - int cpu = sys_dev->id; + int cpu = dev->id; if (!cpu_online(cpu)) return 0; pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); return 0; } -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, +static struct subsys_interface mc_cpu_interface = { + .name = "microcode", + .subsys = &cpu_subsys, + .add_dev = mc_device_add, + .remove_dev = mc_device_remove, }; /** @@ -464,9 +466,9 @@ static __cpuinit int mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("CPU%d added\n", cpu); - if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) pr_err("Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); break; @@ -525,7 +527,7 @@ static int __init microcode_init(void) get_online_cpus(); mutex_lock(µcode_mutex); - error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + error = subsys_interface_register(&mc_cpu_interface); mutex_unlock(µcode_mutex); put_online_cpus(); @@ -535,7 +537,7 @@ static int __init microcode_init(void) error = microcode_dev_init(); if (error) - goto out_sysdev_driver; + goto out_driver; register_syscore_ops(&mc_syscore_ops); register_hotcpu_notifier(&mc_cpu_notifier); @@ -545,11 +547,11 @@ static int __init microcode_init(void) return 0; -out_sysdev_driver: +out_driver: get_online_cpus(); mutex_lock(µcode_mutex); - sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + subsys_interface_unregister(&mc_cpu_interface); mutex_unlock(µcode_mutex); put_online_cpus(); @@ -573,7 +575,7 @@ static void __exit microcode_exit(void) get_online_cpus(); mutex_lock(µcode_mutex); - sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + subsys_interface_unregister(&mc_cpu_interface); mutex_unlock(µcode_mutex); put_online_cpus(); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 12fcbe2..9635676 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; -static char *msr_devnode(struct device *dev, mode_t *mode) +static char *msr_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e88f37b..47acaf3 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -405,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) unknown_nmi_error(reason, regs); } +/* + * NMIs can hit breakpoints which will cause it to lose its + * NMI context with the CPU when the breakpoint does an iret. + */ +#ifdef CONFIG_X86_32 +/* + * For i386, NMIs use the same stack as the kernel, and we can + * add a workaround to the iret problem in C. Simply have 3 states + * the NMI can be in. + * + * 1) not running + * 2) executing + * 3) latched + * + * When no NMI is in progress, it is in the "not running" state. + * When an NMI comes in, it goes into the "executing" state. + * Normally, if another NMI is triggered, it does not interrupt + * the running NMI and the HW will simply latch it so that when + * the first NMI finishes, it will restart the second NMI. + * (Note, the latch is binary, thus multiple NMIs triggering, + * when one is running, are ignored. Only one NMI is restarted.) + * + * If an NMI hits a breakpoint that executes an iret, another + * NMI can preempt it. We do not want to allow this new NMI + * to run, but we want to execute it when the first one finishes. + * We set the state to "latched", and the first NMI will perform + * an cmpxchg on the state, and if it doesn't successfully + * reset the state to "not running" it will restart the next + * NMI. + */ +enum nmi_states { + NMI_NOT_RUNNING, + NMI_EXECUTING, + NMI_LATCHED, +}; +static DEFINE_PER_CPU(enum nmi_states, nmi_state); + +#define nmi_nesting_preprocess(regs) \ + do { \ + if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ + __get_cpu_var(nmi_state) = NMI_LATCHED; \ + return; \ + } \ + nmi_restart: \ + __get_cpu_var(nmi_state) = NMI_EXECUTING; \ + } while (0) + +#define nmi_nesting_postprocess() \ + do { \ + if (cmpxchg(&__get_cpu_var(nmi_state), \ + NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ + goto nmi_restart; \ + } while (0) +#else /* x86_64 */ +/* + * In x86_64 things are a bit more difficult. This has the same problem + * where an NMI hitting a breakpoint that calls iret will remove the + * NMI context, allowing a nested NMI to enter. What makes this more + * difficult is that both NMIs and breakpoints have their own stack. + * When a new NMI or breakpoint is executed, the stack is set to a fixed + * point. If an NMI is nested, it will have its stack set at that same + * fixed address that the first NMI had, and will start corrupting the + * stack. This is handled in entry_64.S, but the same problem exists with + * the breakpoint stack. + * + * If a breakpoint is being processed, and the debug stack is being used, + * if an NMI comes in and also hits a breakpoint, the stack pointer + * will be set to the same fixed address as the breakpoint that was + * interrupted, causing that stack to be corrupted. To handle this case, + * check if the stack that was interrupted is the debug stack, and if + * so, change the IDT so that new breakpoints will use the current stack + * and not switch to the fixed address. On return of the NMI, switch back + * to the original IDT. + */ +static DEFINE_PER_CPU(int, update_debug_stack); + +static inline void nmi_nesting_preprocess(struct pt_regs *regs) +{ + /* + * If we interrupted a breakpoint, it is possible that + * the nmi handler will have breakpoints too. We need to + * change the IDT such that breakpoints that happen here + * continue to use the NMI stack. + */ + if (unlikely(is_debug_stack(regs->sp))) { + debug_stack_set_zero(); + __get_cpu_var(update_debug_stack) = 1; + } +} + +static inline void nmi_nesting_postprocess(void) +{ + if (unlikely(__get_cpu_var(update_debug_stack))) + debug_stack_reset(); +} +#endif + dotraplinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { + nmi_nesting_preprocess(regs); + nmi_enter(); inc_irq_stat(__nmi_count); @@ -416,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code) default_do_nmi(regs); nmi_exit(); + + /* On i386, may loop back to preprocess */ + nmi_nesting_postprocess(); } void stop_nmi(void) diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c new file mode 100644 index 0000000..0d01a8e --- /dev/null +++ b/arch/x86/kernel/nmi_selftest.c @@ -0,0 +1,180 @@ +/* + * arch/x86/kernel/nmi-selftest.c + * + * Testsuite for NMI: IPIs + * + * Started by Don Zickus: + * (using lib/locking-selftest.c as a guide) + * + * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com> + */ + +#include <linux/smp.h> +#include <linux/cpumask.h> +#include <linux/delay.h> + +#include <asm/apic.h> +#include <asm/nmi.h> + +#define SUCCESS 0 +#define FAILURE 1 +#define TIMEOUT 2 + +static int nmi_fail; + +/* check to see if NMI IPIs work on this machine */ +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; + +static int testcase_total; +static int testcase_successes; +static int expected_testcase_failures; +static int unexpected_testcase_failures; +static int unexpected_testcase_unknowns; + +static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) +{ + unexpected_testcase_unknowns++; + return NMI_HANDLED; +} + +static void init_nmi_testsuite(void) +{ + /* trap all the unknown NMIs we may generate */ + register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); +} + +static void cleanup_nmi_testsuite(void) +{ + unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); +} + +static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +{ + int cpu = raw_smp_processor_id(); + + if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask))) + return NMI_HANDLED; + + return NMI_DONE; +} + +static void test_nmi_ipi(struct cpumask *mask) +{ + unsigned long timeout; + + if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + NMI_FLAG_FIRST, "nmi_selftest")) { + nmi_fail = FAILURE; + return; + } + + /* sync above data before sending NMI */ + wmb(); + + apic->send_IPI_mask(mask, NMI_VECTOR); + + /* Don't wait longer than a second */ + timeout = USEC_PER_SEC; + while (!cpumask_empty(mask) && timeout--) + udelay(1); + + /* What happens if we timeout, do we still unregister?? */ + unregister_nmi_handler(NMI_LOCAL, "nmi_selftest"); + + if (!timeout) + nmi_fail = TIMEOUT; + return; +} + +static void remote_ipi(void) +{ + cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + if (!cpumask_empty(to_cpumask(nmi_ipi_mask))) + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void local_ipi(void) +{ + cpumask_clear(to_cpumask(nmi_ipi_mask)); + cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void reset_nmi(void) +{ + nmi_fail = 0; +} + +static void dotest(void (*testcase_fn)(void), int expected) +{ + testcase_fn(); + /* + * Filter out expected failures: + */ + if (nmi_fail != expected) { + unexpected_testcase_failures++; + + if (nmi_fail == FAILURE) + printk("FAILED |"); + else if (nmi_fail == TIMEOUT) + printk("TIMEOUT|"); + else + printk("ERROR |"); + dump_stack(); + } else { + testcase_successes++; + printk(" ok |"); + } + testcase_total++; + + reset_nmi(); +} + +static inline void print_testname(const char *testname) +{ + printk("%12s:", testname); +} + +void nmi_selftest(void) +{ + init_nmi_testsuite(); + + /* + * Run the testsuite: + */ + printk("----------------\n"); + printk("| NMI testsuite:\n"); + printk("--------------------\n"); + + print_testname("remote IPI"); + dotest(remote_ipi, SUCCESS); + printk("\n"); + print_testname("local IPI"); + dotest(local_ipi, SUCCESS); + printk("\n"); + + cleanup_nmi_testsuite(); + + if (unexpected_testcase_failures) { + printk("--------------------\n"); + printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", + unexpected_testcase_failures, testcase_total); + printk("-----------------------------------------------------------------\n"); + } else if (expected_testcase_failures && testcase_successes) { + printk("--------------------\n"); + printk("%3d out of %3d testcases failed, as expected. |\n", + expected_testcase_failures, testcase_total); + printk("----------------------------------------------------\n"); + } else if (expected_testcase_failures && !testcase_successes) { + printk("--------------------\n"); + printk("All %3d testcases failed, as expected. |\n", + expected_testcase_failures); + printk("----------------------------------------\n"); + } else { + printk("--------------------\n"); + printk("Good, all %3d testcases passed! |\n", + testcase_successes); + printk("---------------------------------\n"); + } +} diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 80dc793..1c4d769 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; +/* + * Group multi-function PCI devices into a single device-group for the + * iommu_device_group interface. This tells the iommu driver to pretend + * it cannot distinguish between functions of a device, exposing only one + * group for the device. Useful for disallowing use of individual PCI + * functions from userspace drivers. + */ +int iommu_group_mf __read_mostly; + extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; /* Dummy device used for NULL arguments (normally ISA). */ @@ -169,6 +178,8 @@ static __init int iommu_setup(char *p) #endif if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; + if (!strncmp(p, "group_mf", 8)) + iommu_group_mf = 1; gart_parse_options(p); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 89a04c7..5026738 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (unlikely(current->audit_context)) { - if (IS_IA32) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); #ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); #endif - } return ret ?: regs->orig_ax; } @@ -1414,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d05444a..d7d5099 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -749,12 +749,7 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, -#ifdef CONFIG_X86_32 - "EL32", -#else - "EL64", -#endif - 4)) { + EFI_LOADER_SIGNATURE, 4)) { efi_enabled = 1; efi_memblock_x86_reserve_range(); } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 54ddaeb2..46a01bdc 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -682,7 +682,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - sigset_t blocked; int ret; /* Are we from a system call? */ @@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); - set_current_blocked(&blocked); + block_sigmask(ka, sig); tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 16204dc..66c74f4 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -29,6 +29,7 @@ #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/apic.h> +#include <asm/nmi.h> /* * Some notes on x86 processor bugs affecting SMP operation: * @@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask) free_cpumask_var(allbutself); } +static atomic_t stopping_cpu = ATOMIC_INIT(-1); + +static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) +{ + /* We are registered on stopping cpu too, avoid spurious NMI */ + if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) + return NMI_HANDLED; + + stop_this_cpu(NULL); + + return NMI_HANDLED; +} + +static void native_nmi_stop_other_cpus(int wait) +{ + unsigned long flags; + unsigned long timeout; + + if (reboot_force) + return; + + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. + */ + if (num_online_cpus() > 1) { + /* did someone beat us here? */ + if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) + return; + + if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop")) + /* Note: we ignore failures here */ + return; + + /* sync above data before sending NMI */ + wmb(); + + apic->send_IPI_allbutself(NMI_VECTOR); + + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) + udelay(1); + } + + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); +} + /* * this function calls the 'stop' function on all other CPUs in the system. */ @@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void) irq_exit(); } -static void native_stop_other_cpus(int wait) +static void native_irq_stop_other_cpus(int wait) { unsigned long flags; unsigned long timeout; @@ -194,6 +249,11 @@ static void native_stop_other_cpus(int wait) local_irq_restore(flags); } +static void native_smp_disable_nmi_ipi(void) +{ + smp_ops.stop_other_cpus = native_irq_stop_other_cpus; +} + /* * Reschedule call back. */ @@ -225,12 +285,20 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) irq_exit(); } +static int __init nonmi_ipi_setup(char *str) +{ + native_smp_disable_nmi_ipi(); + return 1; +} + +__setup("nonmi_ipi", nonmi_ipi_setup); + struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .stop_other_cpus = native_stop_other_cpus, + .stop_other_cpus = native_nmi_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e38e217..66d250c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -207,23 +207,29 @@ static void __cpuinit smp_callin(void) * Need to setup vector mappings before we enable interrupts. */ setup_vector_irq(smp_processor_id()); + + /* + * Save our processor parameters. Note: this information + * is needed for clock calibration. + */ + smp_store_cpu_info(cpuid); + /* * Get our bogomips. + * Update loops_per_jiffy in cpu_data. Previous call to + * smp_store_cpu_info() stored a value that is close but not as + * accurate as the value just calculated. * * Need to enable IRQs because it can take longer and then * the NMI watchdog might kill us. */ local_irq_enable(); calibrate_delay(); + cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; local_irq_disable(); pr_debug("Stack at about %p\n", &cpuid); /* - * Save our processor parameters - */ - smp_store_cpu_info(cpuid); - - /* * This must be done before setting cpu_online_mask * or calling notify_cpu_starting. */ @@ -1143,6 +1149,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done.\n"); + nmi_selftest(); impress_friends(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c new file mode 100644 index 0000000..147fcd4 --- /dev/null +++ b/arch/x86/kernel/syscall_32.c @@ -0,0 +1,25 @@ +/* System call table for i386. */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> +#include <asm/asm-offsets.h> + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#include <asm/syscalls_32.h> +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include <asm/syscalls_32.h> +}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index de87d60..7ac7943 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -5,15 +5,11 @@ #include <linux/cache.h> #include <asm/asm-offsets.h> -#define __NO_STUBS +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include <asm/syscalls_64.h> +#undef __SYSCALL_64 -#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_UNISTD_64_H -#include <asm/unistd_64.h> - -#undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = sym, -#undef _ASM_X86_UNISTD_64_H +#define __SYSCALL_64(nr, sym, compat) [nr] = sym, typedef void (*sys_call_ptr_t)(void); @@ -21,9 +17,9 @@ extern void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* - *Smells like a like a compiler bug -- it doesn't work - *when the & below is removed. - */ + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ [0 ... __NR_syscall_max] = &sys_ni_syscall, -#include <asm/unistd_64.h> +#include <asm/syscalls_64.h> }; diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S deleted file mode 100644 index 9a0e312..0000000 --- a/arch/x86/kernel/syscall_table_32.S +++ /dev/null @@ -1,350 +0,0 @@ -ENTRY(sys_call_table) - .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ - .long sys_exit - .long ptregs_fork - .long sys_read - .long sys_write - .long sys_open /* 5 */ - .long sys_close - .long sys_waitpid - .long sys_creat - .long sys_link - .long sys_unlink /* 10 */ - .long ptregs_execve - .long sys_chdir - .long sys_time - .long sys_mknod - .long sys_chmod /* 15 */ - .long sys_lchown16 - .long sys_ni_syscall /* old break syscall holder */ - .long sys_stat - .long sys_lseek - .long sys_getpid /* 20 */ - .long sys_mount - .long sys_oldumount - .long sys_setuid16 - .long sys_getuid16 - .long sys_stime /* 25 */ - .long sys_ptrace - .long sys_alarm - .long sys_fstat - .long sys_pause - .long sys_utime /* 30 */ - .long sys_ni_syscall /* old stty syscall holder */ - .long sys_ni_syscall /* old gtty syscall holder */ - .long sys_access - .long sys_nice - .long sys_ni_syscall /* 35 - old ftime syscall holder */ - .long sys_sync - .long sys_kill - .long sys_rename - .long sys_mkdir - .long sys_rmdir /* 40 */ - .long sys_dup - .long sys_pipe - .long sys_times - .long sys_ni_syscall /* old prof syscall holder */ - .long sys_brk /* 45 */ - .long sys_setgid16 - .long sys_getgid16 - .long sys_signal - .long sys_geteuid16 - .long sys_getegid16 /* 50 */ - .long sys_acct - .long sys_umount /* recycled never used phys() */ - .long sys_ni_syscall /* old lock syscall holder */ - .long sys_ioctl - .long sys_fcntl /* 55 */ - .long sys_ni_syscall /* old mpx syscall holder */ - .long sys_setpgid - .long sys_ni_syscall /* old ulimit syscall holder */ - .long sys_olduname - .long sys_umask /* 60 */ - .long sys_chroot - .long sys_ustat - .long sys_dup2 - .long sys_getppid - .long sys_getpgrp /* 65 */ - .long sys_setsid - .long sys_sigaction - .long sys_sgetmask - .long sys_ssetmask - .long sys_setreuid16 /* 70 */ - .long sys_setregid16 - .long sys_sigsuspend - .long sys_sigpending - .long sys_sethostname - .long sys_setrlimit /* 75 */ - .long sys_old_getrlimit - .long sys_getrusage - .long sys_gettimeofday - .long sys_settimeofday - .long sys_getgroups16 /* 80 */ - .long sys_setgroups16 - .long sys_old_select - .long sys_symlink - .long sys_lstat - .long sys_readlink /* 85 */ - .long sys_uselib - .long sys_swapon - .long sys_reboot - .long sys_old_readdir - .long sys_old_mmap /* 90 */ - .long sys_munmap - .long sys_truncate - .long sys_ftruncate - .long sys_fchmod - .long sys_fchown16 /* 95 */ - .long sys_getpriority - .long sys_setpriority - .long sys_ni_syscall /* old profil syscall holder */ - .long sys_statfs - .long sys_fstatfs /* 100 */ - .long sys_ioperm - .long sys_socketcall - .long sys_syslog - .long sys_setitimer - .long sys_getitimer /* 105 */ - .long sys_newstat - .long sys_newlstat - .long sys_newfstat - .long sys_uname - .long ptregs_iopl /* 110 */ - .long sys_vhangup - .long sys_ni_syscall /* old "idle" system call */ - .long ptregs_vm86old - .long sys_wait4 - .long sys_swapoff /* 115 */ - .long sys_sysinfo - .long sys_ipc - .long sys_fsync - .long ptregs_sigreturn - .long ptregs_clone /* 120 */ - .long sys_setdomainname - .long sys_newuname - .long sys_modify_ldt - .long sys_adjtimex - .long sys_mprotect /* 125 */ - .long sys_sigprocmask - .long sys_ni_syscall /* old "create_module" */ - .long sys_init_module - .long sys_delete_module - .long sys_ni_syscall /* 130: old "get_kernel_syms" */ - .long sys_quotactl - .long sys_getpgid - .long sys_fchdir - .long sys_bdflush - .long sys_sysfs /* 135 */ - .long sys_personality - .long sys_ni_syscall /* reserved for afs_syscall */ - .long sys_setfsuid16 - .long sys_setfsgid16 - .long sys_llseek /* 140 */ - .long sys_getdents - .long sys_select - .long sys_flock - .long sys_msync - .long sys_readv /* 145 */ - .long sys_writev - .long sys_getsid - .long sys_fdatasync - .long sys_sysctl - .long sys_mlock /* 150 */ - .long sys_munlock - .long sys_mlockall - .long sys_munlockall - .long sys_sched_setparam - .long sys_sched_getparam /* 155 */ - .long sys_sched_setscheduler - .long sys_sched_getscheduler - .long sys_sched_yield - .long sys_sched_get_priority_max - .long sys_sched_get_priority_min /* 160 */ - .long sys_sched_rr_get_interval - .long sys_nanosleep - .long sys_mremap - .long sys_setresuid16 - .long sys_getresuid16 /* 165 */ - .long ptregs_vm86 - .long sys_ni_syscall /* Old sys_query_module */ - .long sys_poll - .long sys_ni_syscall /* Old nfsservctl */ - .long sys_setresgid16 /* 170 */ - .long sys_getresgid16 - .long sys_prctl - .long ptregs_rt_sigreturn - .long sys_rt_sigaction - .long sys_rt_sigprocmask /* 175 */ - .long sys_rt_sigpending - .long sys_rt_sigtimedwait - .long sys_rt_sigqueueinfo - .long sys_rt_sigsuspend - .long sys_pread64 /* 180 */ - .long sys_pwrite64 - .long sys_chown16 - .long sys_getcwd - .long sys_capget - .long sys_capset /* 185 */ - .long ptregs_sigaltstack - .long sys_sendfile - .long sys_ni_syscall /* reserved for streams1 */ - .long sys_ni_syscall /* reserved for streams2 */ - .long ptregs_vfork /* 190 */ - .long sys_getrlimit - .long sys_mmap_pgoff - .long sys_truncate64 - .long sys_ftruncate64 - .long sys_stat64 /* 195 */ - .long sys_lstat64 - .long sys_fstat64 - .long sys_lchown - .long sys_getuid - .long sys_getgid /* 200 */ - .long sys_geteuid - .long sys_getegid - .long sys_setreuid - .long sys_setregid - .long sys_getgroups /* 205 */ - .long sys_setgroups - .long sys_fchown - .long sys_setresuid - .long sys_getresuid - .long sys_setresgid /* 210 */ - .long sys_getresgid - .long sys_chown - .long sys_setuid - .long sys_setgid - .long sys_setfsuid /* 215 */ - .long sys_setfsgid - .long sys_pivot_root - .long sys_mincore - .long sys_madvise - .long sys_getdents64 /* 220 */ - .long sys_fcntl64 - .long sys_ni_syscall /* reserved for TUX */ - .long sys_ni_syscall - .long sys_gettid - .long sys_readahead /* 225 */ - .long sys_setxattr - .long sys_lsetxattr - .long sys_fsetxattr - .long sys_getxattr - .long sys_lgetxattr /* 230 */ - .long sys_fgetxattr - .long sys_listxattr - .long sys_llistxattr - .long sys_flistxattr - .long sys_removexattr /* 235 */ - .long sys_lremovexattr - .long sys_fremovexattr - .long sys_tkill - .long sys_sendfile64 - .long sys_futex /* 240 */ - .long sys_sched_setaffinity - .long sys_sched_getaffinity - .long sys_set_thread_area - .long sys_get_thread_area - .long sys_io_setup /* 245 */ - .long sys_io_destroy - .long sys_io_getevents - .long sys_io_submit - .long sys_io_cancel - .long sys_fadvise64 /* 250 */ - .long sys_ni_syscall - .long sys_exit_group - .long sys_lookup_dcookie - .long sys_epoll_create - .long sys_epoll_ctl /* 255 */ - .long sys_epoll_wait - .long sys_remap_file_pages - .long sys_set_tid_address - .long sys_timer_create - .long sys_timer_settime /* 260 */ - .long sys_timer_gettime - .long sys_timer_getoverrun - .long sys_timer_delete - .long sys_clock_settime - .long sys_clock_gettime /* 265 */ - .long sys_clock_getres - .long sys_clock_nanosleep - .long sys_statfs64 - .long sys_fstatfs64 - .long sys_tgkill /* 270 */ - .long sys_utimes - .long sys_fadvise64_64 - .long sys_ni_syscall /* sys_vserver */ - .long sys_mbind - .long sys_get_mempolicy - .long sys_set_mempolicy - .long sys_mq_open - .long sys_mq_unlink - .long sys_mq_timedsend - .long sys_mq_timedreceive /* 280 */ - .long sys_mq_notify - .long sys_mq_getsetattr - .long sys_kexec_load - .long sys_waitid - .long sys_ni_syscall /* 285 */ /* available */ - .long sys_add_key - .long sys_request_key - .long sys_keyctl - .long sys_ioprio_set - .long sys_ioprio_get /* 290 */ - .long sys_inotify_init - .long sys_inotify_add_watch - .long sys_inotify_rm_watch - .long sys_migrate_pages - .long sys_openat /* 295 */ - .long sys_mkdirat - .long sys_mknodat - .long sys_fchownat - .long sys_futimesat - .long sys_fstatat64 /* 300 */ - .long sys_unlinkat - .long sys_renameat - .long sys_linkat - .long sys_symlinkat - .long sys_readlinkat /* 305 */ - .long sys_fchmodat - .long sys_faccessat - .long sys_pselect6 - .long sys_ppoll - .long sys_unshare /* 310 */ - .long sys_set_robust_list - .long sys_get_robust_list - .long sys_splice - .long sys_sync_file_range - .long sys_tee /* 315 */ - .long sys_vmsplice - .long sys_move_pages - .long sys_getcpu - .long sys_epoll_pwait - .long sys_utimensat /* 320 */ - .long sys_signalfd - .long sys_timerfd_create - .long sys_eventfd - .long sys_fallocate - .long sys_timerfd_settime /* 325 */ - .long sys_timerfd_gettime - .long sys_signalfd4 - .long sys_eventfd2 - .long sys_epoll_create1 - .long sys_dup3 /* 330 */ - .long sys_pipe2 - .long sys_inotify_init1 - .long sys_preadv - .long sys_pwritev - .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_event_open - .long sys_recvmmsg - .long sys_fanotify_init - .long sys_fanotify_mark - .long sys_prlimit64 /* 340 */ - .long sys_name_to_handle_at - .long sys_open_by_handle_at - .long sys_clock_adjtime - .long sys_syncfs - .long sys_sendmmsg /* 345 */ - .long sys_setns - .long sys_process_vm_readv - .long sys_process_vm_writev diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fa1191fb..482ec3a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -311,9 +311,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) == NOTIFY_STOP) return; + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); + debug_stack_usage_dec(); } #ifdef CONFIG_X86_64 @@ -406,6 +412,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) SIGTRAP) == NOTIFY_STOP) return; + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); + /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); @@ -413,6 +425,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } @@ -432,6 +445,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } @@ -718,4 +732,10 @@ void __init trap_init(void) cpu_init(); x86_init.irqs.trap_init(); + +#ifdef CONFIG_X86_64 + memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); + set_nmi_gate(1, &debug); + set_nmi_gate(3, &int3); +#endif } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 2c9cf0f..a62c201 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -290,14 +290,15 @@ static inline int pit_verify_msb(unsigned char val) static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; - u64 tsc = 0; + u64 tsc = 0, prev_tsc = 0; for (count = 0; count < 50000; count++) { if (!pit_verify_msb(val)) break; + prev_tsc = tsc; tsc = get_cycles(); } - *deltap = get_cycles() - tsc; + *deltap = get_cycles() - prev_tsc; *tscp = tsc; /* @@ -311,9 +312,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de * How many MSB values do we want to see? We aim for * a maximum error rate of 500ppm (in practice the * real error is much smaller), but refuse to spend - * more than 25ms on it. + * more than 50ms on it. */ -#define MAX_QUICK_PIT_MS 25 +#define MAX_QUICK_PIT_MS 50 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) @@ -383,15 +384,12 @@ success: * * As a result, we can depend on there not being * any odd delays anywhere, and the TSC reads are - * reliable (within the error). We also adjust the - * delta to the middle of the error bars, just - * because it looks nicer. + * reliable (within the error). * * kHz = ticks / time-in-seconds / 1000; * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) */ - delta += (long)(d2 - d1)/2; delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); printk("Fast TSC calibration using PIT\n"); @@ -995,3 +993,23 @@ void __init tsc_init(void) check_system_tsc_reliable(); } +#ifdef CONFIG_SMP +/* + * If we have a constant TSC and are using the TSC for the delay loop, + * we can skip clock calibration if another cpu in the same socket has already + * been calibrated. This assumes that CONSTANT_TSC applies to all + * cpus in the socket - this should be a safe assumption. + */ +unsigned long __cpuinit calibrate_delay_is_known(void) +{ + int i, cpu = smp_processor_id(); + + if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) + return 0; + + for_each_online_cpu(i) + if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) + return cpu_data(i).loops_per_jiffy; + return 0; +} +#endif diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 863f875..b466cab 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -335,9 +335,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call audit_syscall_exit since we do not exit via the normal paths */ + /*call __audit_syscall_exit since we do not exit via the normal paths */ +#ifdef CONFIG_AUDITSYSCALL if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(0), 0); + __audit_syscall_exit(1, 0); +#endif __asm__ __volatile__( "movl %0,%%esp\n\t" diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 91f83e2..947a06c 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -115,4 +115,5 @@ struct x86_msi_ops x86_msi = { .setup_msi_irqs = native_setup_msi_irqs, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, + .restore_msi_irqs = default_restore_msi_irqs, }; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ff5790d..1a7fe86 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -35,6 +35,7 @@ config KVM select KVM_MMIO select TASKSTATS select TASK_DELAY_ACCT + select PERF_EVENTS ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -52,6 +53,8 @@ config KVM config KVM_INTEL tristate "KVM for Intel processors support" depends on KVM + # for perf_guest_get_msrs(): + depends on CPU_SUP_INTEL ---help--- Provides support for KVM on Intel processors equipped with the VT extensions. diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f43..4f579e8 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o + i8254.o timer.o cpuid.o pmu.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c new file mode 100644 index 0000000..89b02bf --- /dev/null +++ b/arch/x86/kvm/cpuid.c @@ -0,0 +1,670 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * cpuid support routines + * + * derived from arch/x86/kvm/x86.c + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * Copyright IBM Corporation, 2008 + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/uaccess.h> +#include <asm/user.h> +#include <asm/xsave.h> +#include "cpuid.h" +#include "lapic.h" +#include "mmu.h" +#include "trace.h" + +void kvm_update_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + struct kvm_lapic *apic = vcpu->arch.apic; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (!best) + return; + + /* Update OSXSAVE bit */ + if (cpu_has_xsave && best->function == 0x1) { + best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) + best->ecx |= bit(X86_FEATURE_OSXSAVE); + } + + if (apic) { + if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) + apic->lapic_timer.timer_mode_mask = 3 << 17; + else + apic->lapic_timer.timer_mode_mask = 1 << 17; + } + + kvm_pmu_cpuid_update(vcpu); +} + +static int is_efer_nx(void) +{ + unsigned long long efer = 0; + + rdmsrl_safe(MSR_EFER, &efer); + return efer & EFER_NX; +} + +static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_cpuid_entry2 *e, *entry; + + entry = NULL; + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + e = &vcpu->arch.cpuid_entries[i]; + if (e->function == 0x80000001) { + entry = e; + break; + } + } + if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { + entry->edx &= ~(1 << 20); + printk(KERN_INFO "kvm: guest NX capability removed\n"); + } +} + +/* when an old userspace process fills a new kernel module */ +int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) +{ + int r, i; + struct kvm_cpuid_entry *cpuid_entries; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); + if (!cpuid_entries) + goto out; + r = -EFAULT; + if (copy_from_user(cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out_free; + for (i = 0; i < cpuid->nent; i++) { + vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; + vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; + vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; + vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; + vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; + vcpu->arch.cpuid_entries[i].index = 0; + vcpu->arch.cpuid_entries[i].flags = 0; + vcpu->arch.cpuid_entries[i].padding[0] = 0; + vcpu->arch.cpuid_entries[i].padding[1] = 0; + vcpu->arch.cpuid_entries[i].padding[2] = 0; + } + vcpu->arch.cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); + r = 0; + kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + kvm_update_cpuid(vcpu); + +out_free: + vfree(cpuid_entries); +out: + return r; +} + +int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -EFAULT; + if (copy_from_user(&vcpu->arch.cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry2))) + goto out; + vcpu->arch.cpuid_nent = cpuid->nent; + kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + kvm_update_cpuid(vcpu); + return 0; + +out: + return r; +} + +int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent < vcpu->arch.cpuid_nent) + goto out; + r = -EFAULT; + if (copy_to_user(entries, &vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) + goto out; + return 0; + +out: + cpuid->nent = vcpu->arch.cpuid_nent; + return r; +} + +static void cpuid_mask(u32 *word, int wordnum) +{ + *word &= boot_cpu_data.x86_capability[wordnum]; +} + +static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index) +{ + entry->function = function; + entry->index = index; + cpuid_count(entry->function, entry->index, + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + entry->flags = 0; +} + +static bool supported_xcr0_bit(unsigned bit) +{ + u64 mask = ((u64)1 << bit); + + return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; +} + +#define F(x) bit(X86_FEATURE_##x) + +static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index, int *nent, int maxnent) +{ + int r; + unsigned f_nx = is_efer_nx() ? F(NX) : 0; +#ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; + unsigned f_lm = F(LM); +#else + unsigned f_gbpages = 0; + unsigned f_lm = 0; +#endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; + + /* cpuid 1.edx */ + const u32 kvm_supported_word0_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */; + /* cpuid 0x80000001.edx */ + const u32 kvm_supported_word1_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); + /* cpuid 1.ecx */ + const u32 kvm_supported_word4_x86_features = + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | + 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | + 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + F(F16C) | F(RDRAND); + /* cpuid 0x80000001.ecx */ + const u32 kvm_supported_word6_x86_features = + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + + /* cpuid 0xC0000001.edx */ + const u32 kvm_supported_word5_x86_features = + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | + F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | + F(PMM) | F(PMM_EN); + + /* cpuid 7.0.ebx */ + const u32 kvm_supported_word9_x86_features = + F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS); + + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); + + r = -E2BIG; + + if (*nent >= maxnent) + goto out; + + do_cpuid_1_ent(entry, function, index); + ++*nent; + + switch (function) { + case 0: + entry->eax = min(entry->eax, (u32)0xd); + break; + case 1: + entry->edx &= kvm_supported_word0_x86_features; + cpuid_mask(&entry->edx, 0); + entry->ecx &= kvm_supported_word4_x86_features; + cpuid_mask(&entry->ecx, 4); + /* we support x2apic emulation even if host does not support + * it since we emulate x2apic in software */ + entry->ecx |= F(X2APIC); + break; + /* function 2 entries are STATEFUL. That is, repeated cpuid commands + * may return different values. This forces us to get_cpu() before + * issuing the first command, and also to emulate this annoying behavior + * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ + case 2: { + int t, times = entry->eax & 0xff; + + entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + for (t = 1; t < times; ++t) { + if (*nent >= maxnent) + goto out; + + do_cpuid_1_ent(&entry[t], function, 0); + entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + ++*nent; + } + break; + } + /* function 4 has additional index. */ + case 4: { + int i, cache_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until cache_type is zero */ + for (i = 1; ; ++i) { + if (*nent >= maxnent) + goto out; + + cache_type = entry[i - 1].eax & 0x1f; + if (!cache_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 7: { + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* Mask ebx against host capbability word 9 */ + if (index == 0) { + entry->ebx &= kvm_supported_word9_x86_features; + cpuid_mask(&entry->ebx, 9); + } else + entry->ebx = 0; + entry->eax = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } + case 9: + break; + case 0xa: { /* Architectural Performance Monitoring */ + struct x86_pmu_capability cap; + union cpuid10_eax eax; + union cpuid10_edx edx; + + perf_get_x86_pmu_capability(&cap); + + /* + * Only support guest architectural pmu on a host + * with architectural pmu. + */ + if (!cap.version) + memset(&cap, 0, sizeof(cap)); + + eax.split.version_id = min(cap.version, 2); + eax.split.num_counters = cap.num_counters_gp; + eax.split.bit_width = cap.bit_width_gp; + eax.split.mask_length = cap.events_mask_len; + + edx.split.num_counters_fixed = cap.num_counters_fixed; + edx.split.bit_width_fixed = cap.bit_width_fixed; + edx.split.reserved = 0; + + entry->eax = eax.full; + entry->ebx = cap.events_mask; + entry->ecx = 0; + entry->edx = edx.full; + break; + } + /* function 0xb has additional index. */ + case 0xb: { + int i, level_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until level_type is zero */ + for (i = 1; ; ++i) { + if (*nent >= maxnent) + goto out; + + level_type = entry[i - 1].ecx & 0xff00; + if (!level_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0xd: { + int idx, i; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (idx = 1, i = 1; idx < 64; ++idx) { + if (*nent >= maxnent) + goto out; + + do_cpuid_1_ent(&entry[i], function, idx); + if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) + continue; + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + ++i; + } + break; + } + case KVM_CPUID_SIGNATURE: { + char signature[12] = "KVMKVMKVM\0\0"; + u32 *sigptr = (u32 *)signature; + entry->eax = 0; + entry->ebx = sigptr[0]; + entry->ecx = sigptr[1]; + entry->edx = sigptr[2]; + break; + } + case KVM_CPUID_FEATURES: + entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | + (1 << KVM_FEATURE_NOP_IO_DELAY) | + (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_ASYNC_PF) | + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + + if (sched_info_on()) + entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); + + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; + case 0x80000000: + entry->eax = min(entry->eax, 0x8000001a); + break; + case 0x80000001: + entry->edx &= kvm_supported_word1_x86_features; + cpuid_mask(&entry->edx, 1); + entry->ecx &= kvm_supported_word6_x86_features; + cpuid_mask(&entry->ecx, 6); + break; + case 0x80000008: { + unsigned g_phys_as = (entry->eax >> 16) & 0xff; + unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); + unsigned phys_as = entry->eax & 0xff; + + if (!g_phys_as) + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); + entry->ebx = entry->edx = 0; + break; + } + case 0x80000019: + entry->ecx = entry->edx = 0; + break; + case 0x8000001a: + break; + case 0x8000001d: + break; + /*Add support for Centaur's CPUID instruction*/ + case 0xC0000000: + /*Just support up to 0xC0000004 now*/ + entry->eax = min(entry->eax, 0xC0000004); + break; + case 0xC0000001: + entry->edx &= kvm_supported_word5_x86_features; + cpuid_mask(&entry->edx, 5); + break; + case 3: /* Processor serial number */ + case 5: /* MONITOR/MWAIT */ + case 6: /* Thermal management */ + case 0x80000007: /* Advanced power management */ + case 0xC0000002: + case 0xC0000003: + case 0xC0000004: + default: + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + kvm_x86_ops->set_supported_cpuid(function, entry); + + r = 0; + +out: + put_cpu(); + + return r; +} + +#undef F + +struct kvm_cpuid_param { + u32 func; + u32 idx; + bool has_leaf_count; + bool (*qualifier)(struct kvm_cpuid_param *param); +}; + +static bool is_centaur_cpu(struct kvm_cpuid_param *param) +{ + return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; +} + +int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + struct kvm_cpuid_entry2 *cpuid_entries; + int limit, nent = 0, r = -E2BIG, i; + u32 func; + static struct kvm_cpuid_param param[] = { + { .func = 0, .has_leaf_count = true }, + { .func = 0x80000000, .has_leaf_count = true }, + { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, + { .func = KVM_CPUID_SIGNATURE }, + { .func = KVM_CPUID_FEATURES }, + }; + + if (cpuid->nent < 1) + goto out; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + cpuid->nent = KVM_MAX_CPUID_ENTRIES; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + if (!cpuid_entries) + goto out; + + r = 0; + for (i = 0; i < ARRAY_SIZE(param); i++) { + struct kvm_cpuid_param *ent = ¶m[i]; + + if (ent->qualifier && !ent->qualifier(ent)) + continue; + + r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, + &nent, cpuid->nent); + + if (r) + goto out_free; + + if (!ent->has_leaf_count) + continue; + + limit = cpuid_entries[nent - 1].eax; + for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) + r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, + &nent, cpuid->nent); + + if (r) + goto out_free; + } + + r = -EFAULT; + if (copy_to_user(entries, cpuid_entries, + nent * sizeof(struct kvm_cpuid_entry2))) + goto out_free; + cpuid->nent = nent; + r = 0; + +out_free: + vfree(cpuid_entries); +out: + return r; +} + +static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) +{ + struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + int j, nent = vcpu->arch.cpuid_nent; + + e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + /* when no next entry is found, the current entry[i] is reselected */ + for (j = i + 1; ; j = (j + 1) % nent) { + struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + if (ej->function == e->function) { + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + return j; + } + } + return 0; /* silence gcc, even though control never reaches here */ +} + +/* find an entry with matching function, matching index (if needed), and that + * should be read next (if it's stateful) */ +static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, + u32 function, u32 index) +{ + if (e->function != function) + return 0; + if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + return 0; + if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + return 0; + return 1; +} + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + int i; + struct kvm_cpuid_entry2 *best = NULL; + + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + + e = &vcpu->arch.cpuid_entries[i]; + if (is_matching_cpuid_entry(e, function, index)) { + if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) + move_to_next_stateful_cpuid_entry(vcpu, i); + best = e; + break; + } + } + return best; +} +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); + +int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + if (!best || best->eax < 0x80000008) + goto not_found; + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best) + return best->eax & 0xff; +not_found: + return 36; +} + +/* + * If no match is found, check whether we exceed the vCPU's limit + * and return the content of the highest valid _standard_ leaf instead. + * This is to satisfy the CPUID specification. + */ +static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + struct kvm_cpuid_entry2 *maxlevel; + + maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + if (!maxlevel || maxlevel->eax >= function) + return NULL; + if (function & 0x80000000) { + maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); + if (!maxlevel) + return NULL; + } + return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 function, index; + struct kvm_cpuid_entry2 *best; + + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); + best = kvm_find_cpuid_entry(vcpu, function, index); + + if (!best) + best = check_cpuid_limit(vcpu, function, index); + + if (best) { + kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); + } + kvm_x86_ops->skip_emulated_instruction(vcpu); + trace_kvm_cpuid(function, + kvm_register_read(vcpu, VCPU_REGS_RAX), + kvm_register_read(vcpu, VCPU_REGS_RBX), + kvm_register_read(vcpu, VCPU_REGS_RCX), + kvm_register_read(vcpu, VCPU_REGS_RDX)); +} +EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h new file mode 100644 index 0000000..5b97e17 --- /dev/null +++ b/arch/x86/kvm/cpuid.h @@ -0,0 +1,46 @@ +#ifndef ARCH_X86_KVM_CPUID_H +#define ARCH_X86_KVM_CPUID_H + +#include "x86.h" + +void kvm_update_cpuid(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); +int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); +int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries); +int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); +int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); + + +static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_XSAVE)); +} + +static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMEP)); +} + +static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); +} + +#endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f1e3be18..05a562b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -125,8 +125,9 @@ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) +#define PageTable (1 << 29) /* instruction used to write page table */ /* Source 2 operand type */ -#define Src2Shift (29) +#define Src2Shift (30) #define Src2None (OpNone << Src2Shift) #define Src2CL (OpCL << Src2Shift) #define Src2ImmByte (OpImmByte << Src2Shift) @@ -1674,11 +1675,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_grp1a(struct x86_emulate_ctxt *ctxt) -{ - return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes); -} - static int em_grp2(struct x86_emulate_ctxt *ctxt) { switch (ctxt->modrm_reg) { @@ -1788,7 +1784,7 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) return rc; } -static int em_grp9(struct x86_emulate_ctxt *ctxt) +static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) { u64 old = ctxt->dst.orig_val64; @@ -1831,6 +1827,24 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) return rc; } +static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) +{ + /* Save real source value, then compare EAX against destination. */ + ctxt->src.orig_val = ctxt->src.val; + ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; + emulate_2op_SrcV(ctxt, "cmp"); + + if (ctxt->eflags & EFLG_ZF) { + /* Success: write back to memory. */ + ctxt->dst.val = ctxt->src.orig_val; + } else { + /* Failure: write the value we saw to EAX. */ + ctxt->dst.type = OP_REG; + ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; + } + return X86EMUL_CONTINUE; +} + static int em_lseg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; @@ -2481,6 +2495,15 @@ static int em_das(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_call(struct x86_emulate_ctxt *ctxt) +{ + long rel = ctxt->src.val; + + ctxt->src.val = (unsigned long)ctxt->_eip; + jmp_rel(ctxt, rel); + return em_push(ctxt); +} + static int em_call_far(struct x86_emulate_ctxt *ctxt) { u16 sel, old_cs; @@ -2622,12 +2645,75 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_rdpmc(struct x86_emulate_ctxt *ctxt) +{ + u64 pmc; + + if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc)) + return emulate_gp(ctxt, 0); + ctxt->regs[VCPU_REGS_RAX] = (u32)pmc; + ctxt->regs[VCPU_REGS_RDX] = pmc >> 32; + return X86EMUL_CONTINUE; +} + static int em_mov(struct x86_emulate_ctxt *ctxt) { ctxt->dst.val = ctxt->src.val; return X86EMUL_CONTINUE; } +static int em_cr_write(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) + return emulate_gp(ctxt, 0); + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_dr_write(struct x86_emulate_ctxt *ctxt) +{ + unsigned long val; + + if (ctxt->mode == X86EMUL_MODE_PROT64) + val = ctxt->src.val & ~0ULL; + else + val = ctxt->src.val & ~0U; + + /* #UD condition is already handled. */ + if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0) + return emulate_gp(ctxt, 0); + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_wrmsr(struct x86_emulate_ctxt *ctxt) +{ + u64 msr_data; + + msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] + | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); + if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + +static int em_rdmsr(struct x86_emulate_ctxt *ctxt) +{ + u64 msr_data; + + if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) + return emulate_gp(ctxt, 0); + + ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; + ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; + return X86EMUL_CONTINUE; +} + static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) { if (ctxt->modrm_reg > VCPU_SREG_GS) @@ -2775,6 +2861,24 @@ static int em_jcxz(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_in(struct x86_emulate_ctxt *ctxt) +{ + if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val, + &ctxt->dst.val)) + return X86EMUL_IO_NEEDED; + + return X86EMUL_CONTINUE; +} + +static int em_out(struct x86_emulate_ctxt *ctxt) +{ + ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val, + &ctxt->src.val, 1); + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + static int em_cli(struct x86_emulate_ctxt *ctxt) { if (emulator_bad_iopl(ctxt)) @@ -2794,6 +2898,69 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_bt(struct x86_emulate_ctxt *ctxt) +{ + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + /* only subword offset */ + ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; + + emulate_2op_SrcV_nobyte(ctxt, "bt"); + return X86EMUL_CONTINUE; +} + +static int em_bts(struct x86_emulate_ctxt *ctxt) +{ + emulate_2op_SrcV_nobyte(ctxt, "bts"); + return X86EMUL_CONTINUE; +} + +static int em_btr(struct x86_emulate_ctxt *ctxt) +{ + emulate_2op_SrcV_nobyte(ctxt, "btr"); + return X86EMUL_CONTINUE; +} + +static int em_btc(struct x86_emulate_ctxt *ctxt) +{ + emulate_2op_SrcV_nobyte(ctxt, "btc"); + return X86EMUL_CONTINUE; +} + +static int em_bsf(struct x86_emulate_ctxt *ctxt) +{ + u8 zf; + + __asm__ ("bsf %2, %0; setz %1" + : "=r"(ctxt->dst.val), "=q"(zf) + : "r"(ctxt->src.val)); + + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + } + return X86EMUL_CONTINUE; +} + +static int em_bsr(struct x86_emulate_ctxt *ctxt) +{ + u8 zf; + + __asm__ ("bsr %2, %0; setz %1" + : "=r"(ctxt->dst.val), "=q"(zf) + : "r"(ctxt->src.val)); + + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + } + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -2867,9 +3034,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) break; } case 4: { - u64 cr4; - - cr4 = ctxt->ops->get_cr(ctxt, 4); ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) @@ -3003,6 +3167,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define D2bv(_f) D((_f) | ByteOp), D(_f) #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) +#define I2bvIP(_f, _e, _i, _p) \ + IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p) #define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ @@ -3033,17 +3199,17 @@ static struct opcode group7_rm7[] = { static struct opcode group1[] = { I(Lock, em_add), - I(Lock, em_or), + I(Lock | PageTable, em_or), I(Lock, em_adc), I(Lock, em_sbb), - I(Lock, em_and), + I(Lock | PageTable, em_and), I(Lock, em_sub), I(Lock, em_xor), I(0, em_cmp), }; static struct opcode group1A[] = { - D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, + I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N, }; static struct opcode group3[] = { @@ -3058,16 +3224,19 @@ static struct opcode group3[] = { }; static struct opcode group4[] = { - D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), + I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), + I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), N, N, N, N, N, N, }; static struct opcode group5[] = { - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - D(SrcMem | ModRM | Stack), + I(DstMem | SrcNone | ModRM | Lock, em_grp45), + I(DstMem | SrcNone | ModRM | Lock, em_grp45), + I(SrcMem | ModRM | Stack, em_grp45), I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), - D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), - D(SrcMem | ModRM | Stack), N, + I(SrcMem | ModRM | Stack, em_grp45), + I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45), + I(SrcMem | ModRM | Stack, em_grp45), N, }; static struct opcode group6[] = { @@ -3096,18 +3265,21 @@ static struct group_dual group7 = { { static struct opcode group8[] = { N, N, N, N, - D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), - D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), + I(DstMem | SrcImmByte | ModRM, em_bt), + I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts), + I(DstMem | SrcImmByte | ModRM | Lock, em_btr), + I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc), }; static struct group_dual group9 = { { - N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, + N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { N, N, N, N, N, N, N, N, } }; static struct opcode group11[] = { - I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), + I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov), + X7(D(Undefined)), }; static struct gprefix pfx_0f_6f_0f_7f = { @@ -3120,7 +3292,7 @@ static struct opcode opcode_table[256] = { I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), /* 0x08 - 0x0F */ - I6ALU(Lock, em_or), + I6ALU(Lock | PageTable, em_or), I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), N, /* 0x10 - 0x17 */ @@ -3132,7 +3304,7 @@ static struct opcode opcode_table[256] = { I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), /* 0x20 - 0x27 */ - I6ALU(Lock, em_and), N, N, + I6ALU(Lock | PageTable, em_and), N, N, /* 0x28 - 0x2F */ I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ @@ -3155,8 +3327,8 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */ - D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */ + I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */ + I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ @@ -3165,11 +3337,11 @@ static struct opcode opcode_table[256] = { G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), G(DstMem | SrcImmByte | ModRM | Group, group1), I2bv(DstMem | SrcReg | ModRM, em_test), - I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg), + I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), /* 0x88 - 0x8F */ - I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), + I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), - I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg), + I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg), D(ModRM | SrcMem | NoAccess | DstReg), I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm), G(0, group1A), @@ -3182,7 +3354,7 @@ static struct opcode opcode_table[256] = { II(ImplicitOps | Stack, em_popf, popf), N, N, /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), - I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), + I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), I2bv(SrcSI | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstDI | String, em_cmp), /* 0xA8 - 0xAF */ @@ -3213,13 +3385,13 @@ static struct opcode opcode_table[256] = { /* 0xE0 - 0xE7 */ X3(I(SrcImmByte, em_loop)), I(SrcImmByte, em_jcxz), - D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), - D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), + I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), + I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), /* 0xE8 - 0xEF */ - D(SrcImm | Stack), D(SrcImm | ImplicitOps), + I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps), I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), - D2bvIP(SrcDX | DstAcc, in, check_perm_in), - D2bvIP(SrcAcc | DstDX, out, check_perm_out), + I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in), + I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out), /* 0xF0 - 0xF7 */ N, DI(ImplicitOps, icebp), N, N, DI(ImplicitOps | Priv, hlt), D(ImplicitOps), @@ -3242,15 +3414,15 @@ static struct opcode twobyte_table[256] = { /* 0x20 - 0x2F */ DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), - DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write), - DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write), + IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), + IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ - DI(ImplicitOps | Priv, wrmsr), + II(ImplicitOps | Priv, em_wrmsr, wrmsr), IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), - DI(ImplicitOps | Priv, rdmsr), - DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), + II(ImplicitOps | Priv, em_rdmsr, rdmsr), + IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), I(ImplicitOps | VendorSpecific, em_sysenter), I(ImplicitOps | Priv | VendorSpecific, em_sysexit), N, N, @@ -3275,26 +3447,28 @@ static struct opcode twobyte_table[256] = { X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), - DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), + DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), + DI(ImplicitOps, rsm), + I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ - D2bv(DstMem | SrcReg | ModRM | Lock), + I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), - D(DstMem | SrcReg | ModRM | BitOp | Lock), + I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, - G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), - D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + G(BitOp, group8), + I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), + I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ D2bv(DstMem | SrcReg | ModRM | Lock), @@ -3320,6 +3494,7 @@ static struct opcode twobyte_table[256] = { #undef D2bv #undef D2bvIP #undef I2bv +#undef I2bvIP #undef I6ALU static unsigned imm_size(struct x86_emulate_ctxt *ctxt) @@ -3697,6 +3872,11 @@ done: return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } +bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt) +{ + return ctxt->d & PageTable; +} + static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) { /* The second termination condition only applies for REPE @@ -3720,7 +3900,6 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; - u64 msr_data; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; @@ -3854,15 +4033,6 @@ special_insn: goto cannot_emulate; ctxt->dst.val = (s32) ctxt->src.val; break; - case 0x6c: /* insb */ - case 0x6d: /* insw/insd */ - ctxt->src.val = ctxt->regs[VCPU_REGS_RDX]; - goto do_io_in; - case 0x6e: /* outsb */ - case 0x6f: /* outsw/outsd */ - ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX]; - goto do_io_out; - break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) jmp_rel(ctxt, ctxt->src.val); @@ -3870,9 +4040,6 @@ special_insn: case 0x8d: /* lea r16/r32, m */ ctxt->dst.val = ctxt->src.addr.mem.ea; break; - case 0x8f: /* pop (sole member of Grp1a) */ - rc = em_grp1a(ctxt); - break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) break; @@ -3905,38 +4072,11 @@ special_insn: ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; rc = em_grp2(ctxt); break; - case 0xe4: /* inb */ - case 0xe5: /* in */ - goto do_io_in; - case 0xe6: /* outb */ - case 0xe7: /* out */ - goto do_io_out; - case 0xe8: /* call (near) */ { - long int rel = ctxt->src.val; - ctxt->src.val = (unsigned long) ctxt->_eip; - jmp_rel(ctxt, rel); - rc = em_push(ctxt); - break; - } case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ jmp_rel(ctxt, ctxt->src.val); ctxt->dst.type = OP_NONE; /* Disable writeback. */ break; - case 0xec: /* in al,dx */ - case 0xed: /* in (e/r)ax,dx */ - do_io_in: - if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val, - &ctxt->dst.val)) - goto done; /* IO is needed */ - break; - case 0xee: /* out dx,al */ - case 0xef: /* out dx,(e/r)ax */ - do_io_out: - ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val, - &ctxt->src.val, 1); - ctxt->dst.type = OP_NONE; /* Disable writeback. */ - break; case 0xf4: /* hlt */ ctxt->ops->halt(ctxt); break; @@ -3956,12 +4096,6 @@ special_insn: case 0xfd: /* std */ ctxt->eflags |= EFLG_DF; break; - case 0xfe: /* Grp4 */ - rc = em_grp45(ctxt); - break; - case 0xff: /* Grp5 */ - rc = em_grp45(ctxt); - break; default: goto cannot_emulate; } @@ -4036,49 +4170,6 @@ twobyte_insn: case 0x21: /* mov from dr to reg */ ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); break; - case 0x22: /* mov reg, cr */ - if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) { - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - ctxt->dst.type = OP_NONE; - break; - case 0x23: /* mov from reg to dr */ - if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val & - ((ctxt->mode == X86EMUL_MODE_PROT64) ? - ~0ULL : ~0U)) < 0) { - /* #UD condition is already handled by the code above */ - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - - ctxt->dst.type = OP_NONE; /* no writeback */ - break; - case 0x30: - /* wrmsr */ - msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] - | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); - if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) { - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - rc = X86EMUL_CONTINUE; - break; - case 0x32: - /* rdmsr */ - if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) { - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } else { - ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; - ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; - } - rc = X86EMUL_CONTINUE; - break; case 0x40 ... 0x4f: /* cmov */ ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; if (!test_cc(ctxt->b, ctxt->eflags)) @@ -4091,93 +4182,21 @@ twobyte_insn: case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xa3: - bt: /* bt */ - ctxt->dst.type = OP_NONE; - /* only subword offset */ - ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte(ctxt, "bt"); - break; case 0xa4: /* shld imm8, r, r/m */ case 0xa5: /* shld cl, r, r/m */ emulate_2op_cl(ctxt, "shld"); break; - case 0xab: - bts: /* bts */ - emulate_2op_SrcV_nobyte(ctxt, "bts"); - break; case 0xac: /* shrd imm8, r, r/m */ case 0xad: /* shrd cl, r, r/m */ emulate_2op_cl(ctxt, "shrd"); break; case 0xae: /* clflush */ break; - case 0xb0 ... 0xb1: /* cmpxchg */ - /* - * Save real source value, then compare EAX against - * destination. - */ - ctxt->src.orig_val = ctxt->src.val; - ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV(ctxt, "cmp"); - if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ - ctxt->dst.val = ctxt->src.orig_val; - } else { - /* Failure: write the value we saw to EAX. */ - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; - } - break; - case 0xb3: - btr: /* btr */ - emulate_2op_SrcV_nobyte(ctxt, "btr"); - break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val : (u16) ctxt->src.val; break; - case 0xba: /* Grp8 */ - switch (ctxt->modrm_reg & 3) { - case 0: - goto bt; - case 1: - goto bts; - case 2: - goto btr; - case 3: - goto btc; - } - break; - case 0xbb: - btc: /* btc */ - emulate_2op_SrcV_nobyte(ctxt, "btc"); - break; - case 0xbc: { /* bsf */ - u8 zf; - __asm__ ("bsf %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - ctxt->dst.type = OP_NONE; /* Disable writeback. */ - } - break; - } - case 0xbd: { /* bsr */ - u8 zf; - __asm__ ("bsr %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - ctxt->dst.type = OP_NONE; /* Disable writeback. */ - } - break; - } case 0xbe ... 0xbf: /* movsx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : @@ -4194,9 +4213,6 @@ twobyte_insn: ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : (u64) ctxt->src.val; break; - case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = em_grp9(ctxt); - break; default: goto cannot_emulate; } diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 405f262..d68f99d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -344,7 +344,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) struct kvm_timer *pt = &ps->pit_timer; s64 interval; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); @@ -397,15 +397,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { - create_pit_timer(kvm, val, 0); - } + create_pit_timer(kvm, val, 0); break; case 2: case 3: - if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ - create_pit_timer(kvm, val, 1); - } + create_pit_timer(kvm, val, 1); break; default: destroy_pit_timer(kvm->arch.vpit); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index cac4746..b6a7353 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -262,9 +262,10 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq; - struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; + int irq, i; + struct kvm_vcpu *vcpu; u8 irr = s->irr, isr = s->imr; + bool found = false; s->last_irr = 0; s->irr = 0; @@ -281,12 +282,19 @@ void kvm_pic_reset(struct kvm_kpic_state *s) s->special_fully_nested_mode = 0; s->init4 = 0; - for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { - if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) - if (irr & (1 << irq) || isr & (1 << irq)) { - pic_clear_isr(s, irq); - } - } + kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) + if (kvm_apic_accept_pic_intr(vcpu)) { + found = true; + break; + } + + + if (!found) + return; + + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) + if (irr & (1 << irq) || isr & (1 << irq)) + pic_clear_isr(s, irq); } static void pic_ioport_write(void *opaque, u32 addr, u32 val) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 54abb40..cfdc6e0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -38,6 +38,7 @@ #include "irq.h" #include "trace.h" #include "x86.h" +#include "cpuid.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -1120,7 +1121,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) return 0; } -static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) +int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = apic_get_reg(apic, lvt_type); int vector, mode, trig_mode; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 138e8cc..6f4ce25 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -34,6 +34,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); +int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f1b36cf..224b02c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -59,15 +59,6 @@ enum { AUDIT_POST_SYNC }; -char *audit_point_name[] = { - "pre page fault", - "post page fault", - "pre pte write", - "post pte write", - "pre sync", - "post sync" -}; - #undef MMU_DEBUG #ifdef MMU_DEBUG @@ -83,13 +74,10 @@ char *audit_point_name[] = { #endif #ifdef MMU_DEBUG -static int dbg = 0; +static bool dbg = 0; module_param(dbg, bool, 0644); #endif -static int oos_shadow = 1; -module_param(oos_shadow, bool, 0644); - #ifndef MMU_DEBUG #define ASSERT(x) do { } while (0) #else @@ -593,6 +581,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, return 0; } +static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) +{ + return cache->nobjs; +} + static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, struct kmem_cache *cache) { @@ -953,21 +946,35 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) } } +static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level, + struct kvm_memory_slot *slot) +{ + struct kvm_lpage_info *linfo; + + if (likely(level == PT_PAGE_TABLE_LEVEL)) + return &slot->rmap[gfn - slot->base_gfn]; + + linfo = lpage_info_slot(gfn, slot, level); + return &linfo->rmap_pde; +} + /* * Take gfn and return the reverse mapping to it. */ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) { struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; slot = gfn_to_memslot(kvm, gfn); - if (likely(level == PT_PAGE_TABLE_LEVEL)) - return &slot->rmap[gfn - slot->base_gfn]; + return __gfn_to_rmap(kvm, gfn, level, slot); +} - linfo = lpage_info_slot(gfn, slot, level); +static bool rmap_can_add(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_memory_cache *cache; - return &linfo->rmap_pde; + cache = &vcpu->arch.mmu_pte_list_desc_cache; + return mmu_memory_cache_free_objects(cache); } static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -1004,17 +1011,16 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -static int rmap_write_protect(struct kvm *kvm, u64 gfn) +int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, + struct kvm_memory_slot *slot) { unsigned long *rmapp; u64 *spte; int i, write_protected = 0; - rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); - + rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot); spte = rmap_next(kvm, rmapp, NULL); while (spte) { - BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writable_pte(*spte)) { @@ -1027,12 +1033,11 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - rmapp = gfn_to_rmap(kvm, gfn, i); + rmapp = __gfn_to_rmap(kvm, gfn, i, slot); spte = rmap_next(kvm, rmapp, NULL); while (spte) { - BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); + BUG_ON(!is_large_pte(*spte)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); if (is_writable_pte(*spte)) { drop_spte(kvm, spte); @@ -1047,6 +1052,14 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) return write_protected; } +static int rmap_write_protect(struct kvm *kvm, u64 gfn) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(kvm, gfn); + return kvm_mmu_rmap_write_protect(kvm, gfn, slot); +} + static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { @@ -1103,15 +1116,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int (*handler)(struct kvm *kvm, unsigned long *rmapp, unsigned long data)) { - int i, j; + int j; int ret; int retval = 0; struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; slots = kvm_memslots(kvm); - for (i = 0; i < slots->nmemslots; i++) { - struct kvm_memory_slot *memslot = &slots->memslots[i]; + kvm_for_each_memslot(memslot, slots) { unsigned long start = memslot->userspace_addr; unsigned long end; @@ -1324,7 +1337,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); + bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); sp->parent_ptes = 0; mmu_page_add_parent_pte(vcpu, sp, parent_pte); kvm_mod_used_mmu_pages(vcpu->kvm, +1); @@ -1511,6 +1524,13 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, return ret; } +#ifdef CONFIG_KVM_MMU_AUDIT +#include "mmu_audit.c" +#else +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } +static void mmu_audit_disable(void) { } +#endif + static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { @@ -1640,6 +1660,18 @@ static void init_shadow_page_table(struct kvm_mmu_page *sp) sp->spt[i] = 0ull; } +static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) +{ + sp->write_flooding_count = 0; +} + +static void clear_sp_write_flooding_count(u64 *spte) +{ + struct kvm_mmu_page *sp = page_header(__pa(spte)); + + __clear_sp_write_flooding_count(sp); +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -1683,6 +1715,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, } else if (sp->unsync) kvm_mmu_mark_parents_unsync(sp); + __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); return sp; } @@ -1796,7 +1829,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } -static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, +static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *spte) { u64 pte; @@ -1804,17 +1837,21 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, pte = *spte; if (is_shadow_present_pte(pte)) { - if (is_last_spte(pte, sp->role.level)) + if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); - else { + if (is_large_pte(pte)) + --kvm->stat.lpages; + } else { child = page_header(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); } - } else if (is_mmio_spte(pte)) + return true; + } + + if (is_mmio_spte(pte)) mmu_spte_clear_no_track(spte); - if (is_large_pte(pte)) - --kvm->stat.lpages; + return false; } static void kvm_mmu_page_unlink_children(struct kvm *kvm, @@ -1831,15 +1868,6 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) mmu_page_remove_parent_pte(sp, parent_pte); } -static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) -{ - int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) - vcpu->arch.last_pte_updated = NULL; -} - static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *parent_pte; @@ -1899,7 +1927,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, } sp->role.invalid = 1; - kvm_mmu_reset_last_pte_updated(kvm); return ret; } @@ -1985,7 +2012,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; } -static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) +int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; struct hlist_node *node; @@ -1994,7 +2021,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; - + spin_lock(&kvm->mmu_lock); for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); @@ -2002,22 +2029,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); - return r; -} - -static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - struct hlist_node *node; - LIST_HEAD(invalid_list); + spin_unlock(&kvm->mmu_lock); - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: zap %llx %x\n", - __func__, gfn, sp->role.word); - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } - kvm_mmu_commit_zap_page(kvm, &invalid_list); + return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) { @@ -2169,8 +2185,6 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 1; if (!need_unsync && !s->unsync) { - if (!oos_shadow) - return 1; need_unsync = true; } } @@ -2191,11 +2205,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_mmio_spte(sptep, gfn, pfn, pte_access)) return 0; - /* - * We don't set the accessed bit, since we sometimes want to see - * whether the guest actually used the pte (in order to detect - * demand paging). - */ spte = PT_PRESENT_MASK; if (!speculative) spte |= shadow_accessed_mask; @@ -2346,10 +2355,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } kvm_release_pfn_clean(pfn); - if (speculative) { - vcpu->arch.last_pte_updated = sptep; - vcpu->arch.last_pte_gfn = gfn; - } } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) @@ -2840,12 +2845,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) return; vcpu_clear_mmio_info(vcpu, ~0ul); - trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); mmu_sync_children(vcpu, sp); - trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); return; } for (i = 0; i < 4; ++i) { @@ -2857,7 +2862,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_children(vcpu, sp); } } - trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -3510,28 +3515,119 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, kvm_mmu_flush_tlb(vcpu); } -static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) +static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, + const u8 *new, int *bytes) { - u64 *spte = vcpu->arch.last_pte_updated; + u64 gentry; + int r; + + /* + * Assume that the pte write on a page table of the same type + * as the current vcpu paging mode since we update the sptes only + * when they have the same mode. + */ + if (is_pae(vcpu) && *bytes == 4) { + /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ + *gpa &= ~(gpa_t)7; + *bytes = 8; + r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); + if (r) + gentry = 0; + new = (const u8 *)&gentry; + } - return !!(spte && (*spte & shadow_accessed_mask)); + switch (*bytes) { + case 4: + gentry = *(const u32 *)new; + break; + case 8: + gentry = *(const u64 *)new; + break; + default: + gentry = 0; + break; + } + + return gentry; } -static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) +/* + * If we're seeing too many writes to a page, it may no longer be a page table, + * or we may be forking, in which case it is better to unmap the page. + */ +static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte) { - u64 *spte = vcpu->arch.last_pte_updated; + /* + * Skip write-flooding detected for the sp whose level is 1, because + * it can become unsync, then the guest page is not write-protected. + */ + if (sp->role.level == 1) + return false; - if (spte - && vcpu->arch.last_pte_gfn == gfn - && shadow_accessed_mask - && !(*spte & shadow_accessed_mask) - && is_shadow_present_pte(*spte)) - set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); + return ++sp->write_flooding_count >= 3; +} + +/* + * Misaligned accesses are too much trouble to fix up; also, they usually + * indicate a page is not used as a page table. + */ +static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, + int bytes) +{ + unsigned offset, pte_size, misaligned; + + pgprintk("misaligned: gpa %llx bytes %d role %x\n", + gpa, bytes, sp->role.word); + + offset = offset_in_page(gpa); + pte_size = sp->role.cr4_pae ? 8 : 4; + + /* + * Sometimes, the OS only writes the last one bytes to update status + * bits, for example, in linux, andb instruction is used in clear_bit(). + */ + if (!(offset & (pte_size - 1)) && bytes == 1) + return false; + + misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; + + return misaligned; +} + +static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) +{ + unsigned page_offset, quadrant; + u64 *spte; + int level; + + page_offset = offset_in_page(gpa); + level = sp->role.level; + *nspte = 1; + if (!sp->role.cr4_pae) { + page_offset <<= 1; /* 32->64 */ + /* + * A 32-bit pde maps 4MB while the shadow pdes map + * only 2MB. So we need to double the offset again + * and zap two pdes instead of one. + */ + if (level == PT32_ROOT_LEVEL) { + page_offset &= ~7; /* kill rounding error */ + page_offset <<= 1; + *nspte = 2; + } + quadrant = page_offset >> PAGE_SHIFT; + page_offset &= ~PAGE_MASK; + if (quadrant != sp->role.quadrant) + return NULL; + } + + spte = &sp->spt[page_offset / sizeof(*spte)]; + return spte; } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes, - bool guest_initiated) + const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; union kvm_mmu_page_role mask = { .word = 0 }; @@ -3539,8 +3635,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, struct hlist_node *node; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; - unsigned pte_size, page_offset, misaligned, quadrant, offset; - int level, npte, invlpg_counter, r, flooded = 0; + int npte; bool remote_flush, local_flush, zap_page; /* @@ -3551,112 +3646,45 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, return; zap_page = remote_flush = local_flush = false; - offset = offset_in_page(gpa); pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter); + gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes); /* - * Assume that the pte write on a page table of the same type - * as the current vcpu paging mode since we update the sptes only - * when they have the same mode. + * No need to care whether allocation memory is successful + * or not since pte prefetch is skiped if it does not have + * enough objects in the cache. */ - if ((is_pae(vcpu) && bytes == 4) || !new) { - /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ - if (is_pae(vcpu)) { - gpa &= ~(gpa_t)7; - bytes = 8; - } - r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8)); - if (r) - gentry = 0; - new = (const u8 *)&gentry; - } - - switch (bytes) { - case 4: - gentry = *(const u32 *)new; - break; - case 8: - gentry = *(const u64 *)new; - break; - default: - gentry = 0; - break; - } + mmu_topup_memory_caches(vcpu); spin_lock(&vcpu->kvm->mmu_lock); - if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) - gentry = 0; - kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; - trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); - if (guest_initiated) { - kvm_mmu_access_page(vcpu, gfn); - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if (vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; - } - } + kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { - pte_size = sp->role.cr4_pae ? 8 : 4; - misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); - misaligned |= bytes < 4; - if (misaligned || flooded) { - /* - * Misaligned accesses are too much trouble to fix - * up; also, they usually indicate a page is not used - * as a page table. - * - * If we're seeing too many writes to a page, - * it may no longer be a page table, or we may be - * forking, in which case it is better to unmap the - * page. - */ - pgprintk("misaligned: gpa %llx bytes %d role %x\n", - gpa, bytes, sp->role.word); + spte = get_written_sptes(sp, gpa, &npte); + + if (detect_write_misaligned(sp, gpa, bytes) || + detect_write_flooding(sp, spte)) { zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } - page_offset = offset; - level = sp->role.level; - npte = 1; - if (!sp->role.cr4_pae) { - page_offset <<= 1; /* 32->64 */ - /* - * A 32-bit pde maps 4MB while the shadow pdes map - * only 2MB. So we need to double the offset again - * and zap two pdes instead of one. - */ - if (level == PT32_ROOT_LEVEL) { - page_offset &= ~7; /* kill rounding error */ - page_offset <<= 1; - npte = 2; - } - quadrant = page_offset >> PAGE_SHIFT; - page_offset &= ~PAGE_MASK; - if (quadrant != sp->role.quadrant) - continue; - } + + spte = get_written_sptes(sp, gpa, &npte); + if (!spte) + continue; + local_flush = true; - spte = &sp->spt[page_offset / sizeof(*spte)]; while (npte--) { entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte); if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) - & mask.word)) + & mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; @@ -3665,7 +3693,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); + kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); } @@ -3679,9 +3707,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); - spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - spin_unlock(&vcpu->kvm->mmu_lock); + return r; } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); @@ -3702,10 +3729,18 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } +static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr) +{ + if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu)) + return vcpu_match_mmio_gpa(vcpu, addr); + + return vcpu_match_mmio_gva(vcpu, addr); +} + int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, void *insn, int insn_len) { - int r; + int r, emulation_type = EMULTYPE_RETRY; enum emulation_result er; r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); @@ -3717,11 +3752,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, goto out; } - r = mmu_topup_memory_caches(vcpu); - if (r) - goto out; + if (is_mmio_page_fault(vcpu, cr2)) + emulation_type = 0; - er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); + er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); switch (er) { case EMULATE_DONE: @@ -3792,7 +3826,11 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + vcpu->arch.walk_mmu = &vcpu->arch.mmu; + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.translate_gpa = translate_gpa; + vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; return alloc_mmu_pages(vcpu); } @@ -3852,14 +3890,14 @@ restart: spin_unlock(&kvm->mmu_lock); } -static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, - struct list_head *invalid_list) +static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, + struct list_head *invalid_list) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); + kvm_mmu_prepare_zap_page(kvm, page, invalid_list); } static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) @@ -3874,15 +3912,15 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int idx, freed_pages; + int idx; LIST_HEAD(invalid_list); idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); if (!kvm_freed && nr_to_scan > 0 && kvm->arch.n_used_mmu_pages > 0) { - freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, - &invalid_list); + kvm_mmu_remove_some_alloc_mmu_pages(kvm, + &invalid_list); kvm_freed = kvm; } nr_to_scan--; @@ -3944,15 +3982,15 @@ nomem: */ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) { - int i; unsigned int nr_mmu_pages; unsigned int nr_pages = 0; struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; slots = kvm_memslots(kvm); - for (i = 0; i < slots->nmemslots; i++) - nr_pages += slots->memslots[i].npages; + kvm_for_each_memslot(memslot, slots) + nr_pages += memslot->npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, @@ -3961,127 +3999,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) return nr_mmu_pages; } -static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer, - unsigned len) -{ - if (len > buffer->len) - return NULL; - return buffer->ptr; -} - -static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer, - unsigned len) -{ - void *ret; - - ret = pv_mmu_peek_buffer(buffer, len); - if (!ret) - return ret; - buffer->ptr += len; - buffer->len -= len; - buffer->processed += len; - return ret; -} - -static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, - gpa_t addr, gpa_t value) -{ - int bytes = 8; - int r; - - if (!is_long_mode(vcpu) && !is_pae(vcpu)) - bytes = 4; - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - if (!emulator_write_phys(vcpu, addr, &value, bytes)) - return -EFAULT; - - return 1; -} - -static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) -{ - (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu)); - return 1; -} - -static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr) -{ - spin_lock(&vcpu->kvm->mmu_lock); - mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT); - spin_unlock(&vcpu->kvm->mmu_lock); - return 1; -} - -static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu, - struct kvm_pv_mmu_op_buffer *buffer) -{ - struct kvm_mmu_op_header *header; - - header = pv_mmu_peek_buffer(buffer, sizeof *header); - if (!header) - return 0; - switch (header->op) { - case KVM_MMU_OP_WRITE_PTE: { - struct kvm_mmu_op_write_pte *wpte; - - wpte = pv_mmu_read_buffer(buffer, sizeof *wpte); - if (!wpte) - return 0; - return kvm_pv_mmu_write(vcpu, wpte->pte_phys, - wpte->pte_val); - } - case KVM_MMU_OP_FLUSH_TLB: { - struct kvm_mmu_op_flush_tlb *ftlb; - - ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb); - if (!ftlb) - return 0; - return kvm_pv_mmu_flush_tlb(vcpu); - } - case KVM_MMU_OP_RELEASE_PT: { - struct kvm_mmu_op_release_pt *rpt; - - rpt = pv_mmu_read_buffer(buffer, sizeof *rpt); - if (!rpt) - return 0; - return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys); - } - default: return 0; - } -} - -int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, - gpa_t addr, unsigned long *ret) -{ - int r; - struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; - - buffer->ptr = buffer->buf; - buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); - buffer->processed = 0; - - r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); - if (r) - goto out; - - while (buffer->len) { - r = kvm_pv_mmu_op_one(vcpu, buffer); - if (r < 0) - goto out; - if (r == 0) - break; - } - - r = 1; -out: - *ret = buffer->processed; - return r; -} - int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) { struct kvm_shadow_walk_iterator iterator; @@ -4110,12 +4027,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) mmu_free_memory_caches(vcpu); } -#ifdef CONFIG_KVM_MMU_AUDIT -#include "mmu_audit.c" -#else -static void mmu_audit_disable(void) { } -#endif - void kvm_mmu_module_exit(void) { mmu_destroy_caches(); diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 746ec25..fe15dcc 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -19,6 +19,15 @@ #include <linux/ratelimit.h> +char const *audit_point_name[] = { + "pre page fault", + "post page fault", + "pre pte write", + "post pte write", + "pre sync", + "post sync" +}; + #define audit_printk(kvm, fmt, args...) \ printk(KERN_ERR "audit: (%s) error: " \ fmt, audit_point_name[kvm->arch.audit_point], ##args) @@ -224,7 +233,10 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) mmu_spte_walk(vcpu, audit_spte); } -static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) +static bool mmu_audit; +static struct jump_label_key mmu_audit_key; + +static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); @@ -236,18 +248,18 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) audit_vcpu_spte(vcpu); } -static bool mmu_audit; +static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) +{ + if (static_branch((&mmu_audit_key))) + __kvm_mmu_audit(vcpu, point); +} static void mmu_audit_enable(void) { - int ret; - if (mmu_audit) return; - ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); - WARN_ON(ret); - + jump_label_inc(&mmu_audit_key); mmu_audit = true; } @@ -256,8 +268,7 @@ static void mmu_audit_disable(void) if (!mmu_audit) return; - unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); - tracepoint_synchronize_unregister(); + jump_label_dec(&mmu_audit_key); mmu_audit = false; } diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index eed67f3..89fb0e8 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -243,25 +243,6 @@ TRACE_EVENT( TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, __entry->access) ); - -TRACE_EVENT( - kvm_mmu_audit, - TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), - TP_ARGS(vcpu, audit_point), - - TP_STRUCT__entry( - __field(struct kvm_vcpu *, vcpu) - __field(int, audit_point) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->audit_point = audit_point; - ), - - TP_printk("vcpu:%d %s", __entry->vcpu->cpu, - audit_point_name[__entry->audit_point]) -); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9299410..1561028 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -497,6 +497,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_walk_next(&it)) { gfn_t table_gfn; + clear_sp_write_flooding_count(it.sptep); drop_large_spte(vcpu, it.sptep); sp = NULL; @@ -522,6 +523,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_walk_next(&it)) { gfn_t direct_gfn; + clear_sp_write_flooding_count(it.sptep); validate_direct_spte(vcpu, it.sptep, direct_access); drop_large_spte(vcpu, it.sptep); @@ -536,6 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(it.sptep, sp); } + clear_sp_write_flooding_count(it.sptep); mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, user_fault, write_fault, emulate, it.level, gw->gfn, pfn, prefault, map_writable); @@ -599,11 +602,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - if (!prefault) { + if (!prefault) inject_page_fault(vcpu, &walker.fault); - /* reset fork detector */ - vcpu->arch.last_pt_write_count = 0; - } + return 0; } @@ -631,7 +632,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; - trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); + kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); @@ -641,11 +642,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, sptep, *sptep, emulate); - if (!emulate) - vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - ++vcpu->stat.pf_fixed; - trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); + kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); return emulate; @@ -656,65 +654,66 @@ out_unlock: return 0; } +static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) +{ + int offset = 0; + + WARN_ON(sp->role.level != 1); + + if (PTTYPE == 32) + offset = sp->role.quadrant << PT64_LEVEL_BITS; + + return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); +} + static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - gpa_t pte_gpa = -1; int level; u64 *sptep; - int need_flush = 0; vcpu_clear_mmio_info(vcpu, gva); - spin_lock(&vcpu->kvm->mmu_lock); + /* + * No need to check return value here, rmap_can_add() can + * help us to skip pte prefetch later. + */ + mmu_topup_memory_caches(vcpu); + spin_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry(vcpu, gva, iterator) { level = iterator.level; sptep = iterator.sptep; sp = page_header(__pa(sptep)); if (is_last_spte(*sptep, level)) { - int offset, shift; + pt_element_t gpte; + gpa_t pte_gpa; if (!sp->unsync) break; - shift = PAGE_SHIFT - - (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; - offset = sp->role.quadrant << shift; - - pte_gpa = (sp->gfn << PAGE_SHIFT) + offset; + pte_gpa = FNAME(get_level1_sp_gpa)(sp); pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); - if (is_shadow_present_pte(*sptep)) { - if (is_large_pte(*sptep)) - --vcpu->kvm->stat.lpages; - drop_spte(vcpu->kvm, sptep); - need_flush = 1; - } else if (is_mmio_spte(*sptep)) - mmu_spte_clear_no_track(sptep); + if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) + kvm_flush_remote_tlbs(vcpu->kvm); - break; + if (!rmap_can_add(vcpu)) + break; + + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, + sizeof(pt_element_t))) + break; + + FNAME(update_pte)(vcpu, sp, sptep, &gpte); } if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) break; } - - if (need_flush) - kvm_flush_remote_tlbs(vcpu->kvm); - - atomic_inc(&vcpu->kvm->arch.invlpg_counter); - spin_unlock(&vcpu->kvm->mmu_lock); - - if (pte_gpa == -1) - return; - - if (mmu_topup_memory_caches(vcpu)) - return; - kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0); } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, @@ -769,19 +768,14 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - int i, offset, nr_present; + int i, nr_present = 0; bool host_writable; gpa_t first_pte_gpa; - offset = nr_present = 0; - /* direct kvm_mmu_page can not be unsync. */ BUG_ON(sp->role.direct); - if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; - - first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); + first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { unsigned pte_access; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c new file mode 100644 index 0000000..7aad544 --- /dev/null +++ b/arch/x86/kvm/pmu.c @@ -0,0 +1,533 @@ +/* + * Kernel-based Virtual Machine -- Performane Monitoring Unit support + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity <avi@redhat.com> + * Gleb Natapov <gleb@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/types.h> +#include <linux/kvm_host.h> +#include <linux/perf_event.h> +#include "x86.h" +#include "cpuid.h" +#include "lapic.h" + +static struct kvm_arch_event_perf_mapping { + u8 eventsel; + u8 unit_mask; + unsigned event_type; + bool inexact; +} arch_events[] = { + /* Index must match CPUID 0x0A.EBX bit vector */ + [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, + [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, + [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, + [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, +}; + +/* mapping between fixed pmc index and arch_events array */ +int fixed_pmc_events[] = {1, 0, 2}; + +static bool pmc_is_gp(struct kvm_pmc *pmc) +{ + return pmc->type == KVM_PMC_GP; +} + +static inline u64 pmc_bitmask(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + + return pmu->counter_bitmask[pmc->type]; +} + +static inline bool pmc_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + +static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, + u32 base) +{ + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) + return &pmu->gp_counters[msr - base]; + return NULL; +} + +static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) +{ + int base = MSR_CORE_PERF_FIXED_CTR0; + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) + return &pmu->fixed_counters[msr - base]; + return NULL; +} + +static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) +{ + return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx); +} + +static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) +{ + if (idx < X86_PMC_IDX_FIXED) + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); + else + return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); +} + +void kvm_deliver_pmi(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.apic) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); +} + +static void trigger_pmi(struct irq_work *irq_work) +{ + struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, + irq_work); + struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, + arch.pmu); + + kvm_deliver_pmi(vcpu); +} + +static void kvm_perf_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); +} + +static void kvm_perf_overflow_intr(struct perf_event *perf_event, + struct perf_sample_data *data, struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + kvm_perf_overflow(perf_event, data, regs); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + /* + * Inject PMI. If vcpu was in a guest mode during NMI PMI + * can be ejected on a guest mode re-entry. Otherwise we can't + * be sure that vcpu wasn't executing hlt instruction at the + * time of vmexit and is not going to re-enter guest mode until, + * woken up. So we should wake it, but this is impossible from + * NMI context. Do it from irq work instead. + */ + if (!kvm_is_in_guest()) + irq_work_queue(&pmc->vcpu->arch.pmu.irq_work); + else + kvm_make_request(KVM_REQ_PMI, pmc->vcpu); + } +} + +static u64 read_pmc(struct kvm_pmc *pmc) +{ + u64 counter, enabled, running; + + counter = pmc->counter; + + if (pmc->perf_event) + counter += perf_event_read_value(pmc->perf_event, + &enabled, &running); + + /* FIXME: Scaling needed? */ + + return counter & pmc_bitmask(pmc); +} + +static void stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = read_pmc(pmc); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +static void reprogram_counter(struct kvm_pmc *pmc, u32 type, + unsigned config, bool exclude_user, bool exclude_kernel, + bool intr) +{ + struct perf_event *event; + struct perf_event_attr attr = { + .type = type, + .size = sizeof(attr), + .pinned = true, + .exclude_idle = true, + .exclude_host = 1, + .exclude_user = exclude_user, + .exclude_kernel = exclude_kernel, + .config = config, + }; + + attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); + + event = perf_event_create_kernel_counter(&attr, -1, current, + intr ? kvm_perf_overflow_intr : + kvm_perf_overflow, pmc); + if (IS_ERR(event)) { + printk_once("kvm: pmu event creation failed %ld\n", + PTR_ERR(event)); + return; + } + + pmc->perf_event = event; + clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi); +} + +static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select, + u8 unit_mask) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(arch_events); i++) + if (arch_events[i].eventsel == event_select + && arch_events[i].unit_mask == unit_mask + && (pmu->available_event_types & (1 << i))) + break; + + if (i == ARRAY_SIZE(arch_events)) + return PERF_COUNT_HW_MAX; + + return arch_events[i].event_type; +} + +static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +{ + unsigned config, type = PERF_TYPE_RAW; + u8 event_select, unit_mask; + + pmc->eventsel = eventsel; + + stop_counter(pmc); + + if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc)) + return; + + event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; + unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + + if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE | + ARCH_PERFMON_EVENTSEL_INV | + ARCH_PERFMON_EVENTSEL_CMASK))) { + config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, + unit_mask); + if (config != PERF_COUNT_HW_MAX) + type = PERF_TYPE_HARDWARE; + } + + if (type == PERF_TYPE_RAW) + config = eventsel & X86_RAW_EVENT_MASK; + + reprogram_counter(pmc, type, config, + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT); +} + +static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) +{ + unsigned en = en_pmi & 0x3; + bool pmi = en_pmi & 0x8; + + stop_counter(pmc); + + if (!en || !pmc_enabled(pmc)) + return; + + reprogram_counter(pmc, PERF_TYPE_HARDWARE, + arch_events[fixed_pmc_events[idx]].event_type, + !(en & 0x2), /* exclude user */ + !(en & 0x1), /* exclude kernel */ + pmi); +} + +static inline u8 fixed_en_pmi(u64 ctrl, int idx) +{ + return (ctrl >> (idx * 4)) & 0xf; +} + +static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) +{ + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + u8 en_pmi = fixed_en_pmi(data, i); + struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i); + + if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi) + continue; + + reprogram_fixed_counter(pmc, en_pmi, i); + } + + pmu->fixed_ctr_ctrl = data; +} + +static void reprogram_idx(struct kvm_pmu *pmu, int idx) +{ + struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx); + + if (!pmc) + return; + + if (pmc_is_gp(pmc)) + reprogram_gp_counter(pmc, pmc->eventsel); + else { + int fidx = idx - X86_PMC_IDX_FIXED; + reprogram_fixed_counter(pmc, + fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); + } +} + +static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +{ + int bit; + u64 diff = pmu->global_ctrl ^ data; + + pmu->global_ctrl = data; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + reprogram_idx(pmu, bit); +} + +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + int ret; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + ret = pmu->version > 1; + break; + default: + ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) + || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) + || get_fixed_pmc(pmu, msr); + break; + } + return ret; +} + +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + + switch (index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + *data = pmu->fixed_ctr_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_STATUS: + *data = pmu->global_status; + return 0; + case MSR_CORE_PERF_GLOBAL_CTRL: + *data = pmu->global_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *data = pmu->global_ovf_ctrl; + return 0; + default: + if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, index))) { + *data = read_pmc(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { + *data = pmc->eventsel; + return 0; + } + } + return 1; +} + +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + + switch (index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + if (pmu->fixed_ctr_ctrl == data) + return 0; + if (!(data & 0xfffffffffffff444)) { + reprogram_fixed_counters(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + break; /* RO MSR */ + case MSR_CORE_PERF_GLOBAL_CTRL: + if (pmu->global_ctrl == data) + return 0; + if (!(data & pmu->global_ctrl_mask)) { + global_ctrl_changed(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { + pmu->global_status &= ~data; + pmu->global_ovf_ctrl = data; + return 0; + } + break; + default: + if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, index))) { + data = (s64)(s32)data; + pmc->counter += data - read_pmc(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { + if (data == pmc->eventsel) + return 0; + if (!(data & 0xffffffff00200000ull)) { + reprogram_gp_counter(pmc, data); + return 0; + } + } + } + return 1; +} + +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + bool fast_mode = pmc & (1u << 31); + bool fixed = pmc & (1u << 30); + struct kvm_pmc *counters; + u64 ctr; + + pmc &= (3u << 30) - 1; + if (!fixed && pmc >= pmu->nr_arch_gp_counters) + return 1; + if (fixed && pmc >= pmu->nr_arch_fixed_counters) + return 1; + counters = fixed ? pmu->fixed_counters : pmu->gp_counters; + ctr = read_pmc(&counters[pmc]); + if (fast_mode) + ctr = (u32)ctr; + *data = ctr; + + return 0; +} + +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_cpuid_entry2 *entry; + unsigned bitmap_len; + + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->version = 0; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry) + return; + + pmu->version = entry->eax & 0xff; + if (!pmu->version) + return; + + pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, + X86_PMC_MAX_GENERIC); + pmu->counter_bitmask[KVM_PMC_GP] = + ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; + bitmap_len = (entry->eax >> 24) & 0xff; + pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1); + + if (pmu->version == 1) { + pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1; + return; + } + + pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), + X86_PMC_MAX_FIXED); + pmu->counter_bitmask[KVM_PMC_FIXED] = + ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; + pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1) + | (((1ull << pmu->nr_arch_fixed_counters) - 1) + << X86_PMC_IDX_FIXED)); +} + +void kvm_pmu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + memset(pmu, 0, sizeof(*pmu)); + for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; + } + for (i = 0; i < X86_PMC_MAX_FIXED; i++) { + pmu->fixed_counters[i].type = KVM_PMC_FIXED; + pmu->fixed_counters[i].vcpu = vcpu; + pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; + } + init_irq_work(&pmu->irq_work, trigger_pmi); + kvm_pmu_cpuid_update(vcpu); +} + +void kvm_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + int i; + + irq_work_sync(&pmu->irq_work); + for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + struct kvm_pmc *pmc = &pmu->gp_counters[i]; + stop_counter(pmc); + pmc->counter = pmc->eventsel = 0; + } + + for (i = 0; i < X86_PMC_MAX_FIXED; i++) + stop_counter(&pmu->fixed_counters[i]); + + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = + pmu->global_ovf_ctrl = 0; +} + +void kvm_pmu_destroy(struct kvm_vcpu *vcpu) +{ + kvm_pmu_reset(vcpu); +} + +void kvm_handle_pmu_event(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + u64 bitmask; + int bit; + + bitmask = pmu->reprogram_pmi; + + for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { + struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit); + + if (unlikely(!pmc || !pmc->perf_event)) { + clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); + continue; + } + + reprogram_idx(pmu, bit); + } +} diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e32243e..5fa553b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1014,6 +1014,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_NMI); set_intercept(svm, INTERCEPT_SMI); set_intercept(svm, INTERCEPT_SELECTIVE_CR0); + set_intercept(svm, INTERCEPT_RDPMC); set_intercept(svm, INTERCEPT_CPUID); set_intercept(svm, INTERCEPT_INVD); set_intercept(svm, INTERCEPT_HLT); @@ -2770,6 +2771,19 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } +static int rdpmc_interception(struct vcpu_svm *svm) +{ + int err; + + if (!static_cpu_has(X86_FEATURE_NRIPS)) + return emulate_on_interception(svm); + + err = kvm_rdpmc(&svm->vcpu); + kvm_complete_insn_gp(&svm->vcpu, err); + + return 1; +} + bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) { unsigned long cr0 = svm->vcpu.arch.cr0; @@ -3190,6 +3204,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_SMI] = nop_on_interception, [SVM_EXIT_INIT] = nop_on_interception, [SVM_EXIT_VINTR] = interrupt_window_interception, + [SVM_EXIT_RDPMC] = rdpmc_interception, [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index ae432ea..6b85cc6 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -18,9 +18,10 @@ #include <linux/atomic.h> #include "kvm_timer.h" -static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) +enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) { - int restart_timer = 0; + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_vcpu *vcpu = ktimer->vcpu; wait_queue_head_t *q = &vcpu->wq; /* @@ -40,26 +41,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) if (ktimer->t_ops->is_periodic(ktimer)) { hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); - restart_timer = 1; - } - - return restart_timer; -} - -enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) -{ - int restart_timer; - struct kvm_vcpu *vcpu; - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - - vcpu = ktimer->vcpu; - if (!vcpu) - return HRTIMER_NORESTART; - - restart_timer = __kvm_timer_fn(vcpu, ktimer); - if (restart_timer) return HRTIMER_RESTART; - else + } else return HRTIMER_NORESTART; } - diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 579a0b5..d29216c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -18,6 +18,7 @@ #include "irq.h" #include "mmu.h" +#include "cpuid.h" #include <linux/kvm_host.h> #include <linux/module.h> @@ -50,29 +51,29 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -static int __read_mostly enable_vpid = 1; +static bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); -static int __read_mostly flexpriority_enabled = 1; +static bool __read_mostly flexpriority_enabled = 1; module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); -static int __read_mostly enable_ept = 1; +static bool __read_mostly enable_ept = 1; module_param_named(ept, enable_ept, bool, S_IRUGO); -static int __read_mostly enable_unrestricted_guest = 1; +static bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, S_IRUGO); -static int __read_mostly emulate_invalid_guest_state = 0; +static bool __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); -static int __read_mostly vmm_exclusive = 1; +static bool __read_mostly vmm_exclusive = 1; module_param(vmm_exclusive, bool, S_IRUGO); -static int __read_mostly yield_on_hlt = 1; +static bool __read_mostly yield_on_hlt = 1; module_param(yield_on_hlt, bool, S_IRUGO); -static int __read_mostly fasteoi = 1; +static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); /* @@ -80,7 +81,7 @@ module_param(fasteoi, bool, S_IRUGO); * VMX and be a hypervisor for its own guests. If nested=0, guests may not * use VMX instructions. */ -static int __read_mostly nested = 0; +static bool __read_mostly nested = 0; module_param(nested, bool, S_IRUGO); #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ @@ -1747,7 +1748,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) int save_nmsrs, index; unsigned long *msr_bitmap; - vmx_load_host_state(vmx); save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { @@ -1956,6 +1956,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) #endif CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | + CPU_BASED_RDPMC_EXITING | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the @@ -2142,12 +2143,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 1; /* Otherwise falls through */ default: - vmx_load_host_state(to_vmx(vcpu)); if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) return 0; msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { - vmx_load_host_state(to_vmx(vcpu)); data = msr->data; break; } @@ -2171,7 +2170,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) switch (msr_index) { case MSR_EFER: - vmx_load_host_state(vmx); ret = kvm_set_msr_common(vcpu, msr_index, data); break; #ifdef CONFIG_X86_64 @@ -2220,7 +2218,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) break; msr = find_msr_entry(vmx, msr_index); if (msr) { - vmx_load_host_state(vmx); msr->data = data; break; } @@ -2414,7 +2411,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING | - CPU_BASED_INVLPG_EXITING; + CPU_BASED_INVLPG_EXITING | + CPU_BASED_RDPMC_EXITING; if (yield_on_hlt) min |= CPU_BASED_HLT_EXITING; @@ -2716,11 +2714,13 @@ static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { struct kvm_memslots *slots; + struct kvm_memory_slot *slot; gfn_t base_gfn; slots = kvm_memslots(kvm); - base_gfn = slots->memslots[0].base_gfn + - kvm->memslots->memslots[0].npages - 3; + slot = id_to_memslot(slots, 0); + base_gfn = slot->base_gfn + slot->npages - 3; + return base_gfn << PAGE_SHIFT; } return kvm->arch.tss_addr; @@ -3945,12 +3945,15 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - /* We can get here when nested_run_pending caused - * vmx_interrupt_allowed() to return false. In this case, do - * nothing - the interrupt will be injected later. + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { + /* + * We get here if vmx_interrupt_allowed() said we can't + * inject to L1 now because L2 must run. Ask L2 to exit + * right after entry, so we can inject to L1 more promptly. */ + kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); return; + } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; @@ -4077,11 +4080,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { - struct vmcs12 *vmcs12; - if (to_vmx(vcpu)->nested.nested_run_pending) + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + if (to_vmx(vcpu)->nested.nested_run_pending || + (vmcs12->idt_vectoring_info_field & + VECTORING_INFO_VALID_MASK)) return 0; nested_vmx_vmexit(vcpu); - vmcs12 = get_vmcs12(vcpu); vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; vmcs12->vm_exit_intr_info = 0; /* fall through to normal code, but now in L1, not L2 */ @@ -4611,6 +4615,16 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) return 1; } +static int handle_rdpmc(struct kvm_vcpu *vcpu) +{ + int err; + + err = kvm_rdpmc(vcpu); + kvm_complete_insn_gp(vcpu, err); + + return 1; +} + static int handle_wbinvd(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); @@ -5561,6 +5575,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_HLT] = handle_halt, [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, + [EXIT_REASON_RDPMC] = handle_rdpmc, [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_VMCLEAR] = handle_vmclear, [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4c938da..14d6cad 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -26,6 +26,7 @@ #include "tss.h" #include "kvm_cache_regs.h" #include "x86.h" +#include "cpuid.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -82,15 +83,13 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU static void update_cr8_intercept(struct kvm_vcpu *vcpu); -static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); static void process_nmi(struct kvm_vcpu *vcpu); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); -int ignore_msrs = 0; -module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); +static bool ignore_msrs = 0; +module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); bool kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); @@ -574,54 +573,6 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 1, 0); - return best && (best->ecx & bit(X86_FEATURE_XSAVE)); -} - -static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_SMEP)); -} - -static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); -} - -static void update_cpuid(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - struct kvm_lapic *apic = vcpu->arch.apic; - - best = kvm_find_cpuid_entry(vcpu, 1, 0); - if (!best) - return; - - /* Update OSXSAVE bit */ - if (cpu_has_xsave && best->function == 0x1) { - best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) - best->ecx |= bit(X86_FEATURE_OSXSAVE); - } - - if (apic) { - if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) - apic->lapic_timer.timer_mode_mask = 3 << 17; - else - apic->lapic_timer.timer_mode_mask = 1 << 17; - } -} - int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); @@ -655,7 +606,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_mmu_reset_context(vcpu); if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) - update_cpuid(vcpu); + kvm_update_cpuid(vcpu); return 0; } @@ -809,6 +760,21 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) } EXPORT_SYMBOL_GPL(kvm_get_dr); +bool kvm_rdpmc(struct kvm_vcpu *vcpu) +{ + u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + u64 data; + int err; + + err = kvm_pmu_read_pmc(vcpu, ecx, &data); + if (err) + return err; + kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); + kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); + return err; +} +EXPORT_SYMBOL_GPL(kvm_rdpmc); + /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. @@ -1358,12 +1324,11 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) if (page_num >= blob_size) goto out; r = -ENOMEM; - page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!page) + page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); + if (IS_ERR(page)) { + r = PTR_ERR(page); goto out; - r = -EFAULT; - if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE)) - goto out_free; + } if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) goto out_free; r = 0; @@ -1652,8 +1617,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) * which we perfectly emulate ;-). Any other value should be at least * reported, some guests depend on them. */ - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: case MSR_K7_EVNTSEL1: case MSR_K7_EVNTSEL2: @@ -1665,8 +1628,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* at least RHEL 4 unconditionally writes to the perfctr registers, * so we ignore writes to make it happy. */ - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: case MSR_K7_PERFCTR0: case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: @@ -1703,6 +1664,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr, data); if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); @@ -1865,10 +1828,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K8_SYSCFG: case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: case MSR_K7_PERFCTR0: case MSR_K8_INT_PENDING_MSG: @@ -1979,6 +1938,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = 0xbe702111; break; default: + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_get_msr(vcpu, msr, pdata); if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; @@ -2037,15 +1998,12 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, if (msrs.nmsrs >= MAX_IO_MSRS) goto out; - r = -ENOMEM; size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; - entries = kmalloc(size, GFP_KERNEL); - if (!entries) + entries = memdup_user(user_msrs->entries, size); + if (IS_ERR(entries)) { + r = PTR_ERR(entries); goto out; - - r = -EFAULT; - if (copy_from_user(entries, user_msrs->entries, size)) - goto out_free; + } r = n = __msr_io(vcpu, &msrs, entries, do_msr); if (r < 0) @@ -2265,466 +2223,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); } -static int is_efer_nx(void) -{ - unsigned long long efer = 0; - - rdmsrl_safe(MSR_EFER, &efer); - return efer & EFER_NX; -} - -static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_cpuid_entry2 *e, *entry; - - entry = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (e->function == 0x80000001) { - entry = e; - break; - } - } - if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { - entry->edx &= ~(1 << 20); - printk(KERN_INFO "kvm: guest NX capability removed\n"); - } -} - -/* when an old userspace process fills a new kernel module */ -static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid *cpuid, - struct kvm_cpuid_entry __user *entries) -{ - int r, i; - struct kvm_cpuid_entry *cpuid_entries; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); - if (!cpuid_entries) - goto out; - r = -EFAULT; - if (copy_from_user(cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry))) - goto out_free; - for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; - vcpu->arch.cpuid_entries[i].padding[2] = 0; - } - vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); - r = 0; - kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); - update_cpuid(vcpu); - -out_free: - vfree(cpuid_entries); -out: - return r; -} - -static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - int r; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -EFAULT; - if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - vcpu->arch.cpuid_nent = cpuid->nent; - kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); - update_cpuid(vcpu); - return 0; - -out: - return r; -} - -static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - int r; - - r = -E2BIG; - if (cpuid->nent < vcpu->arch.cpuid_nent) - goto out; - r = -EFAULT; - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - return 0; - -out: - cpuid->nent = vcpu->arch.cpuid_nent; - return r; -} - -static void cpuid_mask(u32 *word, int wordnum) -{ - *word &= boot_cpu_data.x86_capability[wordnum]; -} - -static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) -{ - entry->function = function; - entry->index = index; - cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - entry->flags = 0; -} - -static bool supported_xcr0_bit(unsigned bit) -{ - u64 mask = ((u64)1 << bit); - - return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; -} - -#define F(x) bit(X86_FEATURE_##x) - -static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) -{ - unsigned f_nx = is_efer_nx() ? F(NX) : 0; -#ifdef CONFIG_X86_64 - unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) - ? F(GBPAGES) : 0; - unsigned f_lm = F(LM); -#else - unsigned f_gbpages = 0; - unsigned f_lm = 0; -#endif - unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; - - /* cpuid 1.edx */ - const u32 kvm_supported_word0_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | - 0 /* Reserved, DS, ACPI */ | F(MMX) | - F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | - 0 /* HTT, TM, Reserved, PBE */; - /* cpuid 0x80000001.edx */ - const u32 kvm_supported_word1_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | - 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); - /* cpuid 1.ecx */ - const u32 kvm_supported_word4_x86_features = - F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | - 0 /* DS-CPL, VMX, SMX, EST */ | - 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | - 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C) | F(RDRAND); - /* cpuid 0x80000001.ecx */ - const u32 kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | - F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); - - /* cpuid 0xC0000001.edx */ - const u32 kvm_supported_word5_x86_features = - F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | - F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | - F(PMM) | F(PMM_EN); - - /* cpuid 7.0.ebx */ - const u32 kvm_supported_word9_x86_features = - F(SMEP) | F(FSGSBASE) | F(ERMS); - - /* all calls to cpuid_count() should be made on the same cpu */ - get_cpu(); - do_cpuid_1_ent(entry, function, index); - ++*nent; - - switch (function) { - case 0: - entry->eax = min(entry->eax, (u32)0xd); - break; - case 1: - entry->edx &= kvm_supported_word0_x86_features; - cpuid_mask(&entry->edx, 0); - entry->ecx &= kvm_supported_word4_x86_features; - cpuid_mask(&entry->ecx, 4); - /* we support x2apic emulation even if host does not support - * it since we emulate x2apic in software */ - entry->ecx |= F(X2APIC); - break; - /* function 2 entries are STATEFUL. That is, repeated cpuid commands - * may return different values. This forces us to get_cpu() before - * issuing the first command, and also to emulate this annoying behavior - * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - case 2: { - int t, times = entry->eax & 0xff; - - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - for (t = 1; t < times && *nent < maxnent; ++t) { - do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - ++*nent; - } - break; - } - /* function 4 has additional index. */ - case 4: { - int i, cache_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until cache_type is zero */ - for (i = 1; *nent < maxnent; ++i) { - cache_type = entry[i - 1].eax & 0x1f; - if (!cache_type) - break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 7: { - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* Mask ebx against host capbability word 9 */ - if (index == 0) { - entry->ebx &= kvm_supported_word9_x86_features; - cpuid_mask(&entry->ebx, 9); - } else - entry->ebx = 0; - entry->eax = 0; - entry->ecx = 0; - entry->edx = 0; - break; - } - case 9: - break; - /* function 0xb has additional index. */ - case 0xb: { - int i, level_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until level_type is zero */ - for (i = 1; *nent < maxnent; ++i) { - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0xd: { - int idx, i; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) { - do_cpuid_1_ent(&entry[i], function, idx); - if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) - continue; - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - ++i; - } - break; - } - case KVM_CPUID_SIGNATURE: { - char signature[12] = "KVMKVMKVM\0\0"; - u32 *sigptr = (u32 *)signature; - entry->eax = 0; - entry->ebx = sigptr[0]; - entry->ecx = sigptr[1]; - entry->edx = sigptr[2]; - break; - } - case KVM_CPUID_FEATURES: - entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | - (1 << KVM_FEATURE_NOP_IO_DELAY) | - (1 << KVM_FEATURE_CLOCKSOURCE2) | - (1 << KVM_FEATURE_ASYNC_PF) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); - - if (sched_info_on()) - entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); - - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; - case 0x80000000: - entry->eax = min(entry->eax, 0x8000001a); - break; - case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - cpuid_mask(&entry->edx, 1); - entry->ecx &= kvm_supported_word6_x86_features; - cpuid_mask(&entry->ecx, 6); - break; - case 0x80000008: { - unsigned g_phys_as = (entry->eax >> 16) & 0xff; - unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); - unsigned phys_as = entry->eax & 0xff; - - if (!g_phys_as) - g_phys_as = phys_as; - entry->eax = g_phys_as | (virt_as << 8); - entry->ebx = entry->edx = 0; - break; - } - case 0x80000019: - entry->ecx = entry->edx = 0; - break; - case 0x8000001a: - break; - case 0x8000001d: - break; - /*Add support for Centaur's CPUID instruction*/ - case 0xC0000000: - /*Just support up to 0xC0000004 now*/ - entry->eax = min(entry->eax, 0xC0000004); - break; - case 0xC0000001: - entry->edx &= kvm_supported_word5_x86_features; - cpuid_mask(&entry->edx, 5); - break; - case 3: /* Processor serial number */ - case 5: /* MONITOR/MWAIT */ - case 6: /* Thermal management */ - case 0xA: /* Architectural Performance Monitoring */ - case 0x80000007: /* Advanced power management */ - case 0xC0000002: - case 0xC0000003: - case 0xC0000004: - default: - entry->eax = entry->ebx = entry->ecx = entry->edx = 0; - break; - } - - kvm_x86_ops->set_supported_cpuid(function, entry); - - put_cpu(); -} - -#undef F - -static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = -E2BIG; - u32 func; - - if (cpuid->nent < 1) - goto out; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - cpuid->nent = KVM_MAX_CPUID_ENTRIES; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); - if (!cpuid_entries) - goto out; - - do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); - limit = cpuid_entries[0].eax; - for (func = 1; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); - limit = cpuid_entries[nent - 1].eax; - for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - - - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - /* Add support for Centaur's CPUID instruction. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) { - do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0, - &nent, cpuid->nent); - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - limit = cpuid_entries[nent - 1].eax; - for (func = 0xC0000001; - func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - } - - do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, - cpuid->nent); - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, - cpuid->nent); - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - r = -EFAULT; - if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) - goto out_free; - cpuid->nent = nent; - r = 0; - -out_free: - vfree(cpuid_entries); -out: - return r; -} - static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { @@ -3042,13 +2540,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!vcpu->arch.apic) goto out; - u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); - r = -ENOMEM; - if (!u.lapic) - goto out; - r = -EFAULT; - if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) + u.lapic = memdup_user(argp, sizeof(*u.lapic)); + if (IS_ERR(u.lapic)) { + r = PTR_ERR(u.lapic); goto out; + } + r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); if (r) goto out; @@ -3227,14 +2724,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); - r = -ENOMEM; - if (!u.xsave) - break; - - r = -EFAULT; - if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) - break; + u.xsave = memdup_user(argp, sizeof(*u.xsave)); + if (IS_ERR(u.xsave)) { + r = PTR_ERR(u.xsave); + goto out; + } r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; @@ -3255,15 +2749,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); - r = -ENOMEM; - if (!u.xcrs) - break; - - r = -EFAULT; - if (copy_from_user(u.xcrs, argp, - sizeof(struct kvm_xcrs))) - break; + u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); + if (IS_ERR(u.xcrs)) { + r = PTR_ERR(u.xcrs); + goto out; + } r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; @@ -3460,16 +2950,59 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, return 0; } +/** + * write_protect_slot - write protect a slot for dirty logging + * @kvm: the kvm instance + * @memslot: the slot we protect + * @dirty_bitmap: the bitmap indicating which pages are dirty + * @nr_dirty_pages: the number of dirty pages + * + * We have two ways to find all sptes to protect: + * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and + * checks ones that have a spte mapping a page in the slot. + * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap. + * + * Generally speaking, if there are not so many dirty pages compared to the + * number of shadow pages, we should use the latter. + * + * Note that letting others write into a page marked dirty in the old bitmap + * by using the remaining tlb entry is not a problem. That page will become + * write protected again when we flush the tlb and then be reported dirty to + * the user space by copying the old bitmap. + */ +static void write_protect_slot(struct kvm *kvm, + struct kvm_memory_slot *memslot, + unsigned long *dirty_bitmap, + unsigned long nr_dirty_pages) +{ + /* Not many dirty pages compared to # of shadow pages. */ + if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { + unsigned long gfn_offset; + + for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { + unsigned long gfn = memslot->base_gfn + gfn_offset; + + spin_lock(&kvm->mmu_lock); + kvm_mmu_rmap_write_protect(kvm, gfn, memslot); + spin_unlock(&kvm->mmu_lock); + } + kvm_flush_remote_tlbs(kvm); + } else { + spin_lock(&kvm->mmu_lock); + kvm_mmu_slot_remove_write_access(kvm, memslot->id); + spin_unlock(&kvm->mmu_lock); + } +} + /* * Get (and clear) the dirty memory log for a memory slot. */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r, i; + int r; struct kvm_memory_slot *memslot; - unsigned long n; - unsigned long is_dirty = 0; + unsigned long n, nr_dirty_pages; mutex_lock(&kvm->slots_lock); @@ -3477,43 +3010,41 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (log->slot >= KVM_MEMORY_SLOTS) goto out; - memslot = &kvm->memslots->memslots[log->slot]; + memslot = id_to_memslot(kvm->memslots, log->slot); r = -ENOENT; if (!memslot->dirty_bitmap) goto out; n = kvm_dirty_bitmap_bytes(memslot); - - for (i = 0; !is_dirty && i < n/sizeof(long); i++) - is_dirty = memslot->dirty_bitmap[i]; + nr_dirty_pages = memslot->nr_dirty_pages; /* If nothing is dirty, don't bother messing with page tables. */ - if (is_dirty) { + if (nr_dirty_pages) { struct kvm_memslots *slots, *old_slots; - unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap, *dirty_bitmap_head; - dirty_bitmap = memslot->dirty_bitmap_head; - if (memslot->dirty_bitmap == dirty_bitmap) - dirty_bitmap += n / sizeof(long); - memset(dirty_bitmap, 0, n); + dirty_bitmap = memslot->dirty_bitmap; + dirty_bitmap_head = memslot->dirty_bitmap_head; + if (dirty_bitmap == dirty_bitmap_head) + dirty_bitmap_head += n / sizeof(long); + memset(dirty_bitmap_head, 0, n); r = -ENOMEM; - slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); if (!slots) goto out; - memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); - slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; - slots->generation++; + + memslot = id_to_memslot(slots, log->slot); + memslot->nr_dirty_pages = 0; + memslot->dirty_bitmap = dirty_bitmap_head; + update_memslots(slots, NULL); old_slots = kvm->memslots; rcu_assign_pointer(kvm->memslots, slots); synchronize_srcu_expedited(&kvm->srcu); - dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); - spin_lock(&kvm->mmu_lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - spin_unlock(&kvm->mmu_lock); + write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); r = -EFAULT; if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) @@ -3658,14 +3189,14 @@ long kvm_arch_vm_ioctl(struct file *filp, } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); + struct kvm_irqchip *chip; - r = -ENOMEM; - if (!chip) + chip = memdup_user(argp, sizeof(*chip)); + if (IS_ERR(chip)) { + r = PTR_ERR(chip); goto out; - r = -EFAULT; - if (copy_from_user(chip, argp, sizeof *chip)) - goto get_irqchip_out; + } + r = -ENXIO; if (!irqchip_in_kernel(kvm)) goto get_irqchip_out; @@ -3684,14 +3215,14 @@ long kvm_arch_vm_ioctl(struct file *filp, } case KVM_SET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); + struct kvm_irqchip *chip; - r = -ENOMEM; - if (!chip) + chip = memdup_user(argp, sizeof(*chip)); + if (IS_ERR(chip)) { + r = PTR_ERR(chip); goto out; - r = -EFAULT; - if (copy_from_user(chip, argp, sizeof *chip)) - goto set_irqchip_out; + } + r = -ENXIO; if (!irqchip_in_kernel(kvm)) goto set_irqchip_out; @@ -3898,12 +3429,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, kvm_x86_ops->get_segment(vcpu, var, seg); } -static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) -{ - return gpa; -} - -static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) { gpa_t t_gpa; struct x86_exception exception; @@ -4087,7 +3613,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); if (ret < 0) return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); + kvm_mmu_pte_write(vcpu, gpa, val, bytes); return 1; } @@ -4324,7 +3850,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; - kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1); + kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -4349,32 +3875,24 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) return r; } - -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) +static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, + unsigned int count, bool in) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - if (vcpu->arch.pio.count) - goto data_avail; - - trace_kvm_pio(0, port, size, count); + trace_kvm_pio(!in, port, size, count); vcpu->arch.pio.port = port; - vcpu->arch.pio.in = 1; + vcpu->arch.pio.in = in; vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - data_avail: - memcpy(val, vcpu->arch.pio_data, size * count); vcpu->arch.pio.count = 0; return 1; } vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = KVM_EXIT_IO_IN; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; @@ -4383,36 +3901,37 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, return 0; } -static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, - const void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int ret; - trace_kvm_pio(1, port, size, count); - - vcpu->arch.pio.port = port; - vcpu->arch.pio.in = 0; - vcpu->arch.pio.count = count; - vcpu->arch.pio.size = size; - - memcpy(vcpu->arch.pio_data, val, size * count); + if (vcpu->arch.pio.count) + goto data_avail; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + ret = emulator_pio_in_out(vcpu, size, port, val, count, true); + if (ret) { +data_avail: + memcpy(val, vcpu->arch.pio_data, size * count); vcpu->arch.pio.count = 0; return 1; } - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = KVM_EXIT_IO_OUT; - vcpu->run->io.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = count; - vcpu->run->io.port = port; - return 0; } +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, + const void *val, unsigned int count) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + memcpy(vcpu->arch.pio_data, val, size * count); + return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); +} + static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_x86_ops->get_segment_base(vcpu, seg); @@ -4627,6 +4146,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); } +static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, + u32 pmc, u64 *pdata) +{ + return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata); +} + static void emulator_halt(struct x86_emulate_ctxt *ctxt) { emul_to_vcpu(ctxt)->arch.halt_request = 1; @@ -4679,6 +4204,7 @@ static struct x86_emulate_ops emulate_ops = { .set_dr = emulator_set_dr, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, + .read_pmc = emulator_read_pmc, .halt = emulator_halt, .wbinvd = emulator_wbinvd, .fix_hypercall = emulator_fix_hypercall, @@ -4836,6 +4362,50 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) return false; } +static bool retry_instruction(struct x86_emulate_ctxt *ctxt, + unsigned long cr2, int emulation_type) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + unsigned long last_retry_eip, last_retry_addr, gpa = cr2; + + last_retry_eip = vcpu->arch.last_retry_eip; + last_retry_addr = vcpu->arch.last_retry_addr; + + /* + * If the emulation is caused by #PF and it is non-page_table + * writing instruction, it means the VM-EXIT is caused by shadow + * page protected, we can zap the shadow page and retry this + * instruction directly. + * + * Note: if the guest uses a non-page-table modifying instruction + * on the PDE that points to the instruction, then we will unmap + * the instruction and go to an infinite loop. So, we cache the + * last retried eip and the last fault address, if we meet the eip + * and the address again, we can break out of the potential infinite + * loop. + */ + vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; + + if (!(emulation_type & EMULTYPE_RETRY)) + return false; + + if (x86_page_table_writing_insn(ctxt)) + return false; + + if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) + return false; + + vcpu->arch.last_retry_eip = ctxt->eip; + vcpu->arch.last_retry_addr = cr2; + + if (!vcpu->arch.mmu.direct_map) + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); + + kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + + return true; +} + int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, @@ -4877,6 +4447,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } + if (retry_instruction(ctxt, cr2, emulation_type)) + return EMULATE_DONE; + /* this is needed for vmware backdoor interface to work since it changes registers values during IO operation */ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { @@ -5095,17 +4668,17 @@ static void kvm_timer_init(void) static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); -static int kvm_is_in_guest(void) +int kvm_is_in_guest(void) { - return percpu_read(current_vcpu) != NULL; + return __this_cpu_read(current_vcpu) != NULL; } static int kvm_is_user_mode(void) { int user_mode = 3; - if (percpu_read(current_vcpu)) - user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); + if (__this_cpu_read(current_vcpu)) + user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); return user_mode != 0; } @@ -5114,8 +4687,8 @@ static unsigned long kvm_get_guest_ip(void) { unsigned long ip = 0; - if (percpu_read(current_vcpu)) - ip = kvm_rip_read(percpu_read(current_vcpu)); + if (__this_cpu_read(current_vcpu)) + ip = kvm_rip_read(__this_cpu_read(current_vcpu)); return ip; } @@ -5128,13 +4701,13 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) { - percpu_write(current_vcpu, vcpu); + __this_cpu_write(current_vcpu, vcpu); } EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) { - percpu_write(current_vcpu, NULL); + __this_cpu_write(current_vcpu, NULL); } EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); @@ -5233,15 +4806,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); -static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, - unsigned long a1) -{ - if (is_long_mode(vcpu)) - return a0; - else - return a0 | ((gpa_t)a1 << 32); -} - int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { u64 param, ingpa, outgpa, ret; @@ -5337,9 +4901,6 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; - case KVM_HC_MMU_OP: - r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); - break; default: ret = -KVM_ENOSYS; break; @@ -5369,125 +4930,6 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); } -static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) -{ - struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; - - e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; - /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; ; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return j; - } - } - return 0; /* silence gcc, even though control never reaches here */ -} - -/* find an entry with matching function, matching index (if needed), and that - * should be read next (if it's stateful) */ -static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, - u32 function, u32 index) -{ - if (e->function != function) - return 0; - if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) - return 0; - if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) - return 0; - return 1; -} - -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index) -{ - int i; - struct kvm_cpuid_entry2 *best = NULL; - - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - struct kvm_cpuid_entry2 *e; - - e = &vcpu->arch.cpuid_entries[i]; - if (is_matching_cpuid_entry(e, function, index)) { - if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) - move_to_next_stateful_cpuid_entry(vcpu, i); - best = e; - break; - } - } - return best; -} -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); - -int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); - if (!best || best->eax < 0x80000008) - goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); - if (best) - return best->eax & 0xff; -not_found: - return 36; -} - -/* - * If no match is found, check whether we exceed the vCPU's limit - * and return the content of the highest valid _standard_ leaf instead. - * This is to satisfy the CPUID specification. - */ -static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, - u32 function, u32 index) -{ - struct kvm_cpuid_entry2 *maxlevel; - - maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); - if (!maxlevel || maxlevel->eax >= function) - return NULL; - if (function & 0x80000000) { - maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); - if (!maxlevel) - return NULL; - } - return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); -} - -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) -{ - u32 function, index; - struct kvm_cpuid_entry2 *best; - - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); - best = kvm_find_cpuid_entry(vcpu, function, index); - - if (!best) - best = check_cpuid_limit(vcpu, function, index); - - if (best) { - kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); - kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); - kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); - } - kvm_x86_ops->skip_emulated_instruction(vcpu); - trace_kvm_cpuid(function, - kvm_register_read(vcpu, VCPU_REGS_RAX), - kvm_register_read(vcpu, VCPU_REGS_RBX), - kvm_register_read(vcpu, VCPU_REGS_RCX), - kvm_register_read(vcpu, VCPU_REGS_RDX)); -} -EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); - /* * Check if userspace requested an interrupt window, and that the * interrupt window is open. @@ -5648,6 +5090,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; + bool req_immediate_exit = 0; if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) @@ -5687,7 +5130,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) record_steal_time(vcpu); if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); - + req_immediate_exit = + kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); + if (kvm_check_request(KVM_REQ_PMU, vcpu)) + kvm_handle_pmu_event(vcpu); + if (kvm_check_request(KVM_REQ_PMI, vcpu)) + kvm_deliver_pmi(vcpu); } r = kvm_mmu_reload(vcpu); @@ -5738,6 +5186,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (req_immediate_exit) + smp_send_reschedule(vcpu->cpu); + kvm_guest_enter(); if (unlikely(vcpu->arch.switch_db_regs)) { @@ -5943,10 +5394,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (r <= 0) goto out; - if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) - kvm_register_write(vcpu, VCPU_REGS_RAX, - kvm_run->hypercall.ret); - r = __vcpu_run(vcpu); out: @@ -6148,7 +5595,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (sregs->cr4 & X86_CR4_OSXSAVE) - update_cpuid(vcpu); + kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); if (!is_long_mode(vcpu) && is_pae(vcpu)) { @@ -6425,6 +5872,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; + kvm_pmu_reset(vcpu); + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -6473,10 +5922,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm = vcpu->kvm; vcpu->arch.emulate_ctxt.ops = &emulate_ops; - vcpu->arch.walk_mmu = &vcpu->arch.mmu; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.translate_gpa = translate_gpa; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else @@ -6513,6 +5958,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; kvm_async_pf_hash_reset(vcpu); + kvm_pmu_init(vcpu); return 0; fail_free_mce_banks: @@ -6531,6 +5977,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { int idx; + kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index d36fe23..cb80c29 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -33,9 +33,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr) return (nr == BP_VECTOR) || (nr == OF_VECTOR); } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); - static inline bool is_protmode(struct kvm_vcpu *vcpu) { return kvm_read_cr0_bits(vcpu, X86_CR0_PE); @@ -125,4 +122,6 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +extern u64 host_xcr0; + #endif diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index cf4603b..642d880 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -856,18 +856,23 @@ static void __init lguest_init_IRQ(void) } /* - * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so - * rather than set them in lguest_init_IRQ we are called here every time an - * lguest device needs an interrupt. - * - * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should - * pass that up! + * Interrupt descriptors are allocated as-needed, but low-numbered ones are + * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it + * tells us the irq is already used: other errors (ie. ENOMEM) we take + * seriously. */ -void lguest_setup_irq(unsigned int irq) +int lguest_setup_irq(unsigned int irq) { - irq_alloc_desc_at(irq, 0); + int err; + + /* Returns -ve error or vector number. */ + err = irq_alloc_desc_at(irq, 0); + if (err < 0 && err != -EEXIST) + return err; + irq_set_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); + return 0; } /* diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5b83c51..8191379 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -219,7 +219,9 @@ ab: STOS/W/D/Q Yv,rAX ac: LODS/B AL,Xb ad: LODS/W/D/Q rAX,Xv ae: SCAS/B AL,Yb -af: SCAS/W/D/Q rAX,Xv +# Note: The May 2011 Intel manual shows Xv for the second parameter of the +# next instruction but Yv is correct +af: SCAS/W/D/Q rAX,Yv # 0xb0 - 0xbf b0: MOV AL/R8L,Ib b1: MOV CL/R9L,Ib @@ -729,8 +731,8 @@ de: VAESDEC Vdq,Hdq,Wdq (66),(v1) df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) -f3: ANDN Gy,By,Ey (v) -f4: Grp17 (1A) +f2: ANDN Gy,By,Ey (v) +f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) f6: MULX By,Gy,rDX,Ey (F2),(v) f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a298914..6cabf65 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -3,6 +3,7 @@ #include <linux/ioport.h> #include <linux/swap.h> #include <linux/memblock.h> +#include <linux/bootmem.h> /* for max_low_pfn */ #include <asm/cacheflush.h> #include <asm/e820.h> @@ -15,6 +16,7 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/proto.h> +#include <asm/dma.h> /* for MAX_DMA_PFN */ unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; @@ -392,3 +394,24 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_init_pages("initrd memory", start, PAGE_ALIGN(end)); } #endif + +void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + +#ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; +#endif +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; +#endif + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; +#endif + + free_area_init_nodes(max_zone_pfns); +} + diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0c1da39..8663f6c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -668,22 +668,6 @@ void __init initmem_init(void) } #endif /* !CONFIG_NEED_MULTIPLE_NODES */ -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); -#ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; -#endif - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; -#endif - - free_area_init_nodes(max_zone_pfns); -} - void __init setup_bootmem_allocator(void) { printk(KERN_INFO " mapped low ram: 0 - %08lx\n", @@ -754,6 +738,17 @@ void __init mem_init(void) #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif + /* + * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to + * be done before free_all_bootmem(). Memblock use free low memory for + * temporary data (see find_range_array()) and for this purpose can use + * pages that was already passed to the buddy allocator, hence marked as + * not accessible in the page tables when compiled with + * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not + * important here. + */ + set_highmem_pages_init(); + /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); @@ -765,8 +760,6 @@ void __init mem_init(void) if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; - set_highmem_pages_init(); - codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a8a56ce..436a030 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -614,15 +614,6 @@ void __init initmem_init(void) void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); -#ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; -#endif - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = max_pfn; - sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); @@ -634,7 +625,7 @@ void __init paging_init(void) */ node_clear_state(0, N_NORMAL_MEMORY); - free_area_init_nodes(max_zone_pfns); + zone_sizes_init(); } /* diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 4b5ba85..845df68 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -75,9 +75,9 @@ static unsigned long mmap_rnd(void) */ if (current->flags & PF_RANDOMIZE) { if (mmap_is_ia32()) - rnd = (long)get_random_int() % (1<<8); + rnd = get_random_int() % (1<<8); else - rnd = (long)(get_random_int() % (1<<28)); + rnd = get_random_int() % (1<<28); } return rnd << PAGE_SHIFT; } diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index de54b9b..dc0b727 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -75,8 +75,8 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ /* module parameters */ static unsigned long filter_offset; -static int nommiotrace; -static int trace_pc; +static bool nommiotrace; +static bool trace_pc; module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 496f494..19d3fa0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu) * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * - * Note: node_to_cpumask() is not valid until after this is done. + * Note: cpumask_of_node() is not valid until after this is done. * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) */ void __init setup_node_to_cpumask_map(void) @@ -422,8 +422,9 @@ static int __init numa_alloc_distance(void) * calls are ignored until the distance table is reset with * numa_reset_distance(). * - * If @from or @to is higher than the highest known node at the time of - * table creation or @distance doesn't make sense, the call is ignored. + * If @from or @to is higher than the highest known node or lower than zero + * at the time of table creation or @distance doesn't make sense, the call + * is ignored. * This is to allow simplification of specific NUMA config implementations. */ void __init numa_set_distance(int from, int to, int distance) @@ -431,8 +432,9 @@ void __init numa_set_distance(int from, int to, int distance) if (!numa_distance && numa_alloc_distance() < 0) return; - if (from >= numa_distance_cnt || to >= numa_distance_cnt) { - printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", + if (from >= numa_distance_cnt || to >= numa_distance_cnt || + from < 0 || to < 0) { + pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n", from, to, distance); return; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index eda2acb..e1ebde3 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1334,12 +1334,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable) } /* - * If page allocator is not up yet then do not call c_p_a(): - */ - if (!debug_pagealloc_enabled) - return; - - /* * The return value is ignored as the calls cannot fail. * Large pages for identity mappings are not used at boot time * and hence no memory allocations during large page split. diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index fd61b3f..1c1c4f4 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -109,6 +109,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) return; pxm = pa->proximity_domain_lo; + if (acpi_srat_revision >= 2) + pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); @@ -160,6 +162,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; + if (acpi_srat_revision <= 1) + pxm &= 0xff; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 6b8759f..e76e18c 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -15,11 +15,12 @@ obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-$(CONFIG_X86_MRST) += mrst.o +obj-$(CONFIG_X86_INTEL_MID) += mrst.o obj-y += common.o early.o -obj-y += amd_bus.o bus_numa.o +obj-y += bus_numa.o +obj-$(CONFIG_AMD_NB) += amd_bus.o obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o ifeq ($(CONFIG_PCI_DEBUG),y) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 404f21a..a312e76 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -12,7 +12,7 @@ struct pci_root_info { char *name; unsigned int res_num; struct resource *res; - struct pci_bus *bus; + struct list_head *resources; int busnum; }; @@ -24,6 +24,12 @@ static int __init set_use_crs(const struct dmi_system_id *id) return 0; } +static int __init set_nouse_crs(const struct dmi_system_id *id) +{ + pci_use_crs = false; + return 0; +} + static const struct dmi_system_id pci_use_crs_table[] __initconst = { /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */ { @@ -54,6 +60,29 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), }, }, + + /* Now for the blacklist.. */ + + /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */ + { + .callback = set_nouse_crs, + .ident = "Dell Studio 1557", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1557"), + DMI_MATCH(DMI_BIOS_VERSION, "A09"), + }, + }, + /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */ + { + .callback = set_nouse_crs, + .ident = "Thinkpad SL510", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_BOARD_NAME, "2847DFG"), + DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"), + }, + }, {} }; @@ -149,7 +178,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) struct acpi_resource_address64 addr; acpi_status status; unsigned long flags; - u64 start, end; + u64 start, orig_end, end; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -165,7 +194,21 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; start = addr.minimum + addr.translation_offset; - end = addr.maximum + addr.translation_offset; + orig_end = end = addr.maximum + addr.translation_offset; + + /* Exclude non-addressable range or non-addressable portion of range */ + end = min(end, (u64)iomem_resource.end); + if (end <= start) { + dev_info(&info->bridge->dev, + "host bridge window [%#llx-%#llx] " + "(ignored, not CPU addressable)\n", start, orig_end); + return AE_OK; + } else if (orig_end != end) { + dev_info(&info->bridge->dev, + "host bridge window [%#llx-%#llx] " + "([%#llx-%#llx] ignored, not CPU addressable)\n", + start, orig_end, end + 1, orig_end); + } res = &info->res[info->res_num]; res->name = info->name; @@ -261,23 +304,20 @@ static void add_resources(struct pci_root_info *info) "ignoring host bridge window %pR (conflicts with %s %pR)\n", res, conflict->name, conflict); else - pci_bus_add_resource(info->bus, res, 0); + pci_add_resource(info->resources, res); } } static void get_current_resources(struct acpi_device *device, int busnum, - int domain, struct pci_bus *bus) + int domain, struct list_head *resources) { struct pci_root_info info; size_t size; - if (pci_use_crs) - pci_bus_remove_resources(bus); - info.bridge = device; - info.bus = bus; info.res_num = 0; + info.resources = resources; acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, &info); if (!info.res_num) @@ -286,7 +326,7 @@ get_current_resources(struct acpi_device *device, int busnum, size = sizeof(*info.res) * info.res_num; info.res = kmalloc(size, GFP_KERNEL); if (!info.res) - goto res_alloc_fail; + return; info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum); if (!info.name) @@ -301,8 +341,6 @@ get_current_resources(struct acpi_device *device, int busnum, name_alloc_fail: kfree(info.res); -res_alloc_fail: - return; } struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) @@ -310,6 +348,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) struct acpi_device *device = root->device; int domain = root->segment; int busnum = root->secondary.start; + LIST_HEAD(resources); struct pci_bus *bus; struct pci_sysdata *sd; int node; @@ -364,11 +403,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(sd); } else { - bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); - if (bus) { - get_current_resources(device, busnum, domain, bus); + get_current_resources(device, busnum, domain, &resources); + if (list_empty(&resources)) + x86_pci_root_bus_resources(busnum, &resources); + bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, + &resources); + if (bus) bus->subordinate = pci_scan_child_bus(bus); - } + else + pci_free_resource_list(&resources); } /* After the PCI-E bus has been walked and all devices discovered, diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 026e493..0567df3 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -30,34 +30,6 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = { { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, }; -static u64 __initdata fam10h_mmconf_start; -static u64 __initdata fam10h_mmconf_end; -static void __init get_pci_mmcfg_amd_fam10h_range(void) -{ - u32 address; - u64 base, msr; - unsigned segn_busn_bits; - - /* assume all cpus from fam10h have mmconf */ - if (boot_cpu_data.x86 < 0x10) - return; - - address = MSR_FAM10H_MMIO_CONF_BASE; - rdmsrl(address, msr); - - /* mmconfig is not enable */ - if (!(msr & FAM10H_MMIO_CONF_ENABLE)) - return; - - base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT); - - segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & - FAM10H_MMIO_CONF_BUSRANGE_MASK; - - fam10h_mmconf_start = base; - fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; -} - #define RANGE_NUM 16 /** @@ -85,6 +57,9 @@ static int __init early_fill_mp_bus_info(void) u64 val; u32 address; bool found; + struct resource fam10h_mmconf_res, *fam10h_mmconf; + u64 fam10h_mmconf_start; + u64 fam10h_mmconf_end; if (!early_pci_allowed()) return -1; @@ -211,12 +186,17 @@ static int __init early_fill_mp_bus_info(void) subtract_range(range, RANGE_NUM, 0, end); /* get mmconfig */ - get_pci_mmcfg_amd_fam10h_range(); + fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res); /* need to take out mmconf range */ - if (fam10h_mmconf_end) { - printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); + if (fam10h_mmconf) { + printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf); + fam10h_mmconf_start = fam10h_mmconf->start; + fam10h_mmconf_end = fam10h_mmconf->end; subtract_range(range, RANGE_NUM, fam10h_mmconf_start, fam10h_mmconf_end + 1); + } else { + fam10h_mmconf_start = 0; + fam10h_mmconf_end = 0; } /* mmio resource */ @@ -403,7 +383,6 @@ static void __init pci_enable_pci_io_ecs(void) ++n; } } - pr_info("Extended Config Space enabled on %u nodes\n", n); #endif } diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index ab8269b..f3a7c56 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c @@ -15,10 +15,11 @@ #include <linux/pci.h> #include <linux/init.h> #include <asm/pci_x86.h> +#include <asm/pci-direct.h> #include "bus_numa.h" -static void __devinit cnb20le_res(struct pci_dev *dev) +static void __init cnb20le_res(u8 bus, u8 slot, u8 func) { struct pci_root_info *info; struct resource res; @@ -26,21 +27,12 @@ static void __devinit cnb20le_res(struct pci_dev *dev) u8 fbus, lbus; int i; -#ifdef CONFIG_ACPI - /* - * We should get host bridge information from ACPI unless the BIOS - * doesn't support it. - */ - if (acpi_os_get_root_pointer()) - return; -#endif - info = &pci_root_info[pci_root_num]; pci_root_num++; /* read the PCI bus numbers */ - pci_read_config_byte(dev, 0x44, &fbus); - pci_read_config_byte(dev, 0x45, &lbus); + fbus = read_pci_config_byte(bus, slot, func, 0x44); + lbus = read_pci_config_byte(bus, slot, func, 0x45); info->bus_min = fbus; info->bus_max = lbus; @@ -59,8 +51,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev) } /* read the non-prefetchable memory window */ - pci_read_config_word(dev, 0xc0, &word1); - pci_read_config_word(dev, 0xc2, &word2); + word1 = read_pci_config_16(bus, slot, func, 0xc0); + word2 = read_pci_config_16(bus, slot, func, 0xc2); if (word1 != word2) { res.start = (word1 << 16) | 0x0000; res.end = (word2 << 16) | 0xffff; @@ -69,8 +61,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev) } /* read the prefetchable memory window */ - pci_read_config_word(dev, 0xc4, &word1); - pci_read_config_word(dev, 0xc6, &word2); + word1 = read_pci_config_16(bus, slot, func, 0xc4); + word2 = read_pci_config_16(bus, slot, func, 0xc6); if (word1 != word2) { res.start = (word1 << 16) | 0x0000; res.end = (word2 << 16) | 0xffff; @@ -79,8 +71,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev) } /* read the IO port window */ - pci_read_config_word(dev, 0xd0, &word1); - pci_read_config_word(dev, 0xd2, &word2); + word1 = read_pci_config_16(bus, slot, func, 0xd0); + word2 = read_pci_config_16(bus, slot, func, 0xd2); if (word1 != word2) { res.start = word1; res.end = word2; @@ -92,13 +84,37 @@ static void __devinit cnb20le_res(struct pci_dev *dev) res.start = fbus; res.end = lbus; res.flags = IORESOURCE_BUS; - dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n", - pci_domain_nr(dev->bus), &res); + printk(KERN_INFO "CNB20LE PCI Host Bridge (domain 0000 %pR)\n", &res); for (i = 0; i < info->res_num; i++) - dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]); + printk(KERN_INFO "host bridge window %pR\n", &info->res[i]); } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE, - cnb20le_res); +static int __init broadcom_postcore_init(void) +{ + u8 bus = 0, slot = 0; + u32 id; + u16 vendor, device; + +#ifdef CONFIG_ACPI + /* + * We should get host bridge information from ACPI unless the BIOS + * doesn't support it. + */ + if (acpi_os_get_root_pointer()) + return 0; +#endif + + id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); + vendor = id & 0xffff; + device = (id >> 16) & 0xffff; + + if (vendor == PCI_VENDOR_ID_SERVERWORKS && + device == PCI_DEVICE_ID_SERVERWORKS_LE) { + cnb20le_res(bus, slot, 0); + cnb20le_res(bus, slot, 1); + } + return 0; +} +postcore_initcall(broadcom_postcore_init); diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index 64a1228..fd3f655 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -7,45 +7,50 @@ int pci_root_num; struct pci_root_info pci_root_info[PCI_ROOT_NR]; -void x86_pci_root_bus_res_quirks(struct pci_bus *b) +void x86_pci_root_bus_resources(int bus, struct list_head *resources) { int i; int j; struct pci_root_info *info; - /* don't go for it if _CRS is used already */ - if (b->resource[0] != &ioport_resource || - b->resource[1] != &iomem_resource) - return; - if (!pci_root_num) - return; + goto default_resources; for (i = 0; i < pci_root_num; i++) { - if (pci_root_info[i].bus_min == b->number) + if (pci_root_info[i].bus_min == bus) break; } if (i == pci_root_num) - return; + goto default_resources; - printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", - b->number); + printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n", + bus); - pci_bus_remove_resources(b); info = &pci_root_info[i]; for (j = 0; j < info->res_num; j++) { struct resource *res; struct resource *root; res = &info->res[j]; - pci_bus_add_resource(b, res, 0); + pci_add_resource(resources, res); if (res->flags & IORESOURCE_IO) root = &ioport_resource; else root = &iomem_resource; insert_resource(root, res); } + return; + +default_resources: + /* + * We don't have any host bridge aperture information from the + * "native host bridge drivers," e.g., amd_bus or broadcom_bus, + * so fall back to the defaults historically used by pci_create_bus(). + */ + printk(KERN_DEBUG "PCI: root bus %02x: using default resources\n", bus); + pci_add_resource(resources, &ioport_resource); + pci_add_resource(resources, &iomem_resource); } void __devinit update_res(struct pci_root_info *info, resource_size_t start, diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 7962ccb..323481e 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -164,9 +164,6 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b) { struct pci_dev *dev; - /* root bus? */ - if (!b->parent) - x86_pci_root_bus_res_quirks(b); pci_read_bridge_bases(b); list_for_each_entry(dev, &b->devices, bus_list) pcibios_fixup_device_resources(dev); @@ -433,6 +430,7 @@ void __init dmi_check_pciprobe(void) struct pci_bus * __devinit pcibios_scan_root(int busnum) { + LIST_HEAD(resources); struct pci_bus *bus = NULL; struct pci_sysdata *sd; @@ -456,9 +454,12 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) sd->node = get_mp_bus_to_node(busnum); printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); - bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); - if (!bus) + x86_pci_root_bus_resources(busnum, &resources); + bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); + if (!bus) { + pci_free_resource_list(&resources); kfree(sd); + } return bus; } @@ -639,6 +640,7 @@ int pci_ext_cfg_avail(struct pci_dev *dev) struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) { + LIST_HEAD(resources); struct pci_bus *bus = NULL; struct pci_sysdata *sd; @@ -653,9 +655,12 @@ struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, return NULL; } sd->node = node; - bus = pci_scan_bus(busno, ops, sd); - if (!bus) + x86_pci_root_bus_resources(busno, &resources); + bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources); + if (!bus) { + pci_free_resource_list(&resources); kfree(sd); + } return bus; } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 794b092..91821a1 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -254,26 +254,6 @@ void __init pcibios_resource_survey(void) */ fs_initcall(pcibios_assign_resources); -/* - * If we set up a device for bus mastering, we need to check the latency - * timer as certain crappy BIOSes forget to set it properly. - */ -unsigned int pcibios_max_latency = 255; - -void pcibios_set_master(struct pci_dev *dev) -{ - u8 lat; - pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat); - if (lat < 16) - lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency; - else if (lat > pcibios_max_latency) - lat = pcibios_max_latency; - else - return; - dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat); - pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); -} - static const struct vm_operations_struct pci_mmap_ops = { .access = generic_access_phys, }; diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 2c2aeab..a1df191 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -31,9 +31,6 @@ int __init pci_legacy_init(void) printk("PCI: Probing PCI hardware\n"); pci_root_bus = pcibios_scan_root(0); - if (pci_root_bus) - pci_bus_add_devices(pci_root_bus); - return 0; } diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 51abf02..83e125b 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -153,8 +153,6 @@ int __init pci_numaq_init(void) raw_pci_ops = &pci_direct_conf1_mq; pci_root_bus = pcibios_scan_root(0); - if (pci_root_bus) - pci_bus_add_devices(pci_root_bus); if (num_online_nodes() > 1) for_each_online_node(quad) { if (quad == 0) diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index db0e9a5..da8fe05 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -44,7 +44,7 @@ static inline void set_bios_x(void) pcibios_enabled = 1; set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT); if (__supported_pte_mask & _PAGE_NX) - printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n"); + printk(KERN_INFO "PCI : PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n"); } /* diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index ca19736..dc5f1d3 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -27,7 +27,7 @@ #include <asm/geode.h> -static int force = 0; +static bool force = 0; module_param(force, bool, 0444); /* FIXME: Award bios is not automatically detected as Alix platform */ MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform"); diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index 1ba7f5e..5917eb5 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -42,7 +42,7 @@ MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>"); MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); -static int force; +static bool force; module_param(force, bool, 0); MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation."); diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index 1ea3877..7baed51 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_X86_MRST) += mrst.o -obj-$(CONFIG_X86_MRST) += vrtc.o -obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o +obj-$(CONFIG_X86_INTEL_MID) += mrst.o +obj-$(CONFIG_X86_INTEL_MID) += vrtc.o +obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_mrst.o obj-$(CONFIG_X86_MRST) += pmu.o diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c index 25bfdbb..3c6e328 100644 --- a/arch/x86/platform/mrst/early_printk_mrst.c +++ b/arch/x86/platform/mrst/early_printk_mrst.c @@ -245,16 +245,24 @@ struct console early_mrst_console = { * Following is the early console based on Medfield HSU (High * Speed UART) device. */ -#define HSU_PORT2_PADDR 0xffa28180 +#define HSU_PORT_BASE 0xffa28080 static void __iomem *phsu; -void hsu_early_console_init(void) +void hsu_early_console_init(const char *s) { + unsigned long paddr, port = 0; u8 lcr; - phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - HSU_PORT2_PADDR); + /* + * Select the early HSU console port if specified by user in the + * kernel command line. + */ + if (*s && !kstrtoul(s, 10, &port)) + port = clamp_val(port, 0, 2); + + paddr = HSU_PORT_BASE + port * 0x80; + phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr); /* Disable FIFO */ writeb(0x0, phsu + UART_FCR); diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index ad4ec1c..475e2cd 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -848,8 +848,7 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry) if (mrst_has_msic()) return; - /* ID as IRQ is a hack that will go away */ - pdev = platform_device_alloc(entry->name, entry->irq); + pdev = platform_device_alloc(entry->name, 0); if (pdev == NULL) { pr_err("out of memory for SFI platform device '%s'.\n", entry->name); @@ -1030,6 +1029,7 @@ static int __init pb_keys_init(void) num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); for (i = 0; i < num; i++) { gb[i].gpio = get_gpio_by_name(gb[i].desc); + pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio); if (gb[i].gpio == -1) continue; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 5b55219..9be4cff 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -157,13 +157,14 @@ static int __init uvhub_to_first_apicid(int uvhub) * clear of the Timeout bit (as well) will free the resource. No reply will * be sent (the hardware will only do one reply per message). */ -static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp) +static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) { unsigned long dw; struct bau_pq_entry *msg; msg = mdp->msg; - if (!msg->canceled) { + if (!msg->canceled && do_acknowledge) { dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; write_mmr_sw_ack(dw); } @@ -212,8 +213,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp, if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { unsigned long mr; /* - * is the resource timed out? - * make everyone ignore the cancelled message. + * Is the resource timed out? + * Make everyone ignore the cancelled message. */ msg2->canceled = 1; stat->d_canceled++; @@ -231,8 +232,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp, * Do all the things a cpu should do for a TLB shootdown message. * Other cpu's may come here at the same time for this message. */ -static void bau_process_message(struct msg_desc *mdp, - struct bau_control *bcp) +static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) { short socket_ack_count = 0; short *sp; @@ -284,8 +285,9 @@ static void bau_process_message(struct msg_desc *mdp, if (msg_ack_count == bcp->cpus_in_uvhub) { /* * All cpus in uvhub saw it; reply + * (unless we are in the UV2 workaround) */ - reply_to_message(mdp, bcp); + reply_to_message(mdp, bcp, do_acknowledge); } } @@ -491,27 +493,138 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, /* * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. */ -static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu) +static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) { unsigned long descriptor_status; unsigned long descriptor_status2; descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); - descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL; + descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; descriptor_status = (descriptor_status << 1) | descriptor_status2; return descriptor_status; } +/* + * Return whether the status of the descriptor that is normally used for this + * cpu (the one indexed by its hub-relative cpu number) is busy. + * The status of the original 32 descriptors is always reflected in the 64 + * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0. + * The bit provided by the activation_status_2 register is irrelevant to + * the status if it is only being tested for busy or not busy. + */ +int normal_busy(struct bau_control *bcp) +{ + int cpu = bcp->uvhub_cpu; + int mmr_offset; + int right_shift; + + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + right_shift = cpu * UV_ACT_STATUS_SIZE; + return (((((read_lmmr(mmr_offset) >> right_shift) & + UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY); +} + +/* + * Entered when a bau descriptor has gone into a permanent busy wait because + * of a hardware bug. + * Workaround the bug. + */ +int handle_uv2_busy(struct bau_control *bcp) +{ + int busy_one = bcp->using_desc; + int normal = bcp->uvhub_cpu; + int selected = -1; + int i; + unsigned long descriptor_status; + unsigned long status; + int mmr_offset; + struct bau_desc *bau_desc_old; + struct bau_desc *bau_desc_new; + struct bau_control *hmaster = bcp->uvhub_master; + struct ptc_stats *stat = bcp->statp; + cycles_t ttm; + + stat->s_uv2_wars++; + spin_lock(&hmaster->uvhub_lock); + /* try for the original first */ + if (busy_one != normal) { + if (!normal_busy(bcp)) + selected = normal; + } + if (selected < 0) { + /* can't use the normal, select an alternate */ + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + descriptor_status = read_lmmr(mmr_offset); + + /* scan available descriptors 32-63 */ + for (i = 0; i < UV_CPUS_PER_AS; i++) { + if ((hmaster->inuse_map & (1 << i)) == 0) { + status = ((descriptor_status >> + (i * UV_ACT_STATUS_SIZE)) & + UV_ACT_STATUS_MASK) << 1; + if (status != UV2H_DESC_BUSY) { + selected = i + UV_CPUS_PER_AS; + break; + } + } + } + } + + if (busy_one != normal) + /* mark the busy alternate as not in-use */ + hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); + + if (selected >= 0) { + /* switch to the selected descriptor */ + if (selected != normal) { + /* set the selected alternate as in-use */ + hmaster->inuse_map |= + (1 << (selected - UV_CPUS_PER_AS)); + if (selected > stat->s_uv2_wars_hw) + stat->s_uv2_wars_hw = selected; + } + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * busy_one); + bcp->using_desc = selected; + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * selected); + *bau_desc_new = *bau_desc_old; + } else { + /* + * All are busy. Wait for the normal one for this cpu to + * free up. + */ + stat->s_uv2_war_waits++; + spin_unlock(&hmaster->uvhub_lock); + ttm = get_cycles(); + do { + cpu_relax(); + } while (normal_busy(bcp)); + spin_lock(&hmaster->uvhub_lock); + /* switch to the original descriptor */ + bcp->using_desc = normal; + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); + bcp->using_desc = (ITEMS_PER_DESC * normal); + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * normal); + *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ + } + spin_unlock(&hmaster->uvhub_lock); + return FLUSH_RETRY_BUSYBUG; +} + static int uv2_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift, struct bau_control *bcp, long try) { unsigned long descriptor_stat; cycles_t ttm; - int cpu = bcp->uvhub_cpu; + int desc = bcp->using_desc; + long busy_reps = 0; struct ptc_stats *stat = bcp->statp; - descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc); /* spin on the status MMR, waiting for it to go idle */ while (descriptor_stat != UV2H_DESC_IDLE) { @@ -522,32 +635,35 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, * our message and its state will stay IDLE. */ if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || - (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) || (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { stat->s_stimeout++; return FLUSH_GIVEUP; + } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) { + stat->s_strongnacks++; + bcp->conseccompletes = 0; + return FLUSH_GIVEUP; } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { stat->s_dtimeout++; - ttm = get_cycles(); - /* - * Our retries may be blocked by all destination - * swack resources being consumed, and a timeout - * pending. In that case hardware returns the - * ERROR that looks like a destination timeout. - */ - if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { - bcp->conseccompletes = 0; - return FLUSH_RETRY_PLUGGED; - } bcp->conseccompletes = 0; return FLUSH_RETRY_TIMEOUT; } else { + busy_reps++; + if (busy_reps > 1000000) { + /* not to hammer on the clock */ + busy_reps = 0; + ttm = get_cycles(); + if ((ttm - bcp->send_message) > + (bcp->clocks_per_100_usec)) { + return handle_uv2_busy(bcp); + } + } /* * descriptor_stat is still BUSY */ cpu_relax(); } - descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + descriptor_stat = uv2_read_status(mmr_offset, right_shift, + desc); } bcp->conseccompletes++; return FLUSH_COMPLETE; @@ -563,17 +679,17 @@ static int wait_completion(struct bau_desc *bau_desc, { int right_shift; unsigned long mmr_offset; - int cpu = bcp->uvhub_cpu; + int desc = bcp->using_desc; - if (cpu < UV_CPUS_PER_AS) { + if (desc < UV_CPUS_PER_AS) { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; + right_shift = desc * UV_ACT_STATUS_SIZE; } else { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); + right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); } - if (is_uv1_hub()) + if (bcp->uvhub_version == 1) return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try); else @@ -752,19 +868,22 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. */ -int uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, struct bau_control *bcp) +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) { int seq_number = 0; int completion_stat = 0; + int uv1 = 0; long try = 0; unsigned long index; cycles_t time1; cycles_t time2; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster = bcp->uvhub_master; + struct uv1_bau_msg_header *uv1_hdr = NULL; + struct uv2_bau_msg_header *uv2_hdr = NULL; + struct bau_desc *bau_desc; - if (is_uv1_hub()) + if (bcp->uvhub_version == 1) uv1_throttle(hmaster, stat); while (hmaster->uvhub_quiesce) @@ -772,22 +891,39 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, time1 = get_cycles(); do { - if (try == 0) { - bau_desc->header.msg_type = MSG_REGULAR; + bau_desc = bcp->descriptor_base; + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); + if (bcp->uvhub_version == 1) { + uv1 = 1; + uv1_hdr = &bau_desc->header.uv1_hdr; + } else + uv2_hdr = &bau_desc->header.uv2_hdr; + if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { + if (uv1) + uv1_hdr->msg_type = MSG_REGULAR; + else + uv2_hdr->msg_type = MSG_REGULAR; seq_number = bcp->message_number++; } else { - bau_desc->header.msg_type = MSG_RETRY; + if (uv1) + uv1_hdr->msg_type = MSG_RETRY; + else + uv2_hdr->msg_type = MSG_RETRY; stat->s_retry_messages++; } - bau_desc->header.sequence = seq_number; - index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; + if (uv1) + uv1_hdr->sequence = seq_number; + else + uv2_hdr->sequence = seq_number; + index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; bcp->send_message = get_cycles(); write_mmr_activation(index); try++; completion_stat = wait_completion(bau_desc, bcp, try); + /* UV2: wait_completion() may change the bcp->using_desc */ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); @@ -798,6 +934,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, } cpu_relax(); } while ((completion_stat == FLUSH_RETRY_PLUGGED) || + (completion_stat == FLUSH_RETRY_BUSYBUG) || (completion_stat == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); @@ -812,6 +949,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, record_send_stats(time1, time2, bcp, stat, completion_stat, try); if (completion_stat == FLUSH_GIVEUP) + /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */ return 1; return 0; } @@ -967,7 +1105,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, stat->s_ntargself++; bau_desc = bcp->descriptor_base; - bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu; + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) return NULL; @@ -980,13 +1118,86 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * uv_flush_send_and_wait returns 0 if all cpu's were messaged, * or 1 if it gave up and the original cpumask should be returned. */ - if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) + if (!uv_flush_send_and_wait(flush_mask, bcp)) return NULL; else return cpumask; } /* + * Search the message queue for any 'other' message with the same software + * acknowledge resource bit vector. + */ +struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, + struct bau_control *bcp, unsigned char swack_vec) +{ + struct bau_pq_entry *msg_next = msg + 1; + + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + while ((msg_next->swack_vec != 0) && (msg_next != msg)) { + if (msg_next->swack_vec == swack_vec) + return msg_next; + msg_next++; + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + } + return NULL; +} + +/* + * UV2 needs to work around a bug in which an arriving message has not + * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. + * Such a message must be ignored. + */ +void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) +{ + unsigned long mmr_image; + unsigned char swack_vec; + struct bau_pq_entry *msg = mdp->msg; + struct bau_pq_entry *other_msg; + + mmr_image = read_mmr_sw_ack(); + swack_vec = msg->swack_vec; + + if ((swack_vec & mmr_image) == 0) { + /* + * This message was assigned a swack resource, but no + * reserved acknowlegment is pending. + * The bug has prevented this message from setting the MMR. + * And no other message has used the same sw_ack resource. + * Do the requested shootdown but do not reply to the msg. + * (the 0 means make no acknowledge) + */ + bau_process_message(mdp, bcp, 0); + return; + } + + /* + * Some message has set the MMR 'pending' bit; it might have been + * another message. Look for that message. + */ + other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); + if (other_msg) { + /* There is another. Do not ack the current one. */ + bau_process_message(mdp, bcp, 0); + /* + * Let the natural processing of that message acknowledge + * it. Don't get the processing of sw_ack's out of order. + */ + return; + } + + /* + * There is no other message using this sw_ack, so it is safe to + * acknowledge it. + */ + bau_process_message(mdp, bcp, 1); + + return; +} + +/* * The BAU message interrupt comes here. (registered by set_intr_gate) * See entry_64.S * @@ -1009,6 +1220,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) struct ptc_stats *stat; struct msg_desc msgdesc; + ack_APIC_irq(); time_start = get_cycles(); bcp = &per_cpu(bau_control, smp_processor_id()); @@ -1022,9 +1234,11 @@ void uv_bau_message_interrupt(struct pt_regs *regs) count++; msgdesc.msg_slot = msg - msgdesc.queue_first; - msgdesc.swack_slot = ffs(msg->swack_vec) - 1; msgdesc.msg = msg; - bau_process_message(&msgdesc, bcp); + if (bcp->uvhub_version == 2) + process_uv2_message(&msgdesc, bcp); + else + bau_process_message(&msgdesc, bcp, 1); msg++; if (msg > msgdesc.queue_last) @@ -1036,8 +1250,6 @@ void uv_bau_message_interrupt(struct pt_regs *regs) stat->d_nomsg++; else if (count > 1) stat->d_multmsg++; - - ack_APIC_irq(); } /* @@ -1083,7 +1295,7 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { - mmr_image |= (1L << UV2_LEG_SHFT); + mmr_image &= ~(1L << UV2_LEG_SHFT); mmr_image |= (1L << UV2_EXT_SHFT); } write_mmr_misc_control(pnode, mmr_image); @@ -1136,13 +1348,13 @@ static int ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); seq_printf(file, "resetp resett giveup sto bz throt swack recv rtime "); seq_printf(file, "all one mult none retry canc nocan reset rcan "); seq_printf(file, - "disable enable\n"); + "disable enable wars warshw warwaits\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); @@ -1154,10 +1366,10 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->s_ntargremotes, stat->s_ntargcpu, stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, stat->s_ntarguvhub, stat->s_ntarguvhub16); - seq_printf(file, "%ld %ld %ld %ld %ld ", + seq_printf(file, "%ld %ld %ld %ld %ld %ld ", stat->s_ntarguvhub8, stat->s_ntarguvhub4, stat->s_ntarguvhub2, stat->s_ntarguvhub1, - stat->s_dtimeout); + stat->s_dtimeout, stat->s_strongnacks); seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", stat->s_retry_messages, stat->s_retriesok, stat->s_resets_plug, stat->s_resets_timeout, @@ -1173,8 +1385,10 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); - seq_printf(file, "%ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled); + seq_printf(file, "%ld %ld %ld %ld %ld\n", + stat->s_bau_disabled, stat->s_bau_reenabled, + stat->s_uv2_wars, stat->s_uv2_wars_hw, + stat->s_uv2_war_waits); } return 0; } @@ -1432,12 +1646,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) { int i; int cpu; + int uv1 = 0; unsigned long gpa; unsigned long m; unsigned long n; size_t dsize; struct bau_desc *bau_desc; struct bau_desc *bd2; + struct uv1_bau_msg_header *uv1_hdr; + struct uv2_bau_msg_header *uv2_hdr; struct bau_control *bcp; /* @@ -1451,6 +1668,8 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) gpa = uv_gpa(bau_desc); n = uv_gpa_to_gnode(gpa); m = uv_gpa_to_offset(gpa); + if (is_uv1_hub()) + uv1 = 1; /* the 14-bit pnode */ write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); @@ -1461,21 +1680,33 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) */ for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) { memset(bd2, 0, sizeof(struct bau_desc)); - bd2->header.swack_flag = 1; - /* - * The base_dest_nasid set in the message header is the nasid - * of the first uvhub in the partition. The bit map will - * indicate destination pnode numbers relative to that base. - * They may not be consecutive if nasid striding is being used. - */ - bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); - bd2->header.dest_subnodeid = UV_LB_SUBNODEID; - bd2->header.command = UV_NET_ENDPOINT_INTD; - bd2->header.int_both = 1; - /* - * all others need to be set to zero: - * fairness chaining multilevel count replied_to - */ + if (uv1) { + uv1_hdr = &bd2->header.uv1_hdr; + uv1_hdr->swack_flag = 1; + /* + * The base_dest_nasid set in the message header + * is the nasid of the first uvhub in the partition. + * The bit map will indicate destination pnode numbers + * relative to that base. They may not be consecutive + * if nasid striding is being used. + */ + uv1_hdr->base_dest_nasid = + UV_PNODE_TO_NASID(base_pnode); + uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv1_hdr->command = UV_NET_ENDPOINT_INTD; + uv1_hdr->int_both = 1; + /* + * all others need to be set to zero: + * fairness chaining multilevel count replied_to + */ + } else { + uv2_hdr = &bd2->header.uv2_hdr; + uv2_hdr->swack_flag = 1; + uv2_hdr->base_dest_nasid = + UV_PNODE_TO_NASID(base_pnode); + uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv2_hdr->command = UV_NET_ENDPOINT_INTD; + } } for_each_present_cpu(cpu) { if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) @@ -1531,6 +1762,7 @@ static void pq_init(int node, int pnode) write_mmr_payload_first(pnode, pn_first); write_mmr_payload_tail(pnode, first); write_mmr_payload_last(pnode, last); + write_gmmr_sw_ack(pnode, 0xffffUL); /* in effect, all msg_type's are set to MSG_NOOP */ memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); @@ -1584,14 +1816,14 @@ static int calculate_destination_timeout(void) ts_ns = base * mult1 * mult2; ret = ts_ns / 1000; } else { - /* 4 bits 0/1 for 10/80us, 3 bits of multiplier */ - mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); + /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ + mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL); mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; if (mmr_image & (1L << UV2_ACK_UNITS_SHFT)) - mult1 = 80; + base = 80; else - mult1 = 10; - base = mmr_image & UV2_ACK_MASK; + base = 10; + mult1 = mmr_image & UV2_ACK_MASK; ret = mult1 * base; } return ret; @@ -1618,6 +1850,7 @@ static void __init init_per_cpu_tunables(void) bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; bcp->cong_period = congested_period; + bcp->clocks_per_100_usec = usec_2_cycles(100); } } @@ -1728,8 +1961,17 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, bcp->cpus_in_socket = sdp->num_cpus; bcp->socket_master = *smasterp; bcp->uvhub = bdp->uvhub; + if (is_uv1_hub()) + bcp->uvhub_version = 1; + else if (is_uv2_hub()) + bcp->uvhub_version = 2; + else { + printk(KERN_EMERG "uvhub version not 1 or 2\n"); + return 1; + } bcp->uvhub_master = *hmasterp; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->using_desc = bcp->uvhub_cpu; if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu); @@ -1845,6 +2087,8 @@ static int __init uv_bau_init(void) uv_base_pnode = uv_blade_to_pnode(uvhub); } + enable_timeouts(); + if (init_per_cpu(nuvhubs, uv_base_pnode)) { nobau = 1; return 0; @@ -1855,7 +2099,6 @@ static int __init uv_bau_init(void) if (uv_blade_nr_possible_cpus(uvhub)) init_uvhub(uvhub, vector, uv_base_pnode); - enable_timeouts(); alloc_intr_gate(vector, uv_bau_message_intr1); for_each_possible_blade(uvhub) { @@ -1867,7 +2110,8 @@ static int __init uv_bau_init(void) val = 1L << 63; write_gmmr_activation(pnode, val); mmr = 1; /* should be 1 to broadcast to both sockets */ - write_mmr_data_broadcast(pnode, mmr); + if (!is_uv1_hub()) + write_mmr_data_broadcast(pnode, mmr); } } diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c index 309c70f..5d4ba30 100644 --- a/arch/x86/platform/uv/uv_sysfs.c +++ b/arch/x86/platform/uv/uv_sysfs.c @@ -19,7 +19,7 @@ * Copyright (c) Russ Anderson */ -#include <linux/sysdev.h> +#include <linux/device.h> #include <asm/uv/bios.h> #include <asm/uv/uv.h> diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile new file mode 100644 index 0000000..564b247 --- /dev/null +++ b/arch/x86/syscalls/Makefile @@ -0,0 +1,43 @@ +out := $(obj)/../include/generated/asm + +# Create output directory if not already present +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') + +syscall32 := $(srctree)/$(src)/syscall_32.tbl +syscall64 := $(srctree)/$(src)/syscall_64.tbl + +syshdr := $(srctree)/$(src)/syscallhdr.sh +systbl := $(srctree)/$(src)/syscalltbl.sh + +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' $< $@ \ + $(syshdr_abi_$(basetarget)) $(syshdr_pfx_$(basetarget)) +quiet_cmd_systbl = SYSTBL $@ + cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ + +syshdr_abi_unistd_32 := i386 +$(out)/unistd_32.h: $(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_32_ia32 := i386 +syshdr_pfx_unistd_32_ia32 := ia32_ +$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_64 := 64 +$(out)/unistd_64.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +$(out)/syscalls_32.h: $(syscall32) $(systbl) + $(call if_changed,systbl) +$(out)/syscalls_64.h: $(syscall64) $(systbl) + $(call if_changed,systbl) + +syshdr-y += unistd_32.h unistd_64.h +syshdr-y += syscalls_32.h +syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h +syshdr-$(CONFIG_X86_64) += syscalls_64.h + +targets += $(syshdr-y) + +all: $(addprefix $(out)/,$(targets)) diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl new file mode 100644 index 0000000..ce98e28 --- /dev/null +++ b/arch/x86/syscalls/syscall_32.tbl @@ -0,0 +1,357 @@ +# +# 32-bit system call numbers and entry vectors +# +# The format is: +# <number> <abi> <name> <entry point> <compat entry point> +# +# The abi is always "i386" for this file. +# +0 i386 restart_syscall sys_restart_syscall +1 i386 exit sys_exit +2 i386 fork ptregs_fork stub32_fork +3 i386 read sys_read +4 i386 write sys_write +5 i386 open sys_open compat_sys_open +6 i386 close sys_close +7 i386 waitpid sys_waitpid sys32_waitpid +8 i386 creat sys_creat +9 i386 link sys_link +10 i386 unlink sys_unlink +11 i386 execve ptregs_execve stub32_execve +12 i386 chdir sys_chdir +13 i386 time sys_time compat_sys_time +14 i386 mknod sys_mknod +15 i386 chmod sys_chmod +16 i386 lchown sys_lchown16 +17 i386 break +18 i386 oldstat sys_stat +19 i386 lseek sys_lseek sys32_lseek +20 i386 getpid sys_getpid +21 i386 mount sys_mount compat_sys_mount +22 i386 umount sys_oldumount +23 i386 setuid sys_setuid16 +24 i386 getuid sys_getuid16 +25 i386 stime sys_stime compat_sys_stime +26 i386 ptrace sys_ptrace compat_sys_ptrace +27 i386 alarm sys_alarm +28 i386 oldfstat sys_fstat +29 i386 pause sys_pause +30 i386 utime sys_utime compat_sys_utime +31 i386 stty +32 i386 gtty +33 i386 access sys_access +34 i386 nice sys_nice +35 i386 ftime +36 i386 sync sys_sync +37 i386 kill sys_kill sys32_kill +38 i386 rename sys_rename +39 i386 mkdir sys_mkdir +40 i386 rmdir sys_rmdir +41 i386 dup sys_dup +42 i386 pipe sys_pipe +43 i386 times sys_times compat_sys_times +44 i386 prof +45 i386 brk sys_brk +46 i386 setgid sys_setgid16 +47 i386 getgid sys_getgid16 +48 i386 signal sys_signal +49 i386 geteuid sys_geteuid16 +50 i386 getegid sys_getegid16 +51 i386 acct sys_acct +52 i386 umount2 sys_umount +53 i386 lock +54 i386 ioctl sys_ioctl compat_sys_ioctl +55 i386 fcntl sys_fcntl compat_sys_fcntl64 +56 i386 mpx +57 i386 setpgid sys_setpgid +58 i386 ulimit +59 i386 oldolduname sys_olduname +60 i386 umask sys_umask +61 i386 chroot sys_chroot +62 i386 ustat sys_ustat compat_sys_ustat +63 i386 dup2 sys_dup2 +64 i386 getppid sys_getppid +65 i386 getpgrp sys_getpgrp +66 i386 setsid sys_setsid +67 i386 sigaction sys_sigaction sys32_sigaction +68 i386 sgetmask sys_sgetmask +69 i386 ssetmask sys_ssetmask +70 i386 setreuid sys_setreuid16 +71 i386 setregid sys_setregid16 +72 i386 sigsuspend sys_sigsuspend sys32_sigsuspend +73 i386 sigpending sys_sigpending compat_sys_sigpending +74 i386 sethostname sys_sethostname +75 i386 setrlimit sys_setrlimit compat_sys_setrlimit +76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit +77 i386 getrusage sys_getrusage compat_sys_getrusage +78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 i386 settimeofday sys_settimeofday compat_sys_settimeofday +80 i386 getgroups sys_getgroups16 +81 i386 setgroups sys_setgroups16 +82 i386 select sys_old_select compat_sys_old_select +83 i386 symlink sys_symlink +84 i386 oldlstat sys_lstat +85 i386 readlink sys_readlink +86 i386 uselib sys_uselib +87 i386 swapon sys_swapon +88 i386 reboot sys_reboot +89 i386 readdir sys_old_readdir compat_sys_old_readdir +90 i386 mmap sys_old_mmap sys32_mmap +91 i386 munmap sys_munmap +92 i386 truncate sys_truncate +93 i386 ftruncate sys_ftruncate +94 i386 fchmod sys_fchmod +95 i386 fchown sys_fchown16 +96 i386 getpriority sys_getpriority +97 i386 setpriority sys_setpriority +98 i386 profil +99 i386 statfs sys_statfs compat_sys_statfs +100 i386 fstatfs sys_fstatfs compat_sys_fstatfs +101 i386 ioperm sys_ioperm +102 i386 socketcall sys_socketcall compat_sys_socketcall +103 i386 syslog sys_syslog +104 i386 setitimer sys_setitimer compat_sys_setitimer +105 i386 getitimer sys_getitimer compat_sys_getitimer +106 i386 stat sys_newstat compat_sys_newstat +107 i386 lstat sys_newlstat compat_sys_newlstat +108 i386 fstat sys_newfstat compat_sys_newfstat +109 i386 olduname sys_uname +110 i386 iopl ptregs_iopl stub32_iopl +111 i386 vhangup sys_vhangup +112 i386 idle +113 i386 vm86old ptregs_vm86old sys32_vm86_warning +114 i386 wait4 sys_wait4 compat_sys_wait4 +115 i386 swapoff sys_swapoff +116 i386 sysinfo sys_sysinfo compat_sys_sysinfo +117 i386 ipc sys_ipc sys32_ipc +118 i386 fsync sys_fsync +119 i386 sigreturn ptregs_sigreturn stub32_sigreturn +120 i386 clone ptregs_clone stub32_clone +121 i386 setdomainname sys_setdomainname +122 i386 uname sys_newuname +123 i386 modify_ldt sys_modify_ldt +124 i386 adjtimex sys_adjtimex compat_sys_adjtimex +125 i386 mprotect sys_mprotect sys32_mprotect +126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask +127 i386 create_module +128 i386 init_module sys_init_module +129 i386 delete_module sys_delete_module +130 i386 get_kernel_syms +131 i386 quotactl sys_quotactl sys32_quotactl +132 i386 getpgid sys_getpgid +133 i386 fchdir sys_fchdir +134 i386 bdflush sys_bdflush +135 i386 sysfs sys_sysfs +136 i386 personality sys_personality +137 i386 afs_syscall +138 i386 setfsuid sys_setfsuid16 +139 i386 setfsgid sys_setfsgid16 +140 i386 _llseek sys_llseek +141 i386 getdents sys_getdents compat_sys_getdents +142 i386 _newselect sys_select compat_sys_select +143 i386 flock sys_flock +144 i386 msync sys_msync +145 i386 readv sys_readv compat_sys_readv +146 i386 writev sys_writev compat_sys_writev +147 i386 getsid sys_getsid +148 i386 fdatasync sys_fdatasync +149 i386 _sysctl sys_sysctl compat_sys_sysctl +150 i386 mlock sys_mlock +151 i386 munlock sys_munlock +152 i386 mlockall sys_mlockall +153 i386 munlockall sys_munlockall +154 i386 sched_setparam sys_sched_setparam +155 i386 sched_getparam sys_sched_getparam +156 i386 sched_setscheduler sys_sched_setscheduler +157 i386 sched_getscheduler sys_sched_getscheduler +158 i386 sched_yield sys_sched_yield +159 i386 sched_get_priority_max sys_sched_get_priority_max +160 i386 sched_get_priority_min sys_sched_get_priority_min +161 i386 sched_rr_get_interval sys_sched_rr_get_interval sys32_sched_rr_get_interval +162 i386 nanosleep sys_nanosleep compat_sys_nanosleep +163 i386 mremap sys_mremap +164 i386 setresuid sys_setresuid16 +165 i386 getresuid sys_getresuid16 +166 i386 vm86 ptregs_vm86 sys32_vm86_warning +167 i386 query_module +168 i386 poll sys_poll +169 i386 nfsservctl +170 i386 setresgid sys_setresgid16 +171 i386 getresgid sys_getresgid16 +172 i386 prctl sys_prctl +173 i386 rt_sigreturn ptregs_rt_sigreturn stub32_rt_sigreturn +174 i386 rt_sigaction sys_rt_sigaction sys32_rt_sigaction +175 i386 rt_sigprocmask sys_rt_sigprocmask sys32_rt_sigprocmask +176 i386 rt_sigpending sys_rt_sigpending sys32_rt_sigpending +177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo sys32_rt_sigqueueinfo +179 i386 rt_sigsuspend sys_rt_sigsuspend +180 i386 pread64 sys_pread64 sys32_pread +181 i386 pwrite64 sys_pwrite64 sys32_pwrite +182 i386 chown sys_chown16 +183 i386 getcwd sys_getcwd +184 i386 capget sys_capget +185 i386 capset sys_capset +186 i386 sigaltstack ptregs_sigaltstack stub32_sigaltstack +187 i386 sendfile sys_sendfile sys32_sendfile +188 i386 getpmsg +189 i386 putpmsg +190 i386 vfork ptregs_vfork stub32_vfork +191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit +192 i386 mmap2 sys_mmap_pgoff +193 i386 truncate64 sys_truncate64 sys32_truncate64 +194 i386 ftruncate64 sys_ftruncate64 sys32_ftruncate64 +195 i386 stat64 sys_stat64 sys32_stat64 +196 i386 lstat64 sys_lstat64 sys32_lstat64 +197 i386 fstat64 sys_fstat64 sys32_fstat64 +198 i386 lchown32 sys_lchown +199 i386 getuid32 sys_getuid +200 i386 getgid32 sys_getgid +201 i386 geteuid32 sys_geteuid +202 i386 getegid32 sys_getegid +203 i386 setreuid32 sys_setreuid +204 i386 setregid32 sys_setregid +205 i386 getgroups32 sys_getgroups +206 i386 setgroups32 sys_setgroups +207 i386 fchown32 sys_fchown +208 i386 setresuid32 sys_setresuid +209 i386 getresuid32 sys_getresuid +210 i386 setresgid32 sys_setresgid +211 i386 getresgid32 sys_getresgid +212 i386 chown32 sys_chown +213 i386 setuid32 sys_setuid +214 i386 setgid32 sys_setgid +215 i386 setfsuid32 sys_setfsuid +216 i386 setfsgid32 sys_setfsgid +217 i386 pivot_root sys_pivot_root +218 i386 mincore sys_mincore +219 i386 madvise sys_madvise +220 i386 getdents64 sys_getdents64 compat_sys_getdents64 +221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 +# 222 is unused +# 223 is unused +224 i386 gettid sys_gettid +225 i386 readahead sys_readahead sys32_readahead +226 i386 setxattr sys_setxattr +227 i386 lsetxattr sys_lsetxattr +228 i386 fsetxattr sys_fsetxattr +229 i386 getxattr sys_getxattr +230 i386 lgetxattr sys_lgetxattr +231 i386 fgetxattr sys_fgetxattr +232 i386 listxattr sys_listxattr +233 i386 llistxattr sys_llistxattr +234 i386 flistxattr sys_flistxattr +235 i386 removexattr sys_removexattr +236 i386 lremovexattr sys_lremovexattr +237 i386 fremovexattr sys_fremovexattr +238 i386 tkill sys_tkill +239 i386 sendfile64 sys_sendfile64 +240 i386 futex sys_futex compat_sys_futex +241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +243 i386 set_thread_area sys_set_thread_area +244 i386 get_thread_area sys_get_thread_area +245 i386 io_setup sys_io_setup compat_sys_io_setup +246 i386 io_destroy sys_io_destroy +247 i386 io_getevents sys_io_getevents compat_sys_io_getevents +248 i386 io_submit sys_io_submit compat_sys_io_submit +249 i386 io_cancel sys_io_cancel +250 i386 fadvise64 sys_fadvise64 sys32_fadvise64 +# 251 is available for reuse (was briefly sys_set_zone_reclaim) +252 i386 exit_group sys_exit_group +253 i386 lookup_dcookie sys_lookup_dcookie sys32_lookup_dcookie +254 i386 epoll_create sys_epoll_create +255 i386 epoll_ctl sys_epoll_ctl +256 i386 epoll_wait sys_epoll_wait +257 i386 remap_file_pages sys_remap_file_pages +258 i386 set_tid_address sys_set_tid_address +259 i386 timer_create sys_timer_create compat_sys_timer_create +260 i386 timer_settime sys_timer_settime compat_sys_timer_settime +261 i386 timer_gettime sys_timer_gettime compat_sys_timer_gettime +262 i386 timer_getoverrun sys_timer_getoverrun +263 i386 timer_delete sys_timer_delete +264 i386 clock_settime sys_clock_settime compat_sys_clock_settime +265 i386 clock_gettime sys_clock_gettime compat_sys_clock_gettime +266 i386 clock_getres sys_clock_getres compat_sys_clock_getres +267 i386 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +268 i386 statfs64 sys_statfs64 compat_sys_statfs64 +269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +270 i386 tgkill sys_tgkill +271 i386 utimes sys_utimes compat_sys_utimes +272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64 +273 i386 vserver +274 i386 mbind sys_mbind +275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +276 i386 set_mempolicy sys_set_mempolicy +277 i386 mq_open sys_mq_open compat_sys_mq_open +278 i386 mq_unlink sys_mq_unlink +279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend +280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +281 i386 mq_notify sys_mq_notify compat_sys_mq_notify +282 i386 mq_getsetaddr sys_mq_getsetattr compat_sys_mq_getsetattr +283 i386 kexec_load sys_kexec_load compat_sys_kexec_load +284 i386 waitid sys_waitid compat_sys_waitid +# 285 sys_setaltroot +286 i386 add_key sys_add_key +287 i386 request_key sys_request_key +288 i386 keyctl sys_keyctl +289 i386 ioprio_set sys_ioprio_set +290 i386 ioprio_get sys_ioprio_get +291 i386 inotify_init sys_inotify_init +292 i386 inotify_add_watch sys_inotify_add_watch +293 i386 inotify_rm_watch sys_inotify_rm_watch +294 i386 migrate_pages sys_migrate_pages +295 i386 openat sys_openat compat_sys_openat +296 i386 mkdirat sys_mkdirat +297 i386 mknodat sys_mknodat +298 i386 fchownat sys_fchownat +299 i386 futimesat sys_futimesat compat_sys_futimesat +300 i386 fstatat64 sys_fstatat64 sys32_fstatat +301 i386 unlinkat sys_unlinkat +302 i386 renameat sys_renameat +303 i386 linkat sys_linkat +304 i386 symlinkat sys_symlinkat +305 i386 readlinkat sys_readlinkat +306 i386 fchmodat sys_fchmodat +307 i386 faccessat sys_faccessat +308 i386 pselect6 sys_pselect6 compat_sys_pselect6 +309 i386 ppoll sys_ppoll compat_sys_ppoll +310 i386 unshare sys_unshare +311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list +312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list +313 i386 splice sys_splice +314 i386 sync_file_range sys_sync_file_range sys32_sync_file_range +315 i386 tee sys_tee +316 i386 vmsplice sys_vmsplice compat_sys_vmsplice +317 i386 move_pages sys_move_pages compat_sys_move_pages +318 i386 getcpu sys_getcpu +319 i386 epoll_pwait sys_epoll_pwait +320 i386 utimensat sys_utimensat compat_sys_utimensat +321 i386 signalfd sys_signalfd compat_sys_signalfd +322 i386 timerfd_create sys_timerfd_create +323 i386 eventfd sys_eventfd +324 i386 fallocate sys_fallocate sys32_fallocate +325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime +326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 +328 i386 eventfd2 sys_eventfd2 +329 i386 epoll_create1 sys_epoll_create1 +330 i386 dup3 sys_dup3 +331 i386 pipe2 sys_pipe2 +332 i386 inotify_init1 sys_inotify_init1 +333 i386 preadv sys_preadv compat_sys_preadv +334 i386 pwritev sys_pwritev compat_sys_pwritev +335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +336 i386 perf_event_open sys_perf_event_open +337 i386 recvmmsg sys_recvmmsg compat_sys_recvmmsg +338 i386 fanotify_init sys_fanotify_init +339 i386 fanotify_mark sys_fanotify_mark sys32_fanotify_mark +340 i386 prlimit64 sys_prlimit64 +341 i386 name_to_handle_at sys_name_to_handle_at +342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +343 i386 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +344 i386 syncfs sys_syncfs +345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg +346 i386 setns sys_setns +347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl new file mode 100644 index 0000000..b440a8f --- /dev/null +++ b/arch/x86/syscalls/syscall_64.tbl @@ -0,0 +1,320 @@ +# +# 64-bit system call numbers and entry vectors +# +# The format is: +# <number> <abi> <name> <entry point> +# +# The abi is always "64" for this file (for now.) +# +0 64 read sys_read +1 64 write sys_write +2 64 open sys_open +3 64 close sys_close +4 64 stat sys_newstat +5 64 fstat sys_newfstat +6 64 lstat sys_newlstat +7 64 poll sys_poll +8 64 lseek sys_lseek +9 64 mmap sys_mmap +10 64 mprotect sys_mprotect +11 64 munmap sys_munmap +12 64 brk sys_brk +13 64 rt_sigaction sys_rt_sigaction +14 64 rt_sigprocmask sys_rt_sigprocmask +15 64 rt_sigreturn stub_rt_sigreturn +16 64 ioctl sys_ioctl +17 64 pread64 sys_pread64 +18 64 pwrite64 sys_pwrite64 +19 64 readv sys_readv +20 64 writev sys_writev +21 64 access sys_access +22 64 pipe sys_pipe +23 64 select sys_select +24 64 sched_yield sys_sched_yield +25 64 mremap sys_mremap +26 64 msync sys_msync +27 64 mincore sys_mincore +28 64 madvise sys_madvise +29 64 shmget sys_shmget +30 64 shmat sys_shmat +31 64 shmctl sys_shmctl +32 64 dup sys_dup +33 64 dup2 sys_dup2 +34 64 pause sys_pause +35 64 nanosleep sys_nanosleep +36 64 getitimer sys_getitimer +37 64 alarm sys_alarm +38 64 setitimer sys_setitimer +39 64 getpid sys_getpid +40 64 sendfile sys_sendfile64 +41 64 socket sys_socket +42 64 connect sys_connect +43 64 accept sys_accept +44 64 sendto sys_sendto +45 64 recvfrom sys_recvfrom +46 64 sendmsg sys_sendmsg +47 64 recvmsg sys_recvmsg +48 64 shutdown sys_shutdown +49 64 bind sys_bind +50 64 listen sys_listen +51 64 getsockname sys_getsockname +52 64 getpeername sys_getpeername +53 64 socketpair sys_socketpair +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt +56 64 clone stub_clone +57 64 fork stub_fork +58 64 vfork stub_vfork +59 64 execve stub_execve +60 64 exit sys_exit +61 64 wait4 sys_wait4 +62 64 kill sys_kill +63 64 uname sys_newuname +64 64 semget sys_semget +65 64 semop sys_semop +66 64 semctl sys_semctl +67 64 shmdt sys_shmdt +68 64 msgget sys_msgget +69 64 msgsnd sys_msgsnd +70 64 msgrcv sys_msgrcv +71 64 msgctl sys_msgctl +72 64 fcntl sys_fcntl +73 64 flock sys_flock +74 64 fsync sys_fsync +75 64 fdatasync sys_fdatasync +76 64 truncate sys_truncate +77 64 ftruncate sys_ftruncate +78 64 getdents sys_getdents +79 64 getcwd sys_getcwd +80 64 chdir sys_chdir +81 64 fchdir sys_fchdir +82 64 rename sys_rename +83 64 mkdir sys_mkdir +84 64 rmdir sys_rmdir +85 64 creat sys_creat +86 64 link sys_link +87 64 unlink sys_unlink +88 64 symlink sys_symlink +89 64 readlink sys_readlink +90 64 chmod sys_chmod +91 64 fchmod sys_fchmod +92 64 chown sys_chown +93 64 fchown sys_fchown +94 64 lchown sys_lchown +95 64 umask sys_umask +96 64 gettimeofday sys_gettimeofday +97 64 getrlimit sys_getrlimit +98 64 getrusage sys_getrusage +99 64 sysinfo sys_sysinfo +100 64 times sys_times +101 64 ptrace sys_ptrace +102 64 getuid sys_getuid +103 64 syslog sys_syslog +104 64 getgid sys_getgid +105 64 setuid sys_setuid +106 64 setgid sys_setgid +107 64 geteuid sys_geteuid +108 64 getegid sys_getegid +109 64 setpgid sys_setpgid +110 64 getppid sys_getppid +111 64 getpgrp sys_getpgrp +112 64 setsid sys_setsid +113 64 setreuid sys_setreuid +114 64 setregid sys_setregid +115 64 getgroups sys_getgroups +116 64 setgroups sys_setgroups +117 64 setresuid sys_setresuid +118 64 getresuid sys_getresuid +119 64 setresgid sys_setresgid +120 64 getresgid sys_getresgid +121 64 getpgid sys_getpgid +122 64 setfsuid sys_setfsuid +123 64 setfsgid sys_setfsgid +124 64 getsid sys_getsid +125 64 capget sys_capget +126 64 capset sys_capset +127 64 rt_sigpending sys_rt_sigpending +128 64 rt_sigtimedwait sys_rt_sigtimedwait +129 64 rt_sigqueueinfo sys_rt_sigqueueinfo +130 64 rt_sigsuspend sys_rt_sigsuspend +131 64 sigaltstack stub_sigaltstack +132 64 utime sys_utime +133 64 mknod sys_mknod +134 64 uselib +135 64 personality sys_personality +136 64 ustat sys_ustat +137 64 statfs sys_statfs +138 64 fstatfs sys_fstatfs +139 64 sysfs sys_sysfs +140 64 getpriority sys_getpriority +141 64 setpriority sys_setpriority +142 64 sched_setparam sys_sched_setparam +143 64 sched_getparam sys_sched_getparam +144 64 sched_setscheduler sys_sched_setscheduler +145 64 sched_getscheduler sys_sched_getscheduler +146 64 sched_get_priority_max sys_sched_get_priority_max +147 64 sched_get_priority_min sys_sched_get_priority_min +148 64 sched_rr_get_interval sys_sched_rr_get_interval +149 64 mlock sys_mlock +150 64 munlock sys_munlock +151 64 mlockall sys_mlockall +152 64 munlockall sys_munlockall +153 64 vhangup sys_vhangup +154 64 modify_ldt sys_modify_ldt +155 64 pivot_root sys_pivot_root +156 64 _sysctl sys_sysctl +157 64 prctl sys_prctl +158 64 arch_prctl sys_arch_prctl +159 64 adjtimex sys_adjtimex +160 64 setrlimit sys_setrlimit +161 64 chroot sys_chroot +162 64 sync sys_sync +163 64 acct sys_acct +164 64 settimeofday sys_settimeofday +165 64 mount sys_mount +166 64 umount2 sys_umount +167 64 swapon sys_swapon +168 64 swapoff sys_swapoff +169 64 reboot sys_reboot +170 64 sethostname sys_sethostname +171 64 setdomainname sys_setdomainname +172 64 iopl stub_iopl +173 64 ioperm sys_ioperm +174 64 create_module +175 64 init_module sys_init_module +176 64 delete_module sys_delete_module +177 64 get_kernel_syms +178 64 query_module +179 64 quotactl sys_quotactl +180 64 nfsservctl +181 64 getpmsg +182 64 putpmsg +183 64 afs_syscall +184 64 tuxcall +185 64 security +186 64 gettid sys_gettid +187 64 readahead sys_readahead +188 64 setxattr sys_setxattr +189 64 lsetxattr sys_lsetxattr +190 64 fsetxattr sys_fsetxattr +191 64 getxattr sys_getxattr +192 64 lgetxattr sys_lgetxattr +193 64 fgetxattr sys_fgetxattr +194 64 listxattr sys_listxattr +195 64 llistxattr sys_llistxattr +196 64 flistxattr sys_flistxattr +197 64 removexattr sys_removexattr +198 64 lremovexattr sys_lremovexattr +199 64 fremovexattr sys_fremovexattr +200 64 tkill sys_tkill +201 64 time sys_time +202 64 futex sys_futex +203 64 sched_setaffinity sys_sched_setaffinity +204 64 sched_getaffinity sys_sched_getaffinity +205 64 set_thread_area +206 64 io_setup sys_io_setup +207 64 io_destroy sys_io_destroy +208 64 io_getevents sys_io_getevents +209 64 io_submit sys_io_submit +210 64 io_cancel sys_io_cancel +211 64 get_thread_area +212 64 lookup_dcookie sys_lookup_dcookie +213 64 epoll_create sys_epoll_create +214 64 epoll_ctl_old +215 64 epoll_wait_old +216 64 remap_file_pages sys_remap_file_pages +217 64 getdents64 sys_getdents64 +218 64 set_tid_address sys_set_tid_address +219 64 restart_syscall sys_restart_syscall +220 64 semtimedop sys_semtimedop +221 64 fadvise64 sys_fadvise64 +222 64 timer_create sys_timer_create +223 64 timer_settime sys_timer_settime +224 64 timer_gettime sys_timer_gettime +225 64 timer_getoverrun sys_timer_getoverrun +226 64 timer_delete sys_timer_delete +227 64 clock_settime sys_clock_settime +228 64 clock_gettime sys_clock_gettime +229 64 clock_getres sys_clock_getres +230 64 clock_nanosleep sys_clock_nanosleep +231 64 exit_group sys_exit_group +232 64 epoll_wait sys_epoll_wait +233 64 epoll_ctl sys_epoll_ctl +234 64 tgkill sys_tgkill +235 64 utimes sys_utimes +236 64 vserver +237 64 mbind sys_mbind +238 64 set_mempolicy sys_set_mempolicy +239 64 get_mempolicy sys_get_mempolicy +240 64 mq_open sys_mq_open +241 64 mq_unlink sys_mq_unlink +242 64 mq_timedsend sys_mq_timedsend +243 64 mq_timedreceive sys_mq_timedreceive +244 64 mq_notify sys_mq_notify +245 64 mq_getsetattr sys_mq_getsetattr +246 64 kexec_load sys_kexec_load +247 64 waitid sys_waitid +248 64 add_key sys_add_key +249 64 request_key sys_request_key +250 64 keyctl sys_keyctl +251 64 ioprio_set sys_ioprio_set +252 64 ioprio_get sys_ioprio_get +253 64 inotify_init sys_inotify_init +254 64 inotify_add_watch sys_inotify_add_watch +255 64 inotify_rm_watch sys_inotify_rm_watch +256 64 migrate_pages sys_migrate_pages +257 64 openat sys_openat +258 64 mkdirat sys_mkdirat +259 64 mknodat sys_mknodat +260 64 fchownat sys_fchownat +261 64 futimesat sys_futimesat +262 64 newfstatat sys_newfstatat +263 64 unlinkat sys_unlinkat +264 64 renameat sys_renameat +265 64 linkat sys_linkat +266 64 symlinkat sys_symlinkat +267 64 readlinkat sys_readlinkat +268 64 fchmodat sys_fchmodat +269 64 faccessat sys_faccessat +270 64 pselect6 sys_pselect6 +271 64 ppoll sys_ppoll +272 64 unshare sys_unshare +273 64 set_robust_list sys_set_robust_list +274 64 get_robust_list sys_get_robust_list +275 64 splice sys_splice +276 64 tee sys_tee +277 64 sync_file_range sys_sync_file_range +278 64 vmsplice sys_vmsplice +279 64 move_pages sys_move_pages +280 64 utimensat sys_utimensat +281 64 epoll_pwait sys_epoll_pwait +282 64 signalfd sys_signalfd +283 64 timerfd_create sys_timerfd_create +284 64 eventfd sys_eventfd +285 64 fallocate sys_fallocate +286 64 timerfd_settime sys_timerfd_settime +287 64 timerfd_gettime sys_timerfd_gettime +288 64 accept4 sys_accept4 +289 64 signalfd4 sys_signalfd4 +290 64 eventfd2 sys_eventfd2 +291 64 epoll_create1 sys_epoll_create1 +292 64 dup3 sys_dup3 +293 64 pipe2 sys_pipe2 +294 64 inotify_init1 sys_inotify_init1 +295 64 preadv sys_preadv +296 64 pwritev sys_pwritev +297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +298 64 perf_event_open sys_perf_event_open +299 64 recvmmsg sys_recvmmsg +300 64 fanotify_init sys_fanotify_init +301 64 fanotify_mark sys_fanotify_mark +302 64 prlimit64 sys_prlimit64 +303 64 name_to_handle_at sys_name_to_handle_at +304 64 open_by_handle_at sys_open_by_handle_at +305 64 clock_adjtime sys_clock_adjtime +306 64 syncfs sys_syncfs +307 64 sendmmsg sys_sendmmsg +308 64 setns sys_setns +309 64 getcpu sys_getcpu +310 64 process_vm_readv sys_process_vm_readv +311 64 process_vm_writev sys_process_vm_writev diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh new file mode 100644 index 0000000..31fd5f1 --- /dev/null +++ b/arch/x86/syscalls/syscallhdr.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +in="$1" +out="$2" +my_abis=`echo "($3)" | tr ',' '|'` +prefix="$4" +offset="$5" + +fileguard=_ASM_X86_`basename "$out" | sed \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ + -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` +grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( + echo "#ifndef ${fileguard}" + echo "#define ${fileguard} 1" + echo "" + + while read nr abi name entry ; do + if [ -z "$offset" ]; then + echo "#define __NR_${prefix}${name} $nr" + else + echo "#define __NR_${prefix}${name} ($offset + $nr)" + fi + done + + echo "" + echo "#endif /* ${fileguard} */" +) > "$out" diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/syscalls/syscalltbl.sh new file mode 100644 index 0000000..0e7f8ec --- /dev/null +++ b/arch/x86/syscalls/syscalltbl.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +in="$1" +out="$2" + +grep '^[0-9]' "$in" | sort -n | ( + while read nr abi name entry compat; do + abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` + if [ -n "$compat" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $compat)" + elif [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $entry)" + fi + done +) > "$out" diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 1d97bd8..b2b54d2 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -6,14 +6,6 @@ menu "UML-specific options" menu "Host processor type and features" -config CMPXCHG_LOCAL - bool - default n - -config CMPXCHG_DOUBLE - bool - default n - source "arch/x86/Kconfig.cpu" endmenu diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 8fb5840..5d065b2 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -37,7 +37,8 @@ subarch-$(CONFIG_MODULES) += ../kernel/module.o USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o extra-y += user-offsets.s -$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) +$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \ + -Iarch/x86/include/generated UNPROFILE_OBJS := stub_segv.o CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 711b162..2bbe1ec2 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -1,5 +1,15 @@ +#ifndef __SYSDEP_X86_PTRACE_H +#define __SYSDEP_X86_PTRACE_H + #ifdef __i386__ #include "ptrace_32.h" #else #include "ptrace_64.h" #endif + +static inline long regs_return_value(struct uml_pt_regs *regs) +{ + return UPT_SYSCALL_RET(regs); +} + +#endif /* __SYSDEP_X86_PTRACE_H */ diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S deleted file mode 100644 index a7ca80d..0000000 --- a/arch/x86/um/sys_call_table_32.S +++ /dev/null @@ -1,26 +0,0 @@ -#include <linux/linkage.h> -/* Steal i386 syscall table for our purposes, but with some slight changes.*/ - -#define sys_iopl sys_ni_syscall -#define sys_ioperm sys_ni_syscall - -#define sys_vm86old sys_ni_syscall -#define sys_vm86 sys_ni_syscall - -#define old_mmap sys_old_mmap - -#define ptregs_fork sys_fork -#define ptregs_execve sys_execve -#define ptregs_iopl sys_iopl -#define ptregs_vm86old sys_vm86old -#define ptregs_clone sys_clone -#define ptregs_vm86 sys_vm86 -#define ptregs_sigaltstack sys_sigaltstack -#define ptregs_vfork sys_vfork - -.section .rodata,"a" - -#include "../kernel/syscall_table_32.S" - -ENTRY(syscall_table_size) -.long .-sys_call_table diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c new file mode 100644 index 0000000..416bd40 --- /dev/null +++ b/arch/x86/um/sys_call_table_32.c @@ -0,0 +1,55 @@ +/* + * System call table for UML/i386, copied from arch/x86/kernel/syscall_*.c + * with some changes for UML. + */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> +#include <generated/user_constants.h> + +#define __NO_STUBS + +/* + * Below you can see, in terms of #define's, the differences between the x86-64 + * and the UML syscall table. + */ + +/* Not going to be implemented by UML, since we have no hardware. */ +#define sys_iopl sys_ni_syscall +#define sys_ioperm sys_ni_syscall + +#define sys_vm86old sys_ni_syscall +#define sys_vm86 sys_ni_syscall + +#define old_mmap sys_old_mmap + +#define ptregs_fork sys_fork +#define ptregs_execve sys_execve +#define ptregs_iopl sys_iopl +#define ptregs_vm86old sys_vm86old +#define ptregs_clone sys_clone +#define ptregs_vm86 sys_vm86 +#define ptregs_sigaltstack sys_sigaltstack +#define ptregs_vfork sys_vfork + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#include <asm/syscalls_32.h> + +#undef __SYSCALL_I386 +#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, + +typedef void (*sys_call_ptr_t)(void); + +extern void sys_ni_syscall(void); + +const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include <asm/syscalls_32.h> +}; + +int syscall_table_size = sizeof(sys_call_table); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 99522f7..fe626c3 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -1,11 +1,12 @@ /* - * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c + * System call table for UML/x86-64, copied from arch/x86/kernel/syscall_*.c * with some changes for UML. */ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> +#include <generated/user_constants.h> #define __NO_STUBS @@ -34,31 +35,23 @@ #define stub_sigaltstack sys_sigaltstack #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_UNISTD_64_H -#include "../../x86/include/asm/unistd_64.h" +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include <asm/syscalls_64.h> -#undef __SYSCALL -#define __SYSCALL(nr, sym) [ nr ] = sym, -#undef _ASM_X86_UNISTD_64_H +#undef __SYSCALL_64 +#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, typedef void (*sys_call_ptr_t)(void); extern void sys_ni_syscall(void); -/* - * We used to have a trick here which made sure that holes in the - * x86_64 table were filled in with sys_ni_syscall, but a comment in - * unistd_64.h says that holes aren't allowed, so the trick was - * removed. - * The trick looked like this - * [0 ... UM_NR_syscall_max] = &sys_ni_syscall - * before including unistd_64.h - the later initializations overwrote - * the sys_ni_syscall filler. - */ - -sys_call_ptr_t sys_call_table[] __cacheline_aligned = { -#include <asm/unistd_64.h> +const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include <asm/syscalls_64.h> }; int syscall_table_size = sizeof(sys_call_table); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index ca49be8..5edf4f4 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -8,6 +8,18 @@ #include <asm/ptrace.h> #include <asm/types.h> +#ifdef __i386__ +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include <asm/syscalls_32.h> +}; +#else +#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include <asm/syscalls_64.h> +}; +#endif + #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -77,4 +89,7 @@ void foo(void) DEFINE(UM_PROT_READ, PROT_READ); DEFINE(UM_PROT_WRITE, PROT_WRITE); DEFINE(UM_PROT_EXEC, PROT_EXEC); + + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(NR_syscalls, sizeof(syscalls)); } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 26c731a..fdce49c 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -29,7 +29,8 @@ config XEN_PVHVM config XEN_MAX_DOMAIN_MEMORY int - default 128 + default 500 if X86_64 + default 64 if X86_32 depends on XEN help This only affects the sizing of some bss arrays, the unused @@ -48,3 +49,4 @@ config XEN_DEBUG_FS help Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. + diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 7c0fedd..ef1db19 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -109,7 +109,7 @@ static const struct file_operations u32_array_fops = { .llseek = no_llseek, }; -struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, +struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, u32 *array, unsigned elements) { diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h index e281320..78d2549 100644 --- a/arch/x86/xen/debugfs.h +++ b/arch/x86/xen/debugfs.h @@ -3,7 +3,7 @@ struct dentry * __init xen_init_debugfs(void); -struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, +struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, u32 *array, unsigned elements); diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 5a40d24..3a5f55d 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -54,6 +54,20 @@ static int map_pte_fn(pte_t *pte, struct page *pmd_page, return 0; } +/* + * This function is used to map shared frames to store grant status. It is + * different from map_pte_fn above, the frames type here is uint64_t. + */ +static int map_pte_fn_status(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + uint64_t **frames = (uint64_t **)data; + + set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL)); + (*frames)++; + return 0; +} + static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) { @@ -64,10 +78,10 @@ static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, - struct grant_entry **__shared) + void **__shared) { int rc; - struct grant_entry *shared = *__shared; + void *shared = *__shared; if (shared == NULL) { struct vm_struct *area = @@ -83,8 +97,30 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, return rc; } -void arch_gnttab_unmap_shared(struct grant_entry *shared, - unsigned long nr_gframes) +int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + grant_status_t **__shared) +{ + int rc; + grant_status_t *shared = *__shared; + + if (shared == NULL) { + /* No need to pass in PTE as we are going to do it + * in apply_to_page_range anyhow. */ + struct vm_struct *area = + alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL); + BUG_ON(area == NULL); + shared = area->addr; + *__shared = shared; + } + + rc = apply_to_page_range(&init_mm, (unsigned long)shared, + PAGE_SIZE * nr_gframes, + map_pte_fn_status, &frames); + return rc; +} + +void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) { apply_to_page_range(&init_mm, (unsigned long)shared, PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f4bf8aa..58a0e46 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1852,7 +1852,7 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, xen_write_cr3(__pa(initial_page_table)); memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE)); + xen_start_info->nr_pt_frames * PAGE_SIZE); return initial_page_table; } |