From 079f85e624189292d1c818b47764916bf8cf84a8 Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Tue, 12 Apr 2011 13:30:24 +0200 Subject: x86, build: Do not set the root_dev field in bzImage This has been obsoleted by the root= commandline and the rdev utility for many, many years. People who still depend on this will surely have a copy of the rdev utility around, the rest of the world gets rid of another piece of buildhost-dependent data in the build. Thanks to Paul Bolle for the build.c cleanup. Cc: Paul Bolle Signed-off-by: Michal Marek Link: http://lkml.kernel.org/r/1302607824-24699-1-git-send-email-mmarek@suse.cz Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 9 +-------- arch/x86/boot/tools/build.c | 33 ++++----------------------------- 2 files changed, 5 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index f7cb086..95365a8 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -9,12 +9,6 @@ # Changed by many, many contributors over the years. # -# ROOT_DEV specifies the default root-device when making the image. -# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case -# the default of FLOPPY is used by 'build'. - -ROOT_DEV := CURRENT - # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. @@ -75,8 +69,7 @@ GCOV_PROFILE := n $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ -cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ - $(ROOT_DEV) > $@ +cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin > $@ $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE $(call if_changed,image) diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index ee3a4ea..fdc60a0 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -130,7 +130,7 @@ static void die(const char * str, ...) static void usage(void) { - die("Usage: build setup system [rootdev] [> image]"); + die("Usage: build setup system [> image]"); } int main(int argc, char ** argv) @@ -138,39 +138,14 @@ int main(int argc, char ** argv) unsigned int i, sz, setup_sectors; int c; u32 sys_size; - u8 major_root, minor_root; struct stat sb; FILE *file; int fd; void *kernel; u32 crc = 0xffffffffUL; - if ((argc < 3) || (argc > 4)) + if (argc != 3) usage(); - if (argc > 3) { - if (!strcmp(argv[3], "CURRENT")) { - if (stat("/", &sb)) { - perror("/"); - die("Couldn't stat /"); - } - major_root = major(sb.st_dev); - minor_root = minor(sb.st_dev); - } else if (strcmp(argv[3], "FLOPPY")) { - if (stat(argv[3], &sb)) { - perror(argv[3]); - die("Couldn't stat root device."); - } - major_root = major(sb.st_rdev); - minor_root = minor(sb.st_rdev); - } else { - major_root = 0; - minor_root = 0; - } - } else { - major_root = DEFAULT_MAJOR_ROOT; - minor_root = DEFAULT_MINOR_ROOT; - } - fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root); /* Copy the setup code */ file = fopen(argv[1], "r"); @@ -193,8 +168,8 @@ int main(int argc, char ** argv) memset(buf+c, 0, i-c); /* Set the default root device */ - buf[508] = minor_root; - buf[509] = major_root; + buf[508] = DEFAULT_MINOR_ROOT; + buf[509] = DEFAULT_MAJOR_ROOT; fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i); -- cgit v1.1 From c4d017f21328c269deba3c7acde873204fe595b5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 28 May 2011 10:36:22 -0700 Subject: x86: Convert vmalloc()+memset() to vzalloc() Signed-off-by: Joe Perches Cc: Jiri Kosina Link: http://lkml.kernel.org/r/10e35243fda0b8739c89ac32a7bdf348ec4752e1.1306603968.git.joe@perches.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index e1d1069..b008656 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -123,12 +123,11 @@ static int pageattr_test(void) if (print) printk(KERN_INFO "CPA self-test:\n"); - bm = vmalloc((max_pfn_mapped + 7) / 8); + bm = vzalloc((max_pfn_mapped + 7) / 8); if (!bm) { printk(KERN_ERR "CPA Cannot vmalloc bitmap\n"); return -ENOMEM; } - memset(bm, 0, (max_pfn_mapped + 7) / 8); failed += print_split(&sa); srandom32(100); -- cgit v1.1 From 55ba41202866f4e135c39d0bb88f128b96ba2167 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 27 May 2011 09:52:56 -0500 Subject: x86, UV: Clean up uv_mmrs.h No code changes. Reformat definitions to make it more readable. I fixed alignment of comments in the structure definitions. Also aligned comments and most field definitions & values. Also sorted the defines for the SHIFT & MASK values for each MMR. This make the file visually much more acceptable. Some of the symbol names are still quite long. The file is based on post-processing of verilog definitions that are used for the node controller chip design. Although some symbol names are not what I would chose, I would like to maintain compatibility with the names used by the chip designers. We have a number of cross-reference utilities & having common names is important. Signed-off-by: Jack Steiner Link: http://lkml.kernel.org/r/20110527145256.GA31224@sgi.com Signed-off-by: Ingo Molnar -- arch/x86/include/asm/uv/uv_mmrs.h | 2873 +++++++++++++++++++++----------------- 1 file changed, 1600 insertions(+), 1273 deletions(-) --- arch/x86/include/asm/uv/uv_mmrs.h | 2889 +++++++++++++++++++++---------------- 1 file changed, 1608 insertions(+), 1281 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 4be52c8..10474fb 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -61,1689 +61,2016 @@ /* Compat: if this #define is present, UV headers support UV2 */ #define UV2_HUB_IS_SUPPORTED 1 -/* KABI compat: if this #define is present, KABI hacks are present */ -#define UV2_HUB_KABI_HACKS 1 - /* ========================================================================= */ /* UVH_BAU_DATA_BROADCAST */ /* ========================================================================= */ -#define UVH_BAU_DATA_BROADCAST 0x61688UL -#define UVH_BAU_DATA_BROADCAST_32 0x440 +#define UVH_BAU_DATA_BROADCAST 0x61688UL +#define UVH_BAU_DATA_BROADCAST_32 0x440 -#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 -#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL +#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 +#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL union uvh_bau_data_broadcast_u { - unsigned long v; - struct uvh_bau_data_broadcast_s { - unsigned long enable : 1; /* RW */ - unsigned long rsvd_1_63: 63; /* */ - } s; + unsigned long v; + struct uvh_bau_data_broadcast_s { + unsigned long enable:1; /* RW */ + unsigned long rsvd_1_63:63; + } s; }; /* ========================================================================= */ /* UVH_BAU_DATA_CONFIG */ /* ========================================================================= */ -#define UVH_BAU_DATA_CONFIG 0x61680UL -#define UVH_BAU_DATA_CONFIG_32 0x438 - -#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 -#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_BAU_DATA_CONFIG_DM_SHFT 8 -#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11 -#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12 -#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_BAU_DATA_CONFIG_P_SHFT 13 -#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_BAU_DATA_CONFIG_T_SHFT 15 -#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_BAU_DATA_CONFIG_M_SHFT 16 -#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32 -#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_BAU_DATA_CONFIG 0x61680UL +#define UVH_BAU_DATA_CONFIG_32 0x438 + +#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 +#define UVH_BAU_DATA_CONFIG_DM_SHFT 8 +#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11 +#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12 +#define UVH_BAU_DATA_CONFIG_P_SHFT 13 +#define UVH_BAU_DATA_CONFIG_T_SHFT 15 +#define UVH_BAU_DATA_CONFIG_M_SHFT 16 +#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32 +#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_bau_data_config_u { - unsigned long v; - struct uvh_bau_data_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; + unsigned long v; + struct uvh_bau_data_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_EVENT_OCCURRED0 */ /* ========================================================================= */ -#define UVH_EVENT_OCCURRED0 0x70000UL -#define UVH_EVENT_OCCURRED0_32 0x5e8 - -#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0 -#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL -#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 -#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL -#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 -#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL -#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3 -#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL -#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4 -#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL -#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5 -#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL -#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6 -#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL -#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 -#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL -#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 -#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL -#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 -#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL -#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 -#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL -#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 -#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL -#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 -#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL -#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 -#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL -#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 -#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL -#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 -#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL -#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 -#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL -#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 -#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL -#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 -#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL -#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 -#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL -#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 -#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL -#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 -#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL -#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 -#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL -#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 -#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL -#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 -#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL -#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 -#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL -#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 -#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL -#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43 -#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL -#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 -#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL -#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45 -#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL -#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 -#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL -#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 -#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL -#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 -#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL -#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 -#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL -#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 -#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL -#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51 -#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL -#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52 -#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL -#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53 -#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL -#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54 -#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL -#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55 -#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL -#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 -#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL - -#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0 -#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL -#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 -#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL -#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2 -#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL -#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 -#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL -#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 -#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL -#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5 -#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL -#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6 -#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL -#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 -#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL -#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 -#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL -#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 -#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL -#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 -#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL -#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 -#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL -#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 -#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL -#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 -#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL -#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 -#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL -#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 -#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL -#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 -#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL -#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 -#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL -#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 -#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL -#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 -#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL -#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 -#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL -#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 -#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL -#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 -#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL -#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 -#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL -#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 -#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL -#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 -#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL -#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 -#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL -#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 -#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL -#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 -#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL -#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 -#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL -#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 -#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL -#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 -#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL -#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 -#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL -#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 -#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL -#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 -#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL -#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 -#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL -#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 -#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL -#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53 -#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL -#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 -#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL +#define UVH_EVENT_OCCURRED0 0x70000UL +#define UVH_EVENT_OCCURRED0_32 0x5e8 + +#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0 +#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 +#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 +#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3 +#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4 +#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5 +#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6 +#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 +#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 +#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 +#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 +#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 +#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 +#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 +#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 +#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 +#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 +#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 +#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 +#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 +#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 +#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 +#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 +#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 +#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 +#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 +#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 +#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43 +#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 +#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 +#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 +#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51 +#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52 +#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53 +#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54 +#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55 +#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 +#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL +#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL +#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL +#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL +#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL +#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL +#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL +#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL +#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL +#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL +#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL +#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL +#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL +#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL +#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL +#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL +#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL +#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL +#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL +#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL +#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL +#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL +#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL +#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL +#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL +#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL +#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL +#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL +#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL +#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL +#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL +#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL +#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL + +#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0 +#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2 +#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 +#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 +#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5 +#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6 +#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 +#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 +#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 +#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 +#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 +#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 +#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 +#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL +#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL +#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL +#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL +#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL +#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL +#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL +#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL +#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL +#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL +#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL +#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL +#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL +#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL union uvh_event_occurred0_u { - unsigned long v; - struct uv1h_event_occurred0_s { - unsigned long lb_hcerr : 1; /* RW, W1C */ - unsigned long gr0_hcerr : 1; /* RW, W1C */ - unsigned long gr1_hcerr : 1; /* RW, W1C */ - unsigned long lh_hcerr : 1; /* RW, W1C */ - unsigned long rh_hcerr : 1; /* RW, W1C */ - unsigned long xn_hcerr : 1; /* RW, W1C */ - unsigned long si_hcerr : 1; /* RW, W1C */ - unsigned long lb_aoerr0 : 1; /* RW, W1C */ - unsigned long gr0_aoerr0 : 1; /* RW, W1C */ - unsigned long gr1_aoerr0 : 1; /* RW, W1C */ - unsigned long lh_aoerr0 : 1; /* RW, W1C */ - unsigned long rh_aoerr0 : 1; /* RW, W1C */ - unsigned long xn_aoerr0 : 1; /* RW, W1C */ - unsigned long si_aoerr0 : 1; /* RW, W1C */ - unsigned long lb_aoerr1 : 1; /* RW, W1C */ - unsigned long gr0_aoerr1 : 1; /* RW, W1C */ - unsigned long gr1_aoerr1 : 1; /* RW, W1C */ - unsigned long lh_aoerr1 : 1; /* RW, W1C */ - unsigned long rh_aoerr1 : 1; /* RW, W1C */ - unsigned long xn_aoerr1 : 1; /* RW, W1C */ - unsigned long si_aoerr1 : 1; /* RW, W1C */ - unsigned long rh_vpi_int : 1; /* RW, W1C */ - unsigned long system_shutdown_int : 1; /* RW, W1C */ - unsigned long lb_irq_int_0 : 1; /* RW, W1C */ - unsigned long lb_irq_int_1 : 1; /* RW, W1C */ - unsigned long lb_irq_int_2 : 1; /* RW, W1C */ - unsigned long lb_irq_int_3 : 1; /* RW, W1C */ - unsigned long lb_irq_int_4 : 1; /* RW, W1C */ - unsigned long lb_irq_int_5 : 1; /* RW, W1C */ - unsigned long lb_irq_int_6 : 1; /* RW, W1C */ - unsigned long lb_irq_int_7 : 1; /* RW, W1C */ - unsigned long lb_irq_int_8 : 1; /* RW, W1C */ - unsigned long lb_irq_int_9 : 1; /* RW, W1C */ - unsigned long lb_irq_int_10 : 1; /* RW, W1C */ - unsigned long lb_irq_int_11 : 1; /* RW, W1C */ - unsigned long lb_irq_int_12 : 1; /* RW, W1C */ - unsigned long lb_irq_int_13 : 1; /* RW, W1C */ - unsigned long lb_irq_int_14 : 1; /* RW, W1C */ - unsigned long lb_irq_int_15 : 1; /* RW, W1C */ - unsigned long l1_nmi_int : 1; /* RW, W1C */ - unsigned long stop_clock : 1; /* RW, W1C */ - unsigned long asic_to_l1 : 1; /* RW, W1C */ - unsigned long l1_to_asic : 1; /* RW, W1C */ - unsigned long ltc_int : 1; /* RW, W1C */ - unsigned long la_seq_trigger : 1; /* RW, W1C */ - unsigned long ipi_int : 1; /* RW, W1C */ - unsigned long extio_int0 : 1; /* RW, W1C */ - unsigned long extio_int1 : 1; /* RW, W1C */ - unsigned long extio_int2 : 1; /* RW, W1C */ - unsigned long extio_int3 : 1; /* RW, W1C */ - unsigned long profile_int : 1; /* RW, W1C */ - unsigned long rtc0 : 1; /* RW, W1C */ - unsigned long rtc1 : 1; /* RW, W1C */ - unsigned long rtc2 : 1; /* RW, W1C */ - unsigned long rtc3 : 1; /* RW, W1C */ - unsigned long bau_data : 1; /* RW, W1C */ - unsigned long power_management_req : 1; /* RW, W1C */ - unsigned long rsvd_57_63 : 7; /* */ - } s1; - struct uv2h_event_occurred0_s { - unsigned long lb_hcerr : 1; /* RW */ - unsigned long qp_hcerr : 1; /* RW */ - unsigned long rh_hcerr : 1; /* RW */ - unsigned long lh0_hcerr : 1; /* RW */ - unsigned long lh1_hcerr : 1; /* RW */ - unsigned long gr0_hcerr : 1; /* RW */ - unsigned long gr1_hcerr : 1; /* RW */ - unsigned long ni0_hcerr : 1; /* RW */ - unsigned long ni1_hcerr : 1; /* RW */ - unsigned long lb_aoerr0 : 1; /* RW */ - unsigned long qp_aoerr0 : 1; /* RW */ - unsigned long rh_aoerr0 : 1; /* RW */ - unsigned long lh0_aoerr0 : 1; /* RW */ - unsigned long lh1_aoerr0 : 1; /* RW */ - unsigned long gr0_aoerr0 : 1; /* RW */ - unsigned long gr1_aoerr0 : 1; /* RW */ - unsigned long xb_aoerr0 : 1; /* RW */ - unsigned long rt_aoerr0 : 1; /* RW */ - unsigned long ni0_aoerr0 : 1; /* RW */ - unsigned long ni1_aoerr0 : 1; /* RW */ - unsigned long lb_aoerr1 : 1; /* RW */ - unsigned long qp_aoerr1 : 1; /* RW */ - unsigned long rh_aoerr1 : 1; /* RW */ - unsigned long lh0_aoerr1 : 1; /* RW */ - unsigned long lh1_aoerr1 : 1; /* RW */ - unsigned long gr0_aoerr1 : 1; /* RW */ - unsigned long gr1_aoerr1 : 1; /* RW */ - unsigned long xb_aoerr1 : 1; /* RW */ - unsigned long rt_aoerr1 : 1; /* RW */ - unsigned long ni0_aoerr1 : 1; /* RW */ - unsigned long ni1_aoerr1 : 1; /* RW */ - unsigned long system_shutdown_int : 1; /* RW */ - unsigned long lb_irq_int_0 : 1; /* RW */ - unsigned long lb_irq_int_1 : 1; /* RW */ - unsigned long lb_irq_int_2 : 1; /* RW */ - unsigned long lb_irq_int_3 : 1; /* RW */ - unsigned long lb_irq_int_4 : 1; /* RW */ - unsigned long lb_irq_int_5 : 1; /* RW */ - unsigned long lb_irq_int_6 : 1; /* RW */ - unsigned long lb_irq_int_7 : 1; /* RW */ - unsigned long lb_irq_int_8 : 1; /* RW */ - unsigned long lb_irq_int_9 : 1; /* RW */ - unsigned long lb_irq_int_10 : 1; /* RW */ - unsigned long lb_irq_int_11 : 1; /* RW */ - unsigned long lb_irq_int_12 : 1; /* RW */ - unsigned long lb_irq_int_13 : 1; /* RW */ - unsigned long lb_irq_int_14 : 1; /* RW */ - unsigned long lb_irq_int_15 : 1; /* RW */ - unsigned long l1_nmi_int : 1; /* RW */ - unsigned long stop_clock : 1; /* RW */ - unsigned long asic_to_l1 : 1; /* RW */ - unsigned long l1_to_asic : 1; /* RW */ - unsigned long la_seq_trigger : 1; /* RW */ - unsigned long ipi_int : 1; /* RW */ - unsigned long extio_int0 : 1; /* RW */ - unsigned long extio_int1 : 1; /* RW */ - unsigned long extio_int2 : 1; /* RW */ - unsigned long extio_int3 : 1; /* RW */ - unsigned long profile_int : 1; /* RW */ - unsigned long rsvd_59_63 : 5; /* */ - } s2; + unsigned long v; + struct uv1h_event_occurred0_s { + unsigned long lb_hcerr:1; /* RW, W1C */ + unsigned long gr0_hcerr:1; /* RW, W1C */ + unsigned long gr1_hcerr:1; /* RW, W1C */ + unsigned long lh_hcerr:1; /* RW, W1C */ + unsigned long rh_hcerr:1; /* RW, W1C */ + unsigned long xn_hcerr:1; /* RW, W1C */ + unsigned long si_hcerr:1; /* RW, W1C */ + unsigned long lb_aoerr0:1; /* RW, W1C */ + unsigned long gr0_aoerr0:1; /* RW, W1C */ + unsigned long gr1_aoerr0:1; /* RW, W1C */ + unsigned long lh_aoerr0:1; /* RW, W1C */ + unsigned long rh_aoerr0:1; /* RW, W1C */ + unsigned long xn_aoerr0:1; /* RW, W1C */ + unsigned long si_aoerr0:1; /* RW, W1C */ + unsigned long lb_aoerr1:1; /* RW, W1C */ + unsigned long gr0_aoerr1:1; /* RW, W1C */ + unsigned long gr1_aoerr1:1; /* RW, W1C */ + unsigned long lh_aoerr1:1; /* RW, W1C */ + unsigned long rh_aoerr1:1; /* RW, W1C */ + unsigned long xn_aoerr1:1; /* RW, W1C */ + unsigned long si_aoerr1:1; /* RW, W1C */ + unsigned long rh_vpi_int:1; /* RW, W1C */ + unsigned long system_shutdown_int:1; /* RW, W1C */ + unsigned long lb_irq_int_0:1; /* RW, W1C */ + unsigned long lb_irq_int_1:1; /* RW, W1C */ + unsigned long lb_irq_int_2:1; /* RW, W1C */ + unsigned long lb_irq_int_3:1; /* RW, W1C */ + unsigned long lb_irq_int_4:1; /* RW, W1C */ + unsigned long lb_irq_int_5:1; /* RW, W1C */ + unsigned long lb_irq_int_6:1; /* RW, W1C */ + unsigned long lb_irq_int_7:1; /* RW, W1C */ + unsigned long lb_irq_int_8:1; /* RW, W1C */ + unsigned long lb_irq_int_9:1; /* RW, W1C */ + unsigned long lb_irq_int_10:1; /* RW, W1C */ + unsigned long lb_irq_int_11:1; /* RW, W1C */ + unsigned long lb_irq_int_12:1; /* RW, W1C */ + unsigned long lb_irq_int_13:1; /* RW, W1C */ + unsigned long lb_irq_int_14:1; /* RW, W1C */ + unsigned long lb_irq_int_15:1; /* RW, W1C */ + unsigned long l1_nmi_int:1; /* RW, W1C */ + unsigned long stop_clock:1; /* RW, W1C */ + unsigned long asic_to_l1:1; /* RW, W1C */ + unsigned long l1_to_asic:1; /* RW, W1C */ + unsigned long ltc_int:1; /* RW, W1C */ + unsigned long la_seq_trigger:1; /* RW, W1C */ + unsigned long ipi_int:1; /* RW, W1C */ + unsigned long extio_int0:1; /* RW, W1C */ + unsigned long extio_int1:1; /* RW, W1C */ + unsigned long extio_int2:1; /* RW, W1C */ + unsigned long extio_int3:1; /* RW, W1C */ + unsigned long profile_int:1; /* RW, W1C */ + unsigned long rtc0:1; /* RW, W1C */ + unsigned long rtc1:1; /* RW, W1C */ + unsigned long rtc2:1; /* RW, W1C */ + unsigned long rtc3:1; /* RW, W1C */ + unsigned long bau_data:1; /* RW, W1C */ + unsigned long power_management_req:1; /* RW, W1C */ + unsigned long rsvd_57_63:7; + } s1; + struct uv2h_event_occurred0_s { + unsigned long lb_hcerr:1; /* RW */ + unsigned long qp_hcerr:1; /* RW */ + unsigned long rh_hcerr:1; /* RW */ + unsigned long lh0_hcerr:1; /* RW */ + unsigned long lh1_hcerr:1; /* RW */ + unsigned long gr0_hcerr:1; /* RW */ + unsigned long gr1_hcerr:1; /* RW */ + unsigned long ni0_hcerr:1; /* RW */ + unsigned long ni1_hcerr:1; /* RW */ + unsigned long lb_aoerr0:1; /* RW */ + unsigned long qp_aoerr0:1; /* RW */ + unsigned long rh_aoerr0:1; /* RW */ + unsigned long lh0_aoerr0:1; /* RW */ + unsigned long lh1_aoerr0:1; /* RW */ + unsigned long gr0_aoerr0:1; /* RW */ + unsigned long gr1_aoerr0:1; /* RW */ + unsigned long xb_aoerr0:1; /* RW */ + unsigned long rt_aoerr0:1; /* RW */ + unsigned long ni0_aoerr0:1; /* RW */ + unsigned long ni1_aoerr0:1; /* RW */ + unsigned long lb_aoerr1:1; /* RW */ + unsigned long qp_aoerr1:1; /* RW */ + unsigned long rh_aoerr1:1; /* RW */ + unsigned long lh0_aoerr1:1; /* RW */ + unsigned long lh1_aoerr1:1; /* RW */ + unsigned long gr0_aoerr1:1; /* RW */ + unsigned long gr1_aoerr1:1; /* RW */ + unsigned long xb_aoerr1:1; /* RW */ + unsigned long rt_aoerr1:1; /* RW */ + unsigned long ni0_aoerr1:1; /* RW */ + unsigned long ni1_aoerr1:1; /* RW */ + unsigned long system_shutdown_int:1; /* RW */ + unsigned long lb_irq_int_0:1; /* RW */ + unsigned long lb_irq_int_1:1; /* RW */ + unsigned long lb_irq_int_2:1; /* RW */ + unsigned long lb_irq_int_3:1; /* RW */ + unsigned long lb_irq_int_4:1; /* RW */ + unsigned long lb_irq_int_5:1; /* RW */ + unsigned long lb_irq_int_6:1; /* RW */ + unsigned long lb_irq_int_7:1; /* RW */ + unsigned long lb_irq_int_8:1; /* RW */ + unsigned long lb_irq_int_9:1; /* RW */ + unsigned long lb_irq_int_10:1; /* RW */ + unsigned long lb_irq_int_11:1; /* RW */ + unsigned long lb_irq_int_12:1; /* RW */ + unsigned long lb_irq_int_13:1; /* RW */ + unsigned long lb_irq_int_14:1; /* RW */ + unsigned long lb_irq_int_15:1; /* RW */ + unsigned long l1_nmi_int:1; /* RW */ + unsigned long stop_clock:1; /* RW */ + unsigned long asic_to_l1:1; /* RW */ + unsigned long l1_to_asic:1; /* RW */ + unsigned long la_seq_trigger:1; /* RW */ + unsigned long ipi_int:1; /* RW */ + unsigned long extio_int0:1; /* RW */ + unsigned long extio_int1:1; /* RW */ + unsigned long extio_int2:1; /* RW */ + unsigned long extio_int3:1; /* RW */ + unsigned long profile_int:1; /* RW */ + unsigned long rsvd_59_63:5; + } s2; }; /* ========================================================================= */ /* UVH_EVENT_OCCURRED0_ALIAS */ /* ========================================================================= */ -#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL -#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0 +#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL +#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0 /* ========================================================================= */ /* UVH_GR0_TLB_INT0_CONFIG */ /* ========================================================================= */ -#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL - -#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0 -#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8 -#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12 -#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13 -#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15 -#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16 -#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL + +#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0 +#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8 +#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11 +#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12 +#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13 +#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15 +#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16 +#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32 +#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr0_tlb_int0_config_u { - unsigned long v; - struct uvh_gr0_tlb_int0_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; + unsigned long v; + struct uvh_gr0_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_GR0_TLB_INT1_CONFIG */ /* ========================================================================= */ -#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL - -#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0 -#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8 -#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12 -#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13 -#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15 -#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16 -#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL + +#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0 +#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8 +#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11 +#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12 +#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13 +#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15 +#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16 +#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32 +#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr0_tlb_int1_config_u { - unsigned long v; - struct uvh_gr0_tlb_int1_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; + unsigned long v; + struct uvh_gr0_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_GR0_TLB_MMR_CONTROL */ +/* ========================================================================= */ +#define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL +#define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL +#define UVH_GR0_TLB_MMR_CONTROL (is_uv1_hub() ? \ + UV1H_GR0_TLB_MMR_CONTROL : \ + UV2H_GR0_TLB_MMR_CONTROL) + +#define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL + +#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60 +#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL + +#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 +#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL + +union uvh_gr0_tlb_mmr_control_u { + unsigned long v; + struct uvh_gr0_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long rsvd_32_63:32; + } s; + struct uv1h_gr0_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long rsvd_32_47:16; + unsigned long mmr_inj_con:1; /* RW */ + unsigned long rsvd_49_51:3; + unsigned long mmr_inj_tlbram:1; /* RW */ + unsigned long rsvd_53:1; + unsigned long mmr_inj_tlbpgsize:1; /* RW */ + unsigned long rsvd_55:1; + unsigned long mmr_inj_tlbrreg:1; /* RW */ + unsigned long rsvd_57_59:3; + unsigned long mmr_inj_tlblruv:1; /* RW */ + unsigned long rsvd_61_63:3; + } s1; + struct uv2h_gr0_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long mmr_inj_con:1; /* RW */ + unsigned long rsvd_49_51:3; + unsigned long mmr_inj_tlbram:1; /* RW */ + unsigned long rsvd_53_63:11; + } s2; +}; + +/* ========================================================================= */ +/* UVH_GR0_TLB_MMR_READ_DATA_HI */ +/* ========================================================================= */ +#define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL +#define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL +#define UVH_GR0_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \ + UV1H_GR0_TLB_MMR_READ_DATA_HI : \ + UV2H_GR0_TLB_MMR_READ_DATA_HI) + +#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +union uvh_gr0_tlb_mmr_read_data_hi_u { + unsigned long v; + struct uvh_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s; +}; + +/* ========================================================================= */ +/* UVH_GR0_TLB_MMR_READ_DATA_LO */ +/* ========================================================================= */ +#define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL +#define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL +#define UVH_GR0_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \ + UV1H_GR0_TLB_MMR_READ_DATA_LO : \ + UV2H_GR0_TLB_MMR_READ_DATA_LO) + +#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +union uvh_gr0_tlb_mmr_read_data_lo_u { + unsigned long v; + struct uvh_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s; }; /* ========================================================================= */ /* UVH_GR1_TLB_INT0_CONFIG */ /* ========================================================================= */ -#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL - -#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0 -#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8 -#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12 -#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13 -#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15 -#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16 -#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL + +#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0 +#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8 +#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11 +#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12 +#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13 +#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15 +#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16 +#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32 +#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr1_tlb_int0_config_u { - unsigned long v; - struct uvh_gr1_tlb_int0_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; + unsigned long v; + struct uvh_gr1_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_GR1_TLB_INT1_CONFIG */ /* ========================================================================= */ -#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL - -#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0 -#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8 -#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12 -#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13 -#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15 -#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16 -#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL + +#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0 +#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8 +#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11 +#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12 +#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13 +#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15 +#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16 +#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32 +#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr1_tlb_int1_config_u { - unsigned long v; - struct uvh_gr1_tlb_int1_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; + unsigned long v; + struct uvh_gr1_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_GR1_TLB_MMR_CONTROL */ +/* ========================================================================= */ +#define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL +#define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL +#define UVH_GR1_TLB_MMR_CONTROL (is_uv1_hub() ? \ + UV1H_GR1_TLB_MMR_CONTROL : \ + UV2H_GR1_TLB_MMR_CONTROL) + +#define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UVH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL + +#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60 +#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL + +#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 +#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL + +union uvh_gr1_tlb_mmr_control_u { + unsigned long v; + struct uvh_gr1_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long rsvd_32_63:32; + } s; + struct uv1h_gr1_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long rsvd_32_47:16; + unsigned long mmr_inj_con:1; /* RW */ + unsigned long rsvd_49_51:3; + unsigned long mmr_inj_tlbram:1; /* RW */ + unsigned long rsvd_53:1; + unsigned long mmr_inj_tlbpgsize:1; /* RW */ + unsigned long rsvd_55:1; + unsigned long mmr_inj_tlbrreg:1; /* RW */ + unsigned long rsvd_57_59:3; + unsigned long mmr_inj_tlblruv:1; /* RW */ + unsigned long rsvd_61_63:3; + } s1; + struct uv2h_gr1_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long mmr_inj_con:1; /* RW */ + unsigned long rsvd_49_51:3; + unsigned long mmr_inj_tlbram:1; /* RW */ + unsigned long rsvd_53_63:11; + } s2; +}; + +/* ========================================================================= */ +/* UVH_GR1_TLB_MMR_READ_DATA_HI */ +/* ========================================================================= */ +#define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL +#define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL +#define UVH_GR1_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \ + UV1H_GR1_TLB_MMR_READ_DATA_HI : \ + UV2H_GR1_TLB_MMR_READ_DATA_HI) + +#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +union uvh_gr1_tlb_mmr_read_data_hi_u { + unsigned long v; + struct uvh_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s; +}; + +/* ========================================================================= */ +/* UVH_GR1_TLB_MMR_READ_DATA_LO */ +/* ========================================================================= */ +#define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL +#define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL +#define UVH_GR1_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \ + UV1H_GR1_TLB_MMR_READ_DATA_LO : \ + UV2H_GR1_TLB_MMR_READ_DATA_LO) + +#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +union uvh_gr1_tlb_mmr_read_data_lo_u { + unsigned long v; + struct uvh_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s; }; /* ========================================================================= */ /* UVH_INT_CMPB */ /* ========================================================================= */ -#define UVH_INT_CMPB 0x22080UL +#define UVH_INT_CMPB 0x22080UL -#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 -#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL +#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 +#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL union uvh_int_cmpb_u { - unsigned long v; - struct uvh_int_cmpb_s { - unsigned long real_time_cmpb : 56; /* RW */ - unsigned long rsvd_56_63 : 8; /* */ - } s; + unsigned long v; + struct uvh_int_cmpb_s { + unsigned long real_time_cmpb:56; /* RW */ + unsigned long rsvd_56_63:8; + } s; }; /* ========================================================================= */ /* UVH_INT_CMPC */ /* ========================================================================= */ -#define UVH_INT_CMPC 0x22100UL +#define UVH_INT_CMPC 0x22100UL -#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 -#define UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 -#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT (is_uv1_hub() ? \ - UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT : \ - UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT) -#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL -#define UV2H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL -#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK (is_uv1_hub() ? \ - UV1H_INT_CMPC_REAL_TIME_CMPC_MASK : \ - UV2H_INT_CMPC_REAL_TIME_CMPC_MASK) +#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0 +#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL union uvh_int_cmpc_u { - unsigned long v; - struct uvh_int_cmpc_s { - unsigned long real_time_cmpc : 56; /* RW */ - unsigned long rsvd_56_63 : 8; /* */ - } s; + unsigned long v; + struct uvh_int_cmpc_s { + unsigned long real_time_cmpc:56; /* RW */ + unsigned long rsvd_56_63:8; + } s; }; /* ========================================================================= */ /* UVH_INT_CMPD */ /* ========================================================================= */ -#define UVH_INT_CMPD 0x22180UL +#define UVH_INT_CMPD 0x22180UL -#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 -#define UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 -#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT (is_uv1_hub() ? \ - UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT : \ - UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT) -#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL -#define UV2H_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL -#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK (is_uv1_hub() ? \ - UV1H_INT_CMPD_REAL_TIME_CMPD_MASK : \ - UV2H_INT_CMPD_REAL_TIME_CMPD_MASK) +#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0 +#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL union uvh_int_cmpd_u { - unsigned long v; - struct uvh_int_cmpd_s { - unsigned long real_time_cmpd : 56; /* RW */ - unsigned long rsvd_56_63 : 8; /* */ - } s; + unsigned long v; + struct uvh_int_cmpd_s { + unsigned long real_time_cmpd:56; /* RW */ + unsigned long rsvd_56_63:8; + } s; }; /* ========================================================================= */ /* UVH_IPI_INT */ /* ========================================================================= */ -#define UVH_IPI_INT 0x60500UL -#define UVH_IPI_INT_32 0x348 +#define UVH_IPI_INT 0x60500UL +#define UVH_IPI_INT_32 0x348 -#define UVH_IPI_INT_VECTOR_SHFT 0 -#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL -#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 -#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL -#define UVH_IPI_INT_DESTMODE_SHFT 11 -#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL -#define UVH_IPI_INT_APIC_ID_SHFT 16 -#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL -#define UVH_IPI_INT_SEND_SHFT 63 -#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL +#define UVH_IPI_INT_VECTOR_SHFT 0 +#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 +#define UVH_IPI_INT_DESTMODE_SHFT 11 +#define UVH_IPI_INT_APIC_ID_SHFT 16 +#define UVH_IPI_INT_SEND_SHFT 63 +#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL +#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL +#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL +#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL +#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL union uvh_ipi_int_u { - unsigned long v; - struct uvh_ipi_int_s { - unsigned long vector_ : 8; /* RW */ - unsigned long delivery_mode : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long rsvd_12_15 : 4; /* */ - unsigned long apic_id : 32; /* RW */ - unsigned long rsvd_48_62 : 15; /* */ - unsigned long send : 1; /* WP */ - } s; + unsigned long v; + struct uvh_ipi_int_s { + unsigned long vector_:8; /* RW */ + unsigned long delivery_mode:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long rsvd_12_15:4; + unsigned long apic_id:32; /* RW */ + unsigned long rsvd_48_62:15; + unsigned long send:1; /* WP */ + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL union uvh_lb_bau_intd_payload_queue_first_u { - unsigned long v; - struct uvh_lb_bau_intd_payload_queue_first_s { - unsigned long rsvd_0_3: 4; /* */ - unsigned long address : 39; /* RW */ - unsigned long rsvd_43_48: 6; /* */ - unsigned long node_id : 14; /* RW */ - unsigned long rsvd_63 : 1; /* */ - } s; + unsigned long v; + struct uvh_lb_bau_intd_payload_queue_first_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_48:6; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL union uvh_lb_bau_intd_payload_queue_last_u { - unsigned long v; - struct uvh_lb_bau_intd_payload_queue_last_s { - unsigned long rsvd_0_3: 4; /* */ - unsigned long address : 39; /* RW */ - unsigned long rsvd_43_63: 21; /* */ - } s; + unsigned long v; + struct uvh_lb_bau_intd_payload_queue_last_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL union uvh_lb_bau_intd_payload_queue_tail_u { - unsigned long v; - struct uvh_lb_bau_intd_payload_queue_tail_s { - unsigned long rsvd_0_3: 4; /* */ - unsigned long address : 39; /* RW */ - unsigned long rsvd_43_63: 21; /* */ - } s; + unsigned long v; + struct uvh_lb_bau_intd_payload_queue_tail_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL union uvh_lb_bau_intd_software_acknowledge_u { - unsigned long v; - struct uvh_lb_bau_intd_software_acknowledge_s { - unsigned long pending_0 : 1; /* RW, W1C */ - unsigned long pending_1 : 1; /* RW, W1C */ - unsigned long pending_2 : 1; /* RW, W1C */ - unsigned long pending_3 : 1; /* RW, W1C */ - unsigned long pending_4 : 1; /* RW, W1C */ - unsigned long pending_5 : 1; /* RW, W1C */ - unsigned long pending_6 : 1; /* RW, W1C */ - unsigned long pending_7 : 1; /* RW, W1C */ - unsigned long timeout_0 : 1; /* RW, W1C */ - unsigned long timeout_1 : 1; /* RW, W1C */ - unsigned long timeout_2 : 1; /* RW, W1C */ - unsigned long timeout_3 : 1; /* RW, W1C */ - unsigned long timeout_4 : 1; /* RW, W1C */ - unsigned long timeout_5 : 1; /* RW, W1C */ - unsigned long timeout_6 : 1; /* RW, W1C */ - unsigned long timeout_7 : 1; /* RW, W1C */ - unsigned long rsvd_16_63: 48; /* */ - } s; + unsigned long v; + struct uvh_lb_bau_intd_software_acknowledge_s { + unsigned long pending_0:1; /* RW, W1C */ + unsigned long pending_1:1; /* RW, W1C */ + unsigned long pending_2:1; /* RW, W1C */ + unsigned long pending_3:1; /* RW, W1C */ + unsigned long pending_4:1; /* RW, W1C */ + unsigned long pending_5:1; /* RW, W1C */ + unsigned long pending_6:1; /* RW, W1C */ + unsigned long pending_7:1; /* RW, W1C */ + unsigned long timeout_0:1; /* RW, W1C */ + unsigned long timeout_1:1; /* RW, W1C */ + unsigned long timeout_2:1; /* RW, W1C */ + unsigned long timeout_3:1; /* RW, W1C */ + unsigned long timeout_4:1; /* RW, W1C */ + unsigned long timeout_5:1; /* RW, W1C */ + unsigned long timeout_6:1; /* RW, W1C */ + unsigned long timeout_7:1; /* RW, W1C */ + unsigned long rsvd_16_63:48; + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70 /* ========================================================================= */ /* UVH_LB_BAU_MISC_CONTROL */ /* ========================================================================= */ -#define UVH_LB_BAU_MISC_CONTROL 0x320170UL -#define UVH_LB_BAU_MISC_CONTROL_32 0xa10 - -#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 -#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL -#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 -#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL -#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 -#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL -#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 -#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UVH_LB_BAU_MISC_CONTROL 0x320170UL +#define UVH_LB_BAU_MISC_CONTROL_32 0xa10 + +#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 #define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 -#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL #define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL #define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 -#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL #define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 -#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 -#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL #define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 -#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL #define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 -#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL #define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 -#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL #define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL #define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL -#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 -#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL -#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 -#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL -#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 -#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL -#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 -#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 #define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 -#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL #define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL #define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL #define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 -#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL #define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 -#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL #define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 -#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL #define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 -#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL #define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 -#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL #define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 -#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL #define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 -#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL #define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL #define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL -#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 -#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL - -#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 -#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL -#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 -#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL -#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 -#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL -#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 -#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + +#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 #define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 -#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL #define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL #define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 -#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL #define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 -#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL #define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 -#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL #define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 -#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL #define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 -#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL #define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 -#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL -#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 -#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 #define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 -#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 -#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL #define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 -#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL #define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 +#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL #define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL -#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 -#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL +#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL union uvh_lb_bau_misc_control_u { - unsigned long v; - struct uvh_lb_bau_misc_control_s { - unsigned long rejection_delay : 8; /* RW */ - unsigned long apic_mode : 1; /* RW */ - unsigned long force_broadcast : 1; /* RW */ - unsigned long force_lock_nop : 1; /* RW */ - unsigned long qpi_agent_presence_vector : 3; /* RW */ - unsigned long descriptor_fetch_mode : 1; /* RW */ - unsigned long enable_intd_soft_ack_mode : 1; /* RW */ - unsigned long intd_soft_ack_timeout_period : 4; /* RW */ - unsigned long enable_dual_mapping_mode : 1; /* RW */ - unsigned long vga_io_port_decode_enable : 1; /* RW */ - unsigned long vga_io_port_16_bit_decode : 1; /* RW */ - unsigned long suppress_dest_registration : 1; /* RW */ - unsigned long programmed_initial_priority : 3; /* RW */ - unsigned long use_incoming_priority : 1; /* RW */ - unsigned long enable_programmed_initial_priority : 1; /* RW */ - unsigned long rsvd_29_63 : 35; - } s; - struct uv1h_lb_bau_misc_control_s { - unsigned long rejection_delay : 8; /* RW */ - unsigned long apic_mode : 1; /* RW */ - unsigned long force_broadcast : 1; /* RW */ - unsigned long force_lock_nop : 1; /* RW */ - unsigned long qpi_agent_presence_vector : 3; /* RW */ - unsigned long descriptor_fetch_mode : 1; /* RW */ - unsigned long enable_intd_soft_ack_mode : 1; /* RW */ - unsigned long intd_soft_ack_timeout_period : 4; /* RW */ - unsigned long enable_dual_mapping_mode : 1; /* RW */ - unsigned long vga_io_port_decode_enable : 1; /* RW */ - unsigned long vga_io_port_16_bit_decode : 1; /* RW */ - unsigned long suppress_dest_registration : 1; /* RW */ - unsigned long programmed_initial_priority : 3; /* RW */ - unsigned long use_incoming_priority : 1; /* RW */ - unsigned long enable_programmed_initial_priority : 1; /* RW */ - unsigned long rsvd_29_47 : 19; /* */ - unsigned long fun : 16; /* RW */ - } s1; - struct uv2h_lb_bau_misc_control_s { - unsigned long rejection_delay : 8; /* RW */ - unsigned long apic_mode : 1; /* RW */ - unsigned long force_broadcast : 1; /* RW */ - unsigned long force_lock_nop : 1; /* RW */ - unsigned long qpi_agent_presence_vector : 3; /* RW */ - unsigned long descriptor_fetch_mode : 1; /* RW */ - unsigned long enable_intd_soft_ack_mode : 1; /* RW */ - unsigned long intd_soft_ack_timeout_period : 4; /* RW */ - unsigned long enable_dual_mapping_mode : 1; /* RW */ - unsigned long vga_io_port_decode_enable : 1; /* RW */ - unsigned long vga_io_port_16_bit_decode : 1; /* RW */ - unsigned long suppress_dest_registration : 1; /* RW */ - unsigned long programmed_initial_priority : 3; /* RW */ - unsigned long use_incoming_priority : 1; /* RW */ - unsigned long enable_programmed_initial_priority : 1; /* RW */ - unsigned long enable_automatic_apic_mode_selection : 1; /* RW */ - unsigned long apic_mode_status : 1; /* RO */ - unsigned long suppress_interrupts_to_self : 1; /* RW */ - unsigned long enable_lock_based_system_flush : 1; /* RW */ - unsigned long enable_extended_sb_status : 1; /* RW */ - unsigned long suppress_int_prio_udt_to_self : 1; /* RW */ - unsigned long use_legacy_descriptor_formats : 1; /* RW */ - unsigned long rsvd_36_47 : 12; /* */ - unsigned long fun : 16; /* RW */ - } s2; + unsigned long v; + struct uvh_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long enable_intd_soft_ack_mode:1; /* RW */ + unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long rsvd_29_63:35; + } s; + struct uv1h_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long enable_intd_soft_ack_mode:1; /* RW */ + unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long rsvd_29_47:19; + unsigned long fun:16; /* RW */ + } s1; + struct uv2h_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long enable_intd_soft_ack_mode:1; /* RW */ + unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long enable_automatic_apic_mode_selection:1;/* RW */ + unsigned long apic_mode_status:1; /* RO */ + unsigned long suppress_interrupts_to_self:1; /* RW */ + unsigned long enable_lock_based_system_flush:1;/* RW */ + unsigned long enable_extended_sb_status:1; /* RW */ + unsigned long suppress_int_prio_udt_to_self:1;/* RW */ + unsigned long use_legacy_descriptor_formats:1;/* RW */ + unsigned long rsvd_36_47:12; + unsigned long fun:16; /* RW */ + } s2; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL union uvh_lb_bau_sb_activation_control_u { - unsigned long v; - struct uvh_lb_bau_sb_activation_control_s { - unsigned long index : 6; /* RW */ - unsigned long rsvd_6_61: 56; /* */ - unsigned long push : 1; /* WP */ - unsigned long init : 1; /* WP */ - } s; + unsigned long v; + struct uvh_lb_bau_sb_activation_control_s { + unsigned long index:6; /* RW */ + unsigned long rsvd_6_61:56; + unsigned long push:1; /* WP */ + unsigned long init:1; /* WP */ + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL union uvh_lb_bau_sb_activation_status_0_u { - unsigned long v; - struct uvh_lb_bau_sb_activation_status_0_s { - unsigned long status : 64; /* RW */ - } s; + unsigned long v; + struct uvh_lb_bau_sb_activation_status_0_s { + unsigned long status:64; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL union uvh_lb_bau_sb_activation_status_1_u { - unsigned long v; - struct uvh_lb_bau_sb_activation_status_1_s { - unsigned long status : 64; /* RW */ - } s; + unsigned long v; + struct uvh_lb_bau_sb_activation_status_1_s { + unsigned long status:64; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL union uvh_lb_bau_sb_descriptor_base_u { - unsigned long v; - struct uvh_lb_bau_sb_descriptor_base_s { - unsigned long rsvd_0_11 : 12; /* */ - unsigned long page_address : 31; /* RW */ - unsigned long rsvd_43_48 : 6; /* */ - unsigned long node_id : 14; /* RW */ - unsigned long rsvd_63 : 1; /* */ - } s; + unsigned long v; + struct uvh_lb_bau_sb_descriptor_base_s { + unsigned long rsvd_0_11:12; + unsigned long page_address:31; /* RW */ + unsigned long rsvd_43_48:6; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s; }; /* ========================================================================= */ /* UVH_NODE_ID */ /* ========================================================================= */ -#define UVH_NODE_ID 0x0UL - -#define UVH_NODE_ID_FORCE1_SHFT 0 -#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL -#define UVH_NODE_ID_MANUFACTURER_SHFT 1 -#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL -#define UVH_NODE_ID_PART_NUMBER_SHFT 12 -#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL -#define UVH_NODE_ID_REVISION_SHFT 28 -#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL -#define UVH_NODE_ID_NODE_ID_SHFT 32 -#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL - -#define UV1H_NODE_ID_FORCE1_SHFT 0 -#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL -#define UV1H_NODE_ID_MANUFACTURER_SHFT 1 -#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL -#define UV1H_NODE_ID_PART_NUMBER_SHFT 12 -#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL -#define UV1H_NODE_ID_REVISION_SHFT 28 -#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL -#define UV1H_NODE_ID_NODE_ID_SHFT 32 -#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL -#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48 -#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL -#define UV1H_NODE_ID_NI_PORT_SHFT 56 -#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL - -#define UV2H_NODE_ID_FORCE1_SHFT 0 -#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL -#define UV2H_NODE_ID_MANUFACTURER_SHFT 1 -#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL -#define UV2H_NODE_ID_PART_NUMBER_SHFT 12 -#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL -#define UV2H_NODE_ID_REVISION_SHFT 28 -#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL -#define UV2H_NODE_ID_NODE_ID_SHFT 32 -#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL -#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50 -#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL -#define UV2H_NODE_ID_NI_PORT_SHFT 57 -#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL +#define UVH_NODE_ID 0x0UL + +#define UVH_NODE_ID_FORCE1_SHFT 0 +#define UVH_NODE_ID_MANUFACTURER_SHFT 1 +#define UVH_NODE_ID_PART_NUMBER_SHFT 12 +#define UVH_NODE_ID_REVISION_SHFT 28 +#define UVH_NODE_ID_NODE_ID_SHFT 32 +#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL + +#define UV1H_NODE_ID_FORCE1_SHFT 0 +#define UV1H_NODE_ID_MANUFACTURER_SHFT 1 +#define UV1H_NODE_ID_PART_NUMBER_SHFT 12 +#define UV1H_NODE_ID_REVISION_SHFT 28 +#define UV1H_NODE_ID_NODE_ID_SHFT 32 +#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48 +#define UV1H_NODE_ID_NI_PORT_SHFT 56 +#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL +#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL + +#define UV2H_NODE_ID_FORCE1_SHFT 0 +#define UV2H_NODE_ID_MANUFACTURER_SHFT 1 +#define UV2H_NODE_ID_PART_NUMBER_SHFT 12 +#define UV2H_NODE_ID_REVISION_SHFT 28 +#define UV2H_NODE_ID_NODE_ID_SHFT 32 +#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50 +#define UV2H_NODE_ID_NI_PORT_SHFT 57 +#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL +#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL union uvh_node_id_u { - unsigned long v; - struct uvh_node_id_s { - unsigned long force1 : 1; /* RO */ - unsigned long manufacturer : 11; /* RO */ - unsigned long part_number : 16; /* RO */ - unsigned long revision : 4; /* RO */ - unsigned long node_id : 15; /* RW */ - unsigned long rsvd_47_63 : 17; - } s; - struct uv1h_node_id_s { - unsigned long force1 : 1; /* RO */ - unsigned long manufacturer : 11; /* RO */ - unsigned long part_number : 16; /* RO */ - unsigned long revision : 4; /* RO */ - unsigned long node_id : 15; /* RW */ - unsigned long rsvd_47 : 1; /* */ - unsigned long nodes_per_bit : 7; /* RW */ - unsigned long rsvd_55 : 1; /* */ - unsigned long ni_port : 4; /* RO */ - unsigned long rsvd_60_63 : 4; /* */ - } s1; - struct uv2h_node_id_s { - unsigned long force1 : 1; /* RO */ - unsigned long manufacturer : 11; /* RO */ - unsigned long part_number : 16; /* RO */ - unsigned long revision : 4; /* RO */ - unsigned long node_id : 15; /* RW */ - unsigned long rsvd_47_49 : 3; /* */ - unsigned long nodes_per_bit : 7; /* RO */ - unsigned long ni_port : 5; /* RO */ - unsigned long rsvd_62_63 : 2; /* */ - } s2; + unsigned long v; + struct uvh_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47_63:17; + } s; + struct uv1h_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47:1; + unsigned long nodes_per_bit:7; /* RW */ + unsigned long rsvd_55:1; + unsigned long ni_port:4; /* RO */ + unsigned long rsvd_60_63:4; + } s1; + struct uv2h_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47_49:3; + unsigned long nodes_per_bit:7; /* RO */ + unsigned long ni_port:5; /* RO */ + unsigned long rsvd_62_63:2; + } s2; }; /* ========================================================================= */ /* UVH_NODE_PRESENT_TABLE */ /* ========================================================================= */ -#define UVH_NODE_PRESENT_TABLE 0x1400UL -#define UVH_NODE_PRESENT_TABLE_DEPTH 16 +#define UVH_NODE_PRESENT_TABLE 0x1400UL +#define UVH_NODE_PRESENT_TABLE_DEPTH 16 -#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 -#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL +#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 +#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL union uvh_node_present_table_u { - unsigned long v; - struct uvh_node_present_table_s { - unsigned long nodes : 64; /* RW */ - } s; + unsigned long v; + struct uvh_node_present_table_s { + unsigned long nodes:64; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_alias210_overlay_config_0_mmr_u { - unsigned long v; - struct uvh_rh_gam_alias210_overlay_config_0_mmr_s { - unsigned long rsvd_0_23: 24; /* */ - unsigned long base : 8; /* RW */ - unsigned long rsvd_32_47: 16; /* */ - unsigned long m_alias : 5; /* RW */ - unsigned long rsvd_53_62: 10; /* */ - unsigned long enable : 1; /* RW */ - } s; + unsigned long v; + struct uvh_rh_gam_alias210_overlay_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_alias210_overlay_config_1_mmr_u { - unsigned long v; - struct uvh_rh_gam_alias210_overlay_config_1_mmr_s { - unsigned long rsvd_0_23: 24; /* */ - unsigned long base : 8; /* RW */ - unsigned long rsvd_32_47: 16; /* */ - unsigned long m_alias : 5; /* RW */ - unsigned long rsvd_53_62: 10; /* */ - unsigned long enable : 1; /* RW */ - } s; + unsigned long v; + struct uvh_rh_gam_alias210_overlay_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_alias210_overlay_config_2_mmr_u { - unsigned long v; - struct uvh_rh_gam_alias210_overlay_config_2_mmr_s { - unsigned long rsvd_0_23: 24; /* */ - unsigned long base : 8; /* RW */ - unsigned long rsvd_32_47: 16; /* */ - unsigned long m_alias : 5; /* RW */ - unsigned long rsvd_53_62: 10; /* */ - unsigned long enable : 1; /* RW */ - } s; + unsigned long v; + struct uvh_rh_gam_alias210_overlay_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL union uvh_rh_gam_alias210_redirect_config_0_mmr_u { - unsigned long v; - struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { - unsigned long rsvd_0_23 : 24; /* */ - unsigned long dest_base : 22; /* RW */ - unsigned long rsvd_46_63: 18; /* */ - } s; + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s; }; /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL union uvh_rh_gam_alias210_redirect_config_1_mmr_u { - unsigned long v; - struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { - unsigned long rsvd_0_23 : 24; /* */ - unsigned long dest_base : 22; /* RW */ - unsigned long rsvd_46_63: 18; /* */ - } s; + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s; }; /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL union uvh_rh_gam_alias210_redirect_config_2_mmr_u { - unsigned long v; - struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { - unsigned long rsvd_0_23 : 24; /* */ - unsigned long dest_base : 22; /* RW */ - unsigned long rsvd_46_63: 18; /* */ - } s; + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s; }; /* ========================================================================= */ /* UVH_RH_GAM_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL +#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL -#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 -#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL -#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL +#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL -#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 -#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL -#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL -#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12 -#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL +#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12 +#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL +#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL -#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 -#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL -#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL +#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL union uvh_rh_gam_config_mmr_u { - unsigned long v; - struct uvh_rh_gam_config_mmr_s { - unsigned long m_skt : 6; /* RW */ - unsigned long n_skt : 4; /* RW */ - unsigned long rsvd_10_63 : 54; - } s; - struct uv1h_rh_gam_config_mmr_s { - unsigned long m_skt : 6; /* RW */ - unsigned long n_skt : 4; /* RW */ - unsigned long rsvd_10_11: 2; /* */ - unsigned long mmiol_cfg : 1; /* RW */ - unsigned long rsvd_13_63: 51; /* */ - } s1; - struct uv2h_rh_gam_config_mmr_s { - unsigned long m_skt : 6; /* RW */ - unsigned long n_skt : 4; /* RW */ - unsigned long rsvd_10_63: 54; /* */ - } s2; + unsigned long v; + struct uvh_rh_gam_config_mmr_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s; + struct uv1h_rh_gam_config_mmr_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_11:2; + unsigned long mmiol_cfg:1; /* RW */ + unsigned long rsvd_13_63:51; + } s1; + struct uv2h_rh_gam_config_mmr_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s2; }; /* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_gru_overlay_config_mmr_u { - unsigned long v; - struct uvh_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27: 28; /* */ - unsigned long base : 18; /* RW */ - unsigned long rsvd_46_62 : 17; - unsigned long enable : 1; /* RW */ - } s; - struct uv1h_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27: 28; /* */ - unsigned long base : 18; /* RW */ - unsigned long rsvd_46_47: 2; /* */ - unsigned long gr4 : 1; /* RW */ - unsigned long rsvd_49_51: 3; /* */ - unsigned long n_gru : 4; /* RW */ - unsigned long rsvd_56_62: 7; /* */ - unsigned long enable : 1; /* RW */ - } s1; - struct uv2h_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27: 28; /* */ - unsigned long base : 18; /* RW */ - unsigned long rsvd_46_51: 6; /* */ - unsigned long n_gru : 4; /* RW */ - unsigned long rsvd_56_62: 7; /* */ - unsigned long enable : 1; /* RW */ - } s2; + unsigned long v; + struct uvh_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27:28; + unsigned long base:18; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } s; + struct uv1h_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27:28; + unsigned long base:18; /* RW */ + unsigned long rsvd_46_47:2; + unsigned long gr4:1; /* RW */ + unsigned long rsvd_49_51:3; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s1; + struct uv2h_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27:28; + unsigned long base:18; /* RW */ + unsigned long rsvd_46_51:6; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s2; }; /* ========================================================================= */ /* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27 -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 -#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_mmioh_overlay_config_mmr_u { - unsigned long v; - struct uv1h_rh_gam_mmioh_overlay_config_mmr_s { - unsigned long rsvd_0_29: 30; /* */ - unsigned long base : 16; /* RW */ - unsigned long m_io : 6; /* RW */ - unsigned long n_io : 4; /* RW */ - unsigned long rsvd_56_62: 7; /* */ - unsigned long enable : 1; /* RW */ - } s1; - struct uv2h_rh_gam_mmioh_overlay_config_mmr_s { - unsigned long rsvd_0_26: 27; /* */ - unsigned long base : 19; /* RW */ - unsigned long m_io : 6; /* RW */ - unsigned long n_io : 4; /* RW */ - unsigned long rsvd_56_62: 7; /* */ - unsigned long enable : 1; /* RW */ - } s2; + unsigned long v; + struct uv1h_rh_gam_mmioh_overlay_config_mmr_s { + unsigned long rsvd_0_29:30; + unsigned long base:16; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s1; + struct uv2h_rh_gam_mmioh_overlay_config_mmr_s { + unsigned long rsvd_0_26:27; + unsigned long base:19; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s2; }; /* ========================================================================= */ /* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 #define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46 +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL #define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL -#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL -#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_mmr_overlay_config_mmr_u { - unsigned long v; - struct uvh_rh_gam_mmr_overlay_config_mmr_s { - unsigned long rsvd_0_25: 26; /* */ - unsigned long base : 20; /* RW */ - unsigned long rsvd_46_62 : 17; - unsigned long enable : 1; /* RW */ - } s; - struct uv1h_rh_gam_mmr_overlay_config_mmr_s { - unsigned long rsvd_0_25: 26; /* */ - unsigned long base : 20; /* RW */ - unsigned long dual_hub : 1; /* RW */ - unsigned long rsvd_47_62: 16; /* */ - unsigned long enable : 1; /* RW */ - } s1; - struct uv2h_rh_gam_mmr_overlay_config_mmr_s { - unsigned long rsvd_0_25: 26; /* */ - unsigned long base : 20; /* RW */ - unsigned long rsvd_46_62: 17; /* */ - unsigned long enable : 1; /* RW */ - } s2; + unsigned long v; + struct uvh_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } s; + struct uv1h_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long dual_hub:1; /* RW */ + unsigned long rsvd_47_62:16; + unsigned long enable:1; /* RW */ + } s1; + struct uv2h_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } s2; }; /* ========================================================================= */ /* UVH_RTC */ /* ========================================================================= */ -#define UVH_RTC 0x340000UL +#define UVH_RTC 0x340000UL -#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 -#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL +#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 +#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL union uvh_rtc_u { - unsigned long v; - struct uvh_rtc_s { - unsigned long real_time_clock : 56; /* RW */ - unsigned long rsvd_56_63 : 8; /* */ - } s; + unsigned long v; + struct uvh_rtc_s { + unsigned long real_time_clock:56; /* RW */ + unsigned long rsvd_56_63:8; + } s; }; /* ========================================================================= */ /* UVH_RTC1_INT_CONFIG */ /* ========================================================================= */ -#define UVH_RTC1_INT_CONFIG 0x615c0UL - -#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0 -#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_RTC1_INT_CONFIG_DM_SHFT 8 -#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11 -#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12 -#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_RTC1_INT_CONFIG_P_SHFT 13 -#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_RTC1_INT_CONFIG_T_SHFT 15 -#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_RTC1_INT_CONFIG_M_SHFT 16 -#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32 -#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_RTC1_INT_CONFIG 0x615c0UL + +#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0 +#define UVH_RTC1_INT_CONFIG_DM_SHFT 8 +#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11 +#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12 +#define UVH_RTC1_INT_CONFIG_P_SHFT 13 +#define UVH_RTC1_INT_CONFIG_T_SHFT 15 +#define UVH_RTC1_INT_CONFIG_M_SHFT 16 +#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32 +#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_rtc1_int_config_u { - unsigned long v; - struct uvh_rtc1_int_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; + unsigned long v; + struct uvh_rtc1_int_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_SCRATCH5 */ /* ========================================================================= */ -#define UVH_SCRATCH5 0x2d0200UL -#define UVH_SCRATCH5_32 0x778 +#define UVH_SCRATCH5 0x2d0200UL +#define UVH_SCRATCH5_32 0x778 -#define UVH_SCRATCH5_SCRATCH5_SHFT 0 -#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL +#define UVH_SCRATCH5_SCRATCH5_SHFT 0 +#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL union uvh_scratch5_u { - unsigned long v; - struct uvh_scratch5_s { - unsigned long scratch5 : 64; /* RW, W1CS */ - } s; + unsigned long v; + struct uvh_scratch5_s { + unsigned long scratch5:64; /* RW, W1CS */ + } s; }; /* ========================================================================= */ /* UV2H_EVENT_OCCURRED2 */ /* ========================================================================= */ -#define UV2H_EVENT_OCCURRED2 0x70100UL -#define UV2H_EVENT_OCCURRED2_32 0xb68 - -#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0 -#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL -#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1 -#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL -#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2 -#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL -#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3 -#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL -#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4 -#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL -#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5 -#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL -#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6 -#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL -#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7 -#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL -#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8 -#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL -#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9 -#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL -#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10 -#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL -#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11 -#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL -#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12 -#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL -#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13 -#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL -#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14 -#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL -#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15 -#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL -#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16 -#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL -#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17 -#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL -#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18 -#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL -#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19 -#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL -#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20 -#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL -#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21 -#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL -#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22 -#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL -#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23 -#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL -#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24 -#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL -#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25 -#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL -#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26 -#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL -#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27 -#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL -#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28 -#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL -#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29 -#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL -#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30 -#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL -#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31 -#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL +#define UV2H_EVENT_OCCURRED2 0x70100UL +#define UV2H_EVENT_OCCURRED2_32 0xb68 + +#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL union uv2h_event_occurred2_u { - unsigned long v; - struct uv2h_event_occurred2_s { - unsigned long rtc_0 : 1; /* RW */ - unsigned long rtc_1 : 1; /* RW */ - unsigned long rtc_2 : 1; /* RW */ - unsigned long rtc_3 : 1; /* RW */ - unsigned long rtc_4 : 1; /* RW */ - unsigned long rtc_5 : 1; /* RW */ - unsigned long rtc_6 : 1; /* RW */ - unsigned long rtc_7 : 1; /* RW */ - unsigned long rtc_8 : 1; /* RW */ - unsigned long rtc_9 : 1; /* RW */ - unsigned long rtc_10 : 1; /* RW */ - unsigned long rtc_11 : 1; /* RW */ - unsigned long rtc_12 : 1; /* RW */ - unsigned long rtc_13 : 1; /* RW */ - unsigned long rtc_14 : 1; /* RW */ - unsigned long rtc_15 : 1; /* RW */ - unsigned long rtc_16 : 1; /* RW */ - unsigned long rtc_17 : 1; /* RW */ - unsigned long rtc_18 : 1; /* RW */ - unsigned long rtc_19 : 1; /* RW */ - unsigned long rtc_20 : 1; /* RW */ - unsigned long rtc_21 : 1; /* RW */ - unsigned long rtc_22 : 1; /* RW */ - unsigned long rtc_23 : 1; /* RW */ - unsigned long rtc_24 : 1; /* RW */ - unsigned long rtc_25 : 1; /* RW */ - unsigned long rtc_26 : 1; /* RW */ - unsigned long rtc_27 : 1; /* RW */ - unsigned long rtc_28 : 1; /* RW */ - unsigned long rtc_29 : 1; /* RW */ - unsigned long rtc_30 : 1; /* RW */ - unsigned long rtc_31 : 1; /* RW */ - unsigned long rsvd_32_63: 32; /* */ - } s1; + unsigned long v; + struct uv2h_event_occurred2_s { + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; /* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_32_63:32; + } s1; }; /* ========================================================================= */ /* UV2H_EVENT_OCCURRED2_ALIAS */ /* ========================================================================= */ -#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL -#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70 +#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL +#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70 /* ========================================================================= */ /* UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 */ /* ========================================================================= */ -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL union uv2h_lb_bau_sb_activation_status_2_u { - unsigned long v; - struct uv2h_lb_bau_sb_activation_status_2_s { - unsigned long aux_error : 64; /* RW */ - } s1; + unsigned long v; + struct uv2h_lb_bau_sb_activation_status_2_s { + unsigned long aux_error:64; /* RW */ + } s1; }; /* ========================================================================= */ /* UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK */ /* ========================================================================= */ -#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL -#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0 +#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL +#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0 #define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0 #define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL union uv1h_lb_target_physical_apic_id_mask_u { - unsigned long v; - struct uv1h_lb_target_physical_apic_id_mask_s { - unsigned long bit_enables : 32; /* RW */ - unsigned long rsvd_32_63 : 32; /* */ - } s1; + unsigned long v; + struct uv1h_lb_target_physical_apic_id_mask_s { + unsigned long bit_enables:32; /* RW */ + unsigned long rsvd_32_63:32; + } s1; }; -#endif /* __ASM_UV_MMRS_X86_H__ */ +#endif /* _ASM_X86_UV_UV_MMRS_H */ -- cgit v1.1 From df049672dddde4a2fdacf63fb32eb80146e26841 Mon Sep 17 00:00:00 2001 From: Tero Roponen Date: Tue, 31 May 2011 10:24:39 +0300 Subject: x86: tsc: Remove unneeded DMI-based blacklisting The blacklist was added in response to my bug report (http://lkml.org/lkml/2006/1/19/362) and has never contained more than the one entry describing my old now dead ThinkPad 380XD laptop. As found out later (http://lkml.org/lkml/2007/11/29/50), this special treatment has been unnecessary for a long time, so it can be removed. Signed-off-by: Tero Roponen Signed-off-by: John Stultz --- arch/x86/kernel/tsc.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6cc6922..5c45c62 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -800,27 +799,6 @@ void mark_tsc_unstable(char *reason) EXPORT_SYMBOL_GPL(mark_tsc_unstable); -static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", - d->ident); - tsc_unstable = 1; - return 0; -} - -/* List of systems that have known TSC problems */ -static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { - { - .callback = dmi_mark_tsc_unstable, - .ident = "IBM Thinkpad 380XD", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), - }, - }, - {} -}; - static void __init check_system_tsc_reliable(void) { #ifdef CONFIG_MGEODE_LX @@ -1010,8 +988,6 @@ void __init tsc_init(void) lpj_fine = lpj; use_tsc_delay(); - /* Check and install the TSC clocksource */ - dmi_check_system(bad_tsc_dmi_table); if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); -- cgit v1.1 From 6885685923ee786f26e7b170e3b961ac0fa14037 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Thu, 2 Jun 2011 14:59:43 -0500 Subject: x66, UV: Enable 64-bit ACPI MFCG support for SGI UV2 platform Enable 64-bit ACPI MFCG support for SGI UV2 platform. The check is similar to the check on UV1. UV2 has a different oem_id string. Signed-off-by: Jack Steiner Link: http://lkml.kernel.org/r/20110602195943.GA27079@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/pci/mmconfig-shared.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 750c346..301e325 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -519,7 +519,8 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, if (cfg->address < 0xFFFFFFFF) return 0; - if (!strcmp(mcfg->header.oem_id, "SGI")) + if (!strcmp(mcfg->header.oem_id, "SGI") || + !strcmp(mcfg->header.oem_id, "SGI2")) return 0; if (mcfg->header.revision >= 1) { -- cgit v1.1 From a268fcfaa6ab2ef740fda5ecf947aca45ccd535d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 31 May 2011 22:21:51 +0200 Subject: x86, asm: Thin down SAVE/RESTORE_* asm macros Use dwarf2 cfi annotation macros, making SAVE/RESTORE_* marginally more readable. No functionality change. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1306873314-32523-2-git-send-email-bp@alien8.de Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/calling.h | 101 +++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 30af5a8..b67e06c 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -46,6 +46,7 @@ For 32-bit we have the following conventions - kernel is built with */ +#include "dwarf2.h" /* * 64-bit system call stack frame layout defines and helpers, for @@ -87,30 +88,25 @@ For 32-bit we have the following conventions - kernel is built with .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0 subq $9*8+\addskip, %rsp CFI_ADJUST_CFA_OFFSET 9*8+\addskip - movq %rdi, 8*8(%rsp) - CFI_REL_OFFSET rdi, 8*8 - movq %rsi, 7*8(%rsp) - CFI_REL_OFFSET rsi, 7*8 - movq %rdx, 6*8(%rsp) - CFI_REL_OFFSET rdx, 6*8 + movq_cfi rdi, 8*8 + movq_cfi rsi, 7*8 + movq_cfi rdx, 6*8 + .if \norcx .else - movq %rcx, 5*8(%rsp) - CFI_REL_OFFSET rcx, 5*8 + movq_cfi rcx, 5*8 .endif - movq %rax, 4*8(%rsp) - CFI_REL_OFFSET rax, 4*8 + + movq_cfi rax, 4*8 + .if \nor891011 .else - movq %r8, 3*8(%rsp) - CFI_REL_OFFSET r8, 3*8 - movq %r9, 2*8(%rsp) - CFI_REL_OFFSET r9, 2*8 - movq %r10, 1*8(%rsp) - CFI_REL_OFFSET r10, 1*8 - movq %r11, (%rsp) - CFI_REL_OFFSET r11, 0*8 + movq_cfi r8, 3*8 + movq_cfi r9, 2*8 + movq_cfi r10, 1*8 + movq_cfi r11, 0*8 .endif + .endm #define ARG_SKIP (9*8) @@ -119,37 +115,34 @@ For 32-bit we have the following conventions - kernel is built with skipr8910=0, skiprdx=0 .if \skipr11 .else - movq (%rsp), %r11 - CFI_RESTORE r11 + movq_cfi_restore 0*8, r11 .endif + .if \skipr8910 .else - movq 1*8(%rsp), %r10 - CFI_RESTORE r10 - movq 2*8(%rsp), %r9 - CFI_RESTORE r9 - movq 3*8(%rsp), %r8 - CFI_RESTORE r8 + movq_cfi_restore 1*8, r10 + movq_cfi_restore 2*8, r9 + movq_cfi_restore 3*8, r8 .endif + .if \skiprax .else - movq 4*8(%rsp), %rax - CFI_RESTORE rax + movq_cfi_restore 4*8, rax .endif + .if \skiprcx .else - movq 5*8(%rsp), %rcx - CFI_RESTORE rcx + movq_cfi_restore 5*8, rcx .endif + .if \skiprdx .else - movq 6*8(%rsp), %rdx - CFI_RESTORE rdx + movq_cfi_restore 6*8, rdx .endif - movq 7*8(%rsp), %rsi - CFI_RESTORE rsi - movq 8*8(%rsp), %rdi - CFI_RESTORE rdi + + movq_cfi_restore 7*8, rsi + movq_cfi_restore 8*8, rdi + .if ARG_SKIP+\addskip > 0 addq $ARG_SKIP+\addskip, %rsp CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip) @@ -176,33 +169,21 @@ For 32-bit we have the following conventions - kernel is built with .macro SAVE_REST subq $REST_SKIP, %rsp CFI_ADJUST_CFA_OFFSET REST_SKIP - movq %rbx, 5*8(%rsp) - CFI_REL_OFFSET rbx, 5*8 - movq %rbp, 4*8(%rsp) - CFI_REL_OFFSET rbp, 4*8 - movq %r12, 3*8(%rsp) - CFI_REL_OFFSET r12, 3*8 - movq %r13, 2*8(%rsp) - CFI_REL_OFFSET r13, 2*8 - movq %r14, 1*8(%rsp) - CFI_REL_OFFSET r14, 1*8 - movq %r15, (%rsp) - CFI_REL_OFFSET r15, 0*8 + movq_cfi rbx, 5*8 + movq_cfi rbp, 4*8 + movq_cfi r12, 3*8 + movq_cfi r13, 2*8 + movq_cfi r14, 1*8 + movq_cfi r15, 0*8 .endm .macro RESTORE_REST - movq (%rsp), %r15 - CFI_RESTORE r15 - movq 1*8(%rsp), %r14 - CFI_RESTORE r14 - movq 2*8(%rsp), %r13 - CFI_RESTORE r13 - movq 3*8(%rsp), %r12 - CFI_RESTORE r12 - movq 4*8(%rsp), %rbp - CFI_RESTORE rbp - movq 5*8(%rsp), %rbx - CFI_RESTORE rbx + movq_cfi_restore 0*8, r15 + movq_cfi_restore 1*8, r14 + movq_cfi_restore 2*8, r13 + movq_cfi_restore 3*8, r12 + movq_cfi_restore 4*8, rbp + movq_cfi_restore 5*8, rbx addq $REST_SKIP, %rsp CFI_ADJUST_CFA_OFFSET -(REST_SKIP) .endm -- cgit v1.1 From cac0e0a78f722abd85b7f8d614ee0820f7672f58 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 31 May 2011 22:21:52 +0200 Subject: x86, asm: Flip SAVE_ARGS arguments logic This saves us the else part of the conditional statement in the macro. No functionality change. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1306873314-32523-3-git-send-email-bp@alien8.de Signed-off-by: H. Peter Anvin --- arch/x86/ia32/ia32entry.S | 6 +++--- arch/x86/include/asm/calling.h | 8 +++----- arch/x86/kernel/entry_64.S | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index c1870dd..c5435dc 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -143,7 +143,7 @@ ENTRY(ia32_sysenter_target) CFI_REL_OFFSET rip,0 pushq_cfi %rax cld - SAVE_ARGS 0,0,1 + SAVE_ARGS 0,1,0 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ 1: movl (%rbp),%ebp @@ -289,7 +289,7 @@ ENTRY(ia32_cstar_target) * disabled irqs and here we enable it straight after entry: */ ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_ARGS 8,1,1 + SAVE_ARGS 8,0,0 movl %eax,%eax /* zero extension */ movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) @@ -419,7 +419,7 @@ ENTRY(ia32_syscall) cld /* note the registers are not zero extended to the sf. this could be a problem. */ - SAVE_ARGS 0,0,1 + SAVE_ARGS 0,1,0 GET_THREAD_INFO(%r10) orl $TS_COMPAT,TI_status(%r10) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index b67e06c..b0b7d90 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -85,22 +85,20 @@ For 32-bit we have the following conventions - kernel is built with #define ARGOFFSET R11 #define SWFRAME ORIG_RAX - .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0 + .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1 subq $9*8+\addskip, %rsp CFI_ADJUST_CFA_OFFSET 9*8+\addskip movq_cfi rdi, 8*8 movq_cfi rsi, 7*8 movq_cfi rdx, 6*8 - .if \norcx - .else + .if \save_rcx movq_cfi rcx, 5*8 .endif movq_cfi rax, 4*8 - .if \nor891011 - .else + .if \save_r891011 movq_cfi r8, 3*8 movq_cfi r9, 2*8 movq_cfi r10, 1*8 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8a445a0..e5ece6b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -473,7 +473,7 @@ ENTRY(system_call_after_swapgs) * and short: */ ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_ARGS 8,1 + SAVE_ARGS 8,0 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET -- cgit v1.1 From 838feb47549a9b73534c6c1d7da4a9639a0750f4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 31 May 2011 22:21:53 +0200 Subject: x86, asm: Flip RESTORE_ARGS arguments logic ... thus getting rid of the "else" part of the conditional statement in the macro. No functionality change. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1306873314-32523-4-git-send-email-bp@alien8.de Signed-off-by: H. Peter Anvin --- arch/x86/ia32/ia32entry.S | 4 ++-- arch/x86/include/asm/calling.h | 21 ++++++++------------- arch/x86/kernel/entry_64.S | 4 ++-- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index c5435dc..a0e866d 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -173,7 +173,7 @@ sysexit_from_sys_call: andl $~0x200,EFLAGS-R11(%rsp) movl RIP-R11(%rsp),%edx /* User %eip */ CFI_REGISTER rip,rdx - RESTORE_ARGS 1,24,1,1,1,1 + RESTORE_ARGS 0,24,0,0,0,0 xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 @@ -328,7 +328,7 @@ cstar_dispatch: jnz sysretl_audit sysretl_from_sys_call: andl $~TS_COMPAT,TI_status(%r10) - RESTORE_ARGS 1,-ARG_SKIP,1,1,1 + RESTORE_ARGS 0,-ARG_SKIP,0,0,0 movl RIP-ARGOFFSET(%rsp),%ecx CFI_REGISTER rip,rcx movl EFLAGS-ARGOFFSET(%rsp),%r11d diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index b0b7d90..a9e3a74 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -109,32 +109,27 @@ For 32-bit we have the following conventions - kernel is built with #define ARG_SKIP (9*8) - .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \ - skipr8910=0, skiprdx=0 - .if \skipr11 - .else + .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \ + rstor_r8910=1, rstor_rdx=1 + .if \rstor_r11 movq_cfi_restore 0*8, r11 .endif - .if \skipr8910 - .else + .if \rstor_r8910 movq_cfi_restore 1*8, r10 movq_cfi_restore 2*8, r9 movq_cfi_restore 3*8, r8 .endif - .if \skiprax - .else + .if \rstor_rax movq_cfi_restore 4*8, rax .endif - .if \skiprcx - .else + .if \rstor_rcx movq_cfi_restore 5*8, rcx .endif - .if \skiprdx - .else + .if \rstor_rdx movq_cfi_restore 6*8, rdx .endif @@ -193,7 +188,7 @@ For 32-bit we have the following conventions - kernel is built with .macro RESTORE_ALL addskip=0 RESTORE_REST - RESTORE_ARGS 0, \addskip + RESTORE_ARGS 1, \addskip .endm .macro icebp diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e5ece6b..0412bcb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -508,7 +508,7 @@ sysret_check: TRACE_IRQS_ON movq RIP-ARGOFFSET(%rsp),%rcx CFI_REGISTER rip,rcx - RESTORE_ARGS 0,-ARG_SKIP,1 + RESTORE_ARGS 1,-ARG_SKIP,0 /*CFI_REGISTER rflags,r11*/ movq PER_CPU_VAR(old_rsp), %rsp USERGS_SYSRET64 @@ -858,7 +858,7 @@ retint_restore_args: /* return to kernel space */ */ TRACE_IRQS_IRETQ restore_args: - RESTORE_ARGS 0,8,0 + RESTORE_ARGS 1,8,1 irq_return: INTERRUPT_RETURN -- cgit v1.1 From 38e6b75d3b6f4b9445eb6486e28fc716acda44ae Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 31 May 2011 22:21:54 +0200 Subject: x86, asm: Cleanup thunk_64.S Drop thunk_ra macro in favor of an additional argument to the thunk macro since their bodies are almost identical. Do a whitespace scrubbing and use CFI-aware macros for full annotation. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1306873314-32523-5-git-send-email-bp@alien8.de Signed-off-by: H. Peter Anvin --- arch/x86/lib/thunk_64.S | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index 782b082..d5b088b 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -5,50 +5,42 @@ * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. * Subject to the GNU public license, v.2. No warranty of any kind. */ +#include +#include +#include +#include - #include - #include - #include - #include - - /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ - .macro thunk name,func - .globl \name -\name: - CFI_STARTPROC - SAVE_ARGS - call \func - jmp restore - CFI_ENDPROC - .endm - -#ifdef CONFIG_TRACE_IRQFLAGS - /* put return address in rdi (arg1) */ - .macro thunk_ra name,func + /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ + .macro THUNK name, func, put_ret_addr_in_rdi=0 .globl \name \name: CFI_STARTPROC + + /* this one pushes 9 elems, the next one would be %rIP */ SAVE_ARGS - /* SAVE_ARGS pushs 9 elements */ - /* the next element would be the rip */ - movq 9*8(%rsp), %rdi + + .if \put_ret_addr_in_rdi + movq_cfi_restore 9*8, rdi + .endif + call \func jmp restore CFI_ENDPROC .endm - thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller - thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller +#ifdef CONFIG_TRACE_IRQFLAGS + THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 + THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC - thunk lockdep_sys_exit_thunk,lockdep_sys_exit + THUNK lockdep_sys_exit_thunk,lockdep_sys_exit #endif - + /* SAVE_ARGS below is used only for the .cfi directives it contains. */ CFI_STARTPROC SAVE_ARGS restore: RESTORE_ARGS - ret + ret CFI_ENDPROC -- cgit v1.1 From dd2897bf0f4d523238e87dabb23e9634ea9ba73d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Jun 2011 22:07:22 +0200 Subject: x86, asm: Fix binutils 2.16 issue with __USER32_CS While testing the patchset at http://lkml.kernel.org/r/1306873314-32523-1-git-send-email-bp@alien8.de with binutils 2.16.1 from hell, kernel build fails with the following error: arch/x86/ia32/ia32entry.S: Assembler messages: arch/x86/ia32/ia32entry.S:139: Error: too many positional arguments make[2]: *** [arch/x86/ia32/ia32entry.o] Error 1 make[1]: *** [arch/x86/ia32] Error 2 make[1]: *** Waiting for unfinished jobs.... make: *** [arch/x86] Error 2 make: *** Waiting for unfinished jobs.... due to spaces between the operators of the __USER32_CS define. Fix it so that gas 2.16 can swallow it too. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1307131642-32595-1-git-send-email-bp@alien8.de Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/segment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index cd84f72..5e64171 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -162,7 +162,7 @@ #define GDT_ENTRY_DEFAULT_USER32_CS 4 #define GDT_ENTRY_DEFAULT_USER_DS 5 #define GDT_ENTRY_DEFAULT_USER_CS 6 -#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3) +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3) #define __USER32_DS __USER_DS #define GDT_ENTRY_TSS 8 /* needs two entries */ -- cgit v1.1 From 6879eb2deed7171a81b2f904c9ad14b9648689a7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:17 -0400 Subject: x86-64: Fix alignment of jiffies variable It's declared __attribute__((aligned(16)) but it's explicitly not aligned. This is probably harmless but it's a bit embarrassing. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/5f3bc5542e9aaa9382d53f153f54373165cdef89.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/vvar.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 341b355..a4eaca4 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -45,7 +45,7 @@ /* DECLARE_VVAR(offset, type, name) */ DECLARE_VVAR(0, volatile unsigned long, jiffies) -DECLARE_VVAR(8, int, vgetcpu_mode) +DECLARE_VVAR(16, int, vgetcpu_mode) DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) #undef DECLARE_VVAR -- cgit v1.1 From 8b4777a4b50cb0c84c1152eac85d24415fb6ff7d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:18 -0400 Subject: x86-64: Document some of entry_64.S Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/fc134867cc550977cc996866129e11a16ba0f9ea.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8a445a0..72c4a77 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -9,6 +9,8 @@ /* * entry.S contains the system-call and fault low-level handling routines. * + * Some of this is documented in Documentation/x86/entry_64.txt + * * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. * -- cgit v1.1 From 9fd67b4ed0714ab718f1f9bd14c344af336a6df7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:19 -0400 Subject: x86-64: Give vvars their own page Move vvars out of the vsyscall page into their own page and mark it NX. Without this patch, an attacker who can force a daemon to call some fixed address could wait until the time contains, say, 0xCD80, and then execute the current time. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/b1460f81dc4463d66ea3f2b5ce240f58d48effec.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 1 + arch/x86/include/asm/pgtable_types.h | 2 ++ arch/x86/include/asm/vvar.h | 22 ++++++++++------------ arch/x86/kernel/vmlinux.lds.S | 28 +++++++++++++++++----------- arch/x86/kernel/vsyscall_64.c | 5 +++++ 5 files changed, 35 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 4729b2b..460c74e 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -78,6 +78,7 @@ enum fixed_addresses { VSYSCALL_LAST_PAGE, VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, + VVAR_PAGE, VSYSCALL_HPET, #endif FIX_DBGP_BASE, diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index d56187c..6a29aed6 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -108,6 +108,7 @@ #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) @@ -130,6 +131,7 @@ #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index a4eaca4..de656ac 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -10,15 +10,14 @@ * In normal kernel code, they are used like any other variable. * In user code, they are accessed through the VVAR macro. * - * Each of these variables lives in the vsyscall page, and each - * one needs a unique offset within the little piece of the page - * reserved for vvars. Specify that offset in DECLARE_VVAR. - * (There are 896 bytes available. If you mess up, the linker will - * catch it.) + * These variables live in a page of kernel data that has an extra RO + * mapping for userspace. Each variable needs a unique offset within + * that page; specify that offset with the DECLARE_VVAR macro. (If + * you mess up, the linker will catch it.) */ -/* Offset of vars within vsyscall page */ -#define VSYSCALL_VARS_OFFSET (3072 + 128) +/* Base address of vvars. This is not ABI. */ +#define VVAR_ADDRESS (-10*1024*1024 - 4096) #if defined(__VVAR_KERNEL_LDS) @@ -26,17 +25,17 @@ * right place. */ #define DECLARE_VVAR(offset, type, name) \ - EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset) + EMIT_VVAR(name, offset) #else #define DECLARE_VVAR(offset, type, name) \ static type const * const vvaraddr_ ## name = \ - (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset)); + (void *)(VVAR_ADDRESS + (offset)); #define DEFINE_VVAR(type, name) \ - type __vvar_ ## name \ - __attribute__((section(".vsyscall_var_" #name), aligned(16))) + type name \ + __attribute__((section(".vvar_" #name), aligned(16))) #define VVAR(name) (*vvaraddr_ ## name) @@ -49,4 +48,3 @@ DECLARE_VVAR(16, int, vgetcpu_mode) DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) #undef DECLARE_VVAR -#undef VSYSCALL_VARS_OFFSET diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 89aed99..98b378d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -161,12 +161,6 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) -#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \ - ADDR(.vsyscall_0) + offset \ - : AT(VLOAD(.vsyscall_var_ ## x)) { \ - *(.vsyscall_var_ ## x) \ - } \ - x = VVIRT(.vsyscall_var_ ## x); . = ALIGN(4096); __vsyscall_0 = .; @@ -192,19 +186,31 @@ SECTIONS *(.vsyscall_3) } -#define __VVAR_KERNEL_LDS -#include -#undef __VVAR_KERNEL_LDS - - . = __vsyscall_0 + PAGE_SIZE; + . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); #undef VSYSCALL_ADDR #undef VLOAD_OFFSET #undef VLOAD #undef VVIRT_OFFSET #undef VVIRT + + __vvar_page = .; + + .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = offset; \ + *(.vvar_ ## name) +#define __VVAR_KERNEL_LDS +#include +#undef __VVAR_KERNEL_LDS #undef EMIT_VVAR + } :data + + . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); + #endif /* CONFIG_X86_64 */ /* Init code and data - will be freed after init */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3e68218..3cf1cef 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -284,9 +284,14 @@ void __init map_vsyscall(void) { extern char __vsyscall_0; unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + extern char __vvar_page; + unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); + __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != + (unsigned long)VVAR_ADDRESS); } static int __init vsyscall_init(void) -- cgit v1.1 From 0d7b8547fb67d5c2a7d954c56b3715b0e708be4a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:20 -0400 Subject: x86-64: Remove kernel.vsyscall64 sysctl It's unnecessary overhead in code that's supposed to be highly optimized. Removing it allows us to remove one of the two syscall instructions in the vsyscall page. The only sensible use for it is for UML users, and it doesn't fully address inconsistent vsyscall results on UML. The real fix for UML is to stop using vsyscalls entirely. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/973ae803fe76f712da4b2740e66dccf452d3b1e4.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/vgtod.h | 1 - arch/x86/kernel/vsyscall_64.c | 34 +------------------------- arch/x86/vdso/vclock_gettime.c | 55 ++++++++++++++++-------------------------- 3 files changed, 22 insertions(+), 68 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 646b4c1..aa5add8 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -11,7 +11,6 @@ struct vsyscall_gtod_data { time_t wall_time_sec; u32 wall_time_nsec; - int sysctl_enabled; struct timezone sys_tz; struct { /* extract of a clocksource struct */ cycle_t (*vread)(void); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3cf1cef..9b2f3f5 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -53,7 +53,6 @@ DEFINE_VVAR(int, vgetcpu_mode); DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = { .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), - .sysctl_enabled = 1, }; void update_vsyscall_tz(void) @@ -103,15 +102,6 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) return ret; } -static __always_inline long time_syscall(long *t) -{ - long secs; - asm volatile("syscall" - : "=a" (secs) - : "0" (__NR_time),"D" (t) : __syscall_clobber); - return secs; -} - static __always_inline void do_vgettimeofday(struct timeval * tv) { cycle_t now, base, mask, cycle_delta; @@ -122,8 +112,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); vread = VVAR(vsyscall_gtod_data).clock.vread; - if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled || - !vread)) { + if (unlikely(!vread)) { gettimeofday(tv,NULL); return; } @@ -165,8 +154,6 @@ time_t __vsyscall(1) vtime(time_t *t) { unsigned seq; time_t result; - if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) - return time_syscall(t); do { seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); @@ -227,22 +214,6 @@ static long __vsyscall(3) venosys_1(void) return -ENOSYS; } -#ifdef CONFIG_SYSCTL -static ctl_table kernel_table2[] = { - { .procname = "vsyscall64", - .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec }, - {} -}; - -static ctl_table kernel_root_table2[] = { - { .procname = "kernel", .mode = 0555, - .child = kernel_table2 }, - {} -}; -#endif - /* Assume __initcall executes before all user space. Hopefully kmod doesn't violate that. We'll find out if it does. */ static void __cpuinit vsyscall_set_cpu(int cpu) @@ -301,9 +272,6 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); -#ifdef CONFIG_SYSCTL - register_sysctl_table(kernel_root_table2); -#endif on_each_cpu(cpu_vsyscall_init, NULL, 1); /* notifier priority > KVM */ hotcpu_notifier(cpu_vsyscall_notifier, 30); diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index a724905..cf54813 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -116,21 +116,21 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - if (likely(gtod->sysctl_enabled)) - switch (clock) { - case CLOCK_REALTIME: - if (likely(gtod->clock.vread)) - return do_realtime(ts); - break; - case CLOCK_MONOTONIC: - if (likely(gtod->clock.vread)) - return do_monotonic(ts); - break; - case CLOCK_REALTIME_COARSE: - return do_realtime_coarse(ts); - case CLOCK_MONOTONIC_COARSE: - return do_monotonic_coarse(ts); - } + switch (clock) { + case CLOCK_REALTIME: + if (likely(gtod->clock.vread)) + return do_realtime(ts); + break; + case CLOCK_MONOTONIC: + if (likely(gtod->clock.vread)) + return do_monotonic(ts); + break; + case CLOCK_REALTIME_COARSE: + return do_realtime_coarse(ts); + case CLOCK_MONOTONIC_COARSE: + return do_monotonic_coarse(ts); + } + return vdso_fallback_gettime(clock, ts); } int clock_gettime(clockid_t, struct timespec *) @@ -139,7 +139,7 @@ int clock_gettime(clockid_t, struct timespec *) notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; - if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { + if (likely(gtod->clock.vread)) { if (likely(tv != NULL)) { BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != offsetof(struct timespec, tv_nsec) || @@ -161,27 +161,14 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) int gettimeofday(struct timeval *, struct timezone *) __attribute__((weak, alias("__vdso_gettimeofday"))); -/* This will break when the xtime seconds get inaccurate, but that is - * unlikely */ - -static __always_inline long time_syscall(long *t) -{ - long secs; - asm volatile("syscall" - : "=a" (secs) - : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory"); - return secs; -} - +/* + * This will break when the xtime seconds get inaccurate, but that is + * unlikely + */ notrace time_t __vdso_time(time_t *t) { - time_t result; - - if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) - return time_syscall(t); - /* This is atomic on x86_64 so we don't need any locks. */ - result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); + time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); if (t) *t = result; -- cgit v1.1 From d319bb79afa4039bda6f85661d6bf0c13299ce93 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:21 -0400 Subject: x86-64: Map the HPET NX Currently the HPET mapping is a user-accessible syscall instruction at a fixed address some of the time. A sufficiently determined hacker might be able to guess when. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/ab41b525a4ca346b1ca1145d16fb8d181861a8aa.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_types.h | 4 ++-- arch/x86/kernel/hpet.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 6a29aed6..013286a1 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -107,8 +107,8 @@ #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) -#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) +#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) @@ -130,8 +130,8 @@ #define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) +#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE) #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 6781765..e9f5605 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -71,7 +71,7 @@ static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); #ifdef CONFIG_X86_64 - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE); #endif } -- cgit v1.1 From bb5fe2f78eadf5a52d8dcbf9a57728fd107af97b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:22 -0400 Subject: x86-64: Remove vsyscall number 3 (venosys) It just segfaults since April 2008 (a4928cff), so I'm pretty sure that nothing uses it. And having an empty section makes the linker script a bit fragile. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/4a4abcf47ecadc269f2391a313576fe6d06acef7.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 4 ---- arch/x86/kernel/vsyscall_64.c | 5 ----- 2 files changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 98b378d..4f90082 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -182,10 +182,6 @@ SECTIONS *(.vsyscall_2) } - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { - *(.vsyscall_3) - } - . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); #undef VSYSCALL_ADDR diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9b2f3f5..70a5f6e 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -209,11 +209,6 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) return 0; } -static long __vsyscall(3) venosys_1(void) -{ - return -ENOSYS; -} - /* Assume __initcall executes before all user space. Hopefully kmod doesn't violate that. We'll find out if it does. */ static void __cpuinit vsyscall_set_cpu(int cpu) -- cgit v1.1 From 5dfcea629a08b4684a019cd0cb59d0c9129a6c02 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:23 -0400 Subject: x86-64: Fill unused parts of the vsyscall page with 0xcc Jumping to 0x00 might do something depending on the following bytes. Jumping to 0xcc is a trap. So fill the unused parts of the vsyscall page with 0xcc to make it useless for exploits to jump there. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/ed54bfcfbe50a9070d20ec1edbe0d149e22a4568.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4f90082..8017471 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -166,22 +166,20 @@ SECTIONS __vsyscall_0 = .; . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { + .vsyscall : AT(VLOAD(.vsyscall)) { *(.vsyscall_0) - } :user - . = ALIGN(L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + . = ALIGN(L1_CACHE_BYTES); *(.vsyscall_fn) - } - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + . = 1024; *(.vsyscall_1) - } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + + . = 2048; *(.vsyscall_2) - } + . = 4096; /* Pad the whole page. */ + } :user =0xcc . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); #undef VSYSCALL_ADDR -- cgit v1.1 From c2419b4a4727f67af2fc2cd68b0d878b75e781bb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 31 May 2011 10:50:10 -0400 Subject: xen: allow enable use of VGA console on dom0 Get the information about the VGA console hardware from Xen, and put it into the form the bootloader normally generates, so that the rest of the kernel can deal with VGA as usual. [ Impact: make VGA console work in dom0 ] Signed-off-by: Jeremy Fitzhardinge [v1: Rebased on 2.6.39] [v2: Removed incorrect comments and fixed compile warnings] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/enlighten.c | 8 ++++++ arch/x86/xen/vga.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 11 ++++++++ 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 arch/x86/xen/vga.c (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 17c565d..a6575b9 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -18,5 +18,5 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o - +obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e3c6a06..4abd2d5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1241,6 +1241,14 @@ asmlinkage void __init xen_start_kernel(void) if (pci_xen) x86_init.pci.arch_init = pci_xen_init; } else { + const struct dom0_vga_console_info *info = + (void *)((char *)xen_start_info + + xen_start_info->console.dom0.info_off); + + xen_init_vga(info, xen_start_info->console.dom0.info_size); + xen_start_info->console.domU.mfn = 0; + xen_start_info->console.domU.evtchn = 0; + /* Make sure ACS will be enabled */ pci_request_acs(); } diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c new file mode 100644 index 0000000..1cd7f4d --- /dev/null +++ b/arch/x86/xen/vga.c @@ -0,0 +1,67 @@ +#include +#include + +#include +#include + +#include + +#include "xen-ops.h" + +void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) +{ + struct screen_info *screen_info = &boot_params.screen_info; + + /* This is drawn from a dump from vgacon:startup in + * standard Linux. */ + screen_info->orig_video_mode = 3; + screen_info->orig_video_isVGA = 1; + screen_info->orig_video_lines = 25; + screen_info->orig_video_cols = 80; + screen_info->orig_video_ega_bx = 3; + screen_info->orig_video_points = 16; + screen_info->orig_y = screen_info->orig_video_lines - 1; + + switch (info->video_type) { + case XEN_VGATYPE_TEXT_MODE_3: + if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3) + + sizeof(info->u.text_mode_3)) + break; + screen_info->orig_video_lines = info->u.text_mode_3.rows; + screen_info->orig_video_cols = info->u.text_mode_3.columns; + screen_info->orig_x = info->u.text_mode_3.cursor_x; + screen_info->orig_y = info->u.text_mode_3.cursor_y; + screen_info->orig_video_points = + info->u.text_mode_3.font_height; + break; + + case XEN_VGATYPE_VESA_LFB: + if (size < offsetof(struct dom0_vga_console_info, + u.vesa_lfb.gbl_caps)) + break; + screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB; + screen_info->lfb_width = info->u.vesa_lfb.width; + screen_info->lfb_height = info->u.vesa_lfb.height; + screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel; + screen_info->lfb_base = info->u.vesa_lfb.lfb_base; + screen_info->lfb_size = info->u.vesa_lfb.lfb_size; + screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line; + screen_info->red_size = info->u.vesa_lfb.red_size; + screen_info->red_pos = info->u.vesa_lfb.red_pos; + screen_info->green_size = info->u.vesa_lfb.green_size; + screen_info->green_pos = info->u.vesa_lfb.green_pos; + screen_info->blue_size = info->u.vesa_lfb.blue_size; + screen_info->blue_pos = info->u.vesa_lfb.blue_pos; + screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size; + screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos; + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.gbl_caps) + + sizeof(info->u.vesa_lfb.gbl_caps)) + screen_info->capabilities = info->u.vesa_lfb.gbl_caps; + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.mode_attrs) + + sizeof(info->u.vesa_lfb.mode_attrs)) + screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs; + break; + } +} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3112f55..e14c54e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -88,6 +88,17 @@ static inline void xen_uninit_lock_cpu(int cpu) } #endif +struct dom0_vga_console_info; + +#ifdef CONFIG_XEN_DOM0 +void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); +#else +static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, + size_t size) +{ +} +#endif + /* Declare an asm function, along with symbols needed to make it inlineable */ #define DECL_ASM(ret, name, ...) \ -- cgit v1.1 From f7a2d73fe75c71941fb0a6b4d8fe7da8144f2c7b Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 6 Jun 2011 15:36:24 -0400 Subject: x86, efi: Fix argument types for SetVariable() The spec says this takes uint32 for attributes, not uintn. Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1307388985-7852-1-git-send-email-mjg@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 0d3a4fa..f4f6de9 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -122,7 +122,7 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, static efi_status_t virt_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, - unsigned long attr, + u32 attr, unsigned long data_size, void *data) { -- cgit v1.1 From 3b3702377c576f6624348c7c6fd113bfd934fbd7 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 6 Jun 2011 15:36:25 -0400 Subject: x86, efi: Add infrastructure for UEFI 2.0 runtime services We're currently missing support for any of the runtime service calls introduced with the UEFI 2.0 spec in 2006. Add the infrastructure for supporting them. Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1307388985-7852-2-git-send-email-mjg@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f4f6de9..a0e4244 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -131,6 +131,18 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, data_size, data); } +static efi_status_t virt_efi_query_variable_info(u32 attr, + u64 *storage_space, + u64 *remaining_space, + u64 *max_variable_size) +{ + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + return efi_call_virt4(query_variable_info, attr, storage_space, + remaining_space, max_variable_size); +} + static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) { return efi_call_virt1(get_next_high_mono_count, count); @@ -145,6 +157,28 @@ static void virt_efi_reset_system(int reset_type, data_size, data); } +static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, + unsigned long count, + unsigned long sg_list) +{ + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + return efi_call_virt3(update_capsule, capsules, count, sg_list); +} + +static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, + unsigned long count, + u64 *max_size, + int *reset_type) +{ + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + return efi_call_virt4(query_capsule_caps, capsules, count, max_size, + reset_type); +} + static efi_status_t __init phys_efi_set_virtual_address_map( unsigned long memory_map_size, unsigned long descriptor_size, @@ -651,6 +685,9 @@ void __init efi_enter_virtual_mode(void) efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; efi.reset_system = virt_efi_reset_system; efi.set_virtual_address_map = NULL; + efi.query_variable_info = virt_efi_query_variable_info; + efi.update_capsule = virt_efi_update_capsule; + efi.query_capsule_caps = virt_efi_query_capsule_caps; if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); -- cgit v1.1 From 5cec93c216db77c45f7ce970d46283bcb1933884 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:24 -0400 Subject: x86-64: Emulate legacy vsyscalls There's a fair amount of code in the vsyscall page. It contains a syscall instruction (in the gettimeofday fallback) and who knows what will happen if an exploit jumps into the middle of some other code. Reduce the risk by replacing the vsyscalls with short magic incantations that cause the kernel to emulate the real vsyscalls. These incantations are useless if entered in the middle. This causes vsyscalls to be a little more expensive than real syscalls. Fortunately sensible programs don't use them. The only exception is time() which is still called by glibc through the vsyscall - but calling time() millions of times per second is not sensible. glibc has this fixed in the development tree. This patch is not perfect: the vread_tsc and vread_hpet functions are still at a fixed address. Fixing that might involve making alternative patching work in the vDSO. Signed-off-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Jesper Juhl Cc: Borislav Petkov Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/e64e1b3c64858820d12c48fa739efbd1485e79d5.1307292171.git.luto@mit.edu [ Removed the CONFIG option - it's simpler to just do it unconditionally. Tidied up the code as well. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 6 +- arch/x86/include/asm/traps.h | 4 + arch/x86/include/asm/vsyscall.h | 12 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/entry_64.S | 2 + arch/x86/kernel/traps.c | 6 + arch/x86/kernel/vsyscall_64.c | 261 +++++++++++++++++-------------------- arch/x86/kernel/vsyscall_emu_64.S | 27 ++++ 8 files changed, 179 insertions(+), 140 deletions(-) create mode 100644 arch/x86/kernel/vsyscall_emu_64.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6e976ee..a563c50 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -17,7 +17,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts + * Vector 204 : legacy x86_64 vsyscall emulation + * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. @@ -50,6 +51,9 @@ #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 #endif +#ifdef CONFIG_X86_64 +# define VSYSCALL_EMU_VECTOR 0xcc +#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0310da6..2bae0a5 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_TRAPS_H #define _ASM_X86_TRAPS_H +#include + #include #include /* TRAP_TRACE, ... */ @@ -38,6 +40,7 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); +asmlinkage void emulate_vsyscall(void); dotraplinkage void do_divide_error(struct pt_regs *, long); dotraplinkage void do_debug(struct pt_regs *, long); @@ -64,6 +67,7 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long); dotraplinkage void do_machine_check(struct pt_regs *, long); #endif dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); +dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); #endif diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d555973..bb710cb 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -31,6 +31,18 @@ extern struct timezone sys_tz; extern void map_vsyscall(void); +/* Emulation */ + +static inline bool is_vsyscall_entry(unsigned long addr) +{ + return (addr & ~0xC00UL) == VSYSCALL_START; +} + +static inline int vsyscall_entry_nr(unsigned long addr) +{ + return (addr & 0xC00UL) >> 10; +} + #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 90b06d4..cc0469a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,6 +44,7 @@ obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o +obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 72c4a77..e949793 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1123,6 +1123,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug zeroentry coprocessor_error do_coprocessor_error errorentry alignment_check do_alignment_check zeroentry simd_coprocessor_error do_simd_coprocessor_error +zeroentry emulate_vsyscall do_emulate_vsyscall + /* Reload gs selector with exception handling */ /* edi: new selector */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b9b6716..fbc097a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -872,6 +872,12 @@ void __init trap_init(void) set_bit(SYSCALL_VECTOR, used_vectors); #endif +#ifdef CONFIG_X86_64 + BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors)); + set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall); + set_bit(VSYSCALL_EMU_VECTOR, used_vectors); +#endif + /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 70a5f6e..10cd8ac 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -2,6 +2,8 @@ * Copyright (C) 2001 Andrea Arcangeli SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * + * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] + * * Thanks to hpa@transmeta.com for some useful hint. * Special thanks to Ingo Molnar for his early experience with * a different vsyscall implementation for Linux/IA32 and for the name. @@ -11,10 +13,9 @@ * vsyscalls. One vsyscall can reserve more than 1 slot to avoid * jumping out of line if necessary. We cannot add more with this * mechanism because older kernels won't return -ENOSYS. - * If we want more than four we need a vDSO. * - * Note: the concept clashes with user mode linux. If you use UML and - * want per guest time just set the kernel.vsyscall64 sysctl to 0. + * Note: the concept clashes with user mode linux. UML users should + * use the vDSO. */ /* Disable profiling for userspace code: */ @@ -32,6 +33,8 @@ #include #include #include +#include +#include #include #include @@ -44,10 +47,7 @@ #include #include #include - -#define __vsyscall(nr) \ - __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace -#define __syscall_clobber "r11","cx","memory" +#include DEFINE_VVAR(int, vgetcpu_mode); DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = @@ -71,146 +71,129 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, unsigned long flags; write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ - vsyscall_gtod_data.clock.vread = clock->vread; - vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; - vsyscall_gtod_data.clock.mask = clock->mask; - vsyscall_gtod_data.clock.mult = mult; - vsyscall_gtod_data.clock.shift = clock->shift; - vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = *wtm; - vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.clock.vread = clock->vread; + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; + vsyscall_gtod_data.clock.mask = clock->mask; + vsyscall_gtod_data.clock.mult = mult; + vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + vsyscall_gtod_data.wall_to_monotonic = *wtm; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -/* RED-PEN may want to readd seq locking, but then the variable should be - * write-once. - */ -static __always_inline void do_get_tz(struct timezone * tz) +static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, + const char *message) { - *tz = VVAR(vsyscall_gtod_data).sys_tz; -} + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + struct task_struct *tsk; -static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - int ret; - asm volatile("syscall" - : "=a" (ret) - : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) - : __syscall_clobber ); - return ret; -} + if (!show_unhandled_signals || !__ratelimit(&rs)) + return; -static __always_inline void do_vgettimeofday(struct timeval * tv) -{ - cycle_t now, base, mask, cycle_delta; - unsigned seq; - unsigned long mult, shift, nsec; - cycle_t (*vread)(void); - do { - seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); - - vread = VVAR(vsyscall_gtod_data).clock.vread; - if (unlikely(!vread)) { - gettimeofday(tv,NULL); - return; - } - - now = vread(); - base = VVAR(vsyscall_gtod_data).clock.cycle_last; - mask = VVAR(vsyscall_gtod_data).clock.mask; - mult = VVAR(vsyscall_gtod_data).clock.mult; - shift = VVAR(vsyscall_gtod_data).clock.shift; - - tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec; - nsec = VVAR(vsyscall_gtod_data).wall_time_nsec; - } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); - - /* calculate interval: */ - cycle_delta = (now - base) & mask; - /* convert to nsecs: */ - nsec += (cycle_delta * mult) >> shift; - - while (nsec >= NSEC_PER_SEC) { - tv->tv_sec += 1; - nsec -= NSEC_PER_SEC; - } - tv->tv_usec = nsec / NSEC_PER_USEC; -} + tsk = current; -int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) -{ - if (tv) - do_vgettimeofday(tv); - if (tz) - do_get_tz(tz); - return 0; + printk("%s%s[%d] %s ip:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, tsk->comm, task_pid_nr(tsk), + message, regs->ip - 2, regs->sp, regs->ax, regs->si, regs->di); } -/* This will break when the xtime seconds get inaccurate, but that is - * unlikely */ -time_t __vsyscall(1) vtime(time_t *t) +void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) { - unsigned seq; - time_t result; + const char *vsyscall_name; + struct task_struct *tsk; + unsigned long caller; + int vsyscall_nr; + long ret; + + /* Kernel code must never get here. */ + BUG_ON(!user_mode(regs)); + + local_irq_enable(); + + /* + * x86-ism here: regs->ip points to the instruction after the int 0xcc, + * and int 0xcc is two bytes long. + */ + if (!is_vsyscall_entry(regs->ip - 2)) { + warn_bad_vsyscall(KERN_WARNING, regs, "illegal int 0xcc (exploit attempt?)"); + goto sigsegv; + } + vsyscall_nr = vsyscall_entry_nr(regs->ip - 2); - do { - seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); + if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { + warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); + goto sigsegv; + } - result = VVAR(vsyscall_gtod_data).wall_time_sec; + tsk = current; + if (seccomp_mode(&tsk->seccomp)) + do_exit(SIGKILL); + + switch (vsyscall_nr) { + case 0: + vsyscall_name = "gettimeofday"; + ret = sys_gettimeofday( + (struct timeval __user *)regs->di, + (struct timezone __user *)regs->si); + break; + + case 1: + vsyscall_name = "time"; + ret = sys_time((time_t __user *)regs->di); + break; + + case 2: + vsyscall_name = "getcpu"; + ret = sys_getcpu((unsigned __user *)regs->di, + (unsigned __user *)regs->si, + 0); + break; + + default: + /* + * If we get here, then vsyscall_nr indicates that int 0xcc + * happened at an address in the vsyscall page that doesn't + * contain int 0xcc. That can't happen. + */ + BUG(); + } - } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); + if (ret == -EFAULT) { + /* + * Bad news -- userspace fed a bad pointer to a vsyscall. + * + * With a real vsyscall, that would have caused SIGSEGV. + * To make writing reliable exploits using the emulated + * vsyscalls harder, generate SIGSEGV here as well. + */ + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall fault (exploit attempt?)"); + goto sigsegv; + } - if (t) - *t = result; - return result; -} + regs->ax = ret; -/* Fast way to get current CPU and node. - This helps to do per node and per CPU caches in user space. - The result is not guaranteed without CPU affinity, but usually - works out because the scheduler tries to keep a thread on the same - CPU. + /* Emulate a ret instruction. */ + regs->ip = caller; + regs->sp += 8; - tcache must point to a two element sized long array. - All arguments can be NULL. */ -long __vsyscall(2) -vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) -{ - unsigned int p; - unsigned long j = 0; - - /* Fast cache - only recompute value once per jiffies and avoid - relatively costly rdtscp/cpuid otherwise. - This works because the scheduler usually keeps the process - on the same CPU and this syscall doesn't guarantee its - results anyways. - We do this here because otherwise user space would do it on - its own in a likely inferior way (no access to jiffies). - If you don't like it pass NULL. */ - if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { - p = tcache->blob[1]; - } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ - native_read_tscp(&p); - } else { - /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); - } - if (tcache) { - tcache->blob[0] = j; - tcache->blob[1] = p; - } - if (cpu) - *cpu = p & 0xfff; - if (node) - *node = p >> 12; - return 0; + local_irq_disable(); + return; + +sigsegv: + regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */ + force_sig(SIGSEGV, current); } -/* Assume __initcall executes before all user space. Hopefully kmod - doesn't violate that. We'll find out if it does. */ +/* + * Assume __initcall executes before all user space. Hopefully kmod + * doesn't violate that. We'll find out if it does. + */ static void __cpuinit vsyscall_set_cpu(int cpu) { unsigned long d; @@ -221,13 +204,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu) if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) write_rdtscp_aux((node << 12) | cpu); - /* Store cpu number in limit so that it can be loaded quickly - in user space in vgetcpu. - 12 bits for the CPU and 8 bits for the node. */ + /* + * Store cpu number in limit so that it can be loaded quickly + * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) + */ d = 0x0f40000000000ULL; d |= cpu; d |= (node & 0xf) << 12; d |= (node >> 4) << 48; + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } @@ -241,8 +226,10 @@ static int __cpuinit cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) { long cpu = (long)arg; + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); + return NOTIFY_DONE; } @@ -256,21 +243,17 @@ void __init map_vsyscall(void) /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != - (unsigned long)VVAR_ADDRESS); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS); } static int __init vsyscall_init(void) { - BUG_ON(((unsigned long) &vgettimeofday != - VSYSCALL_ADDR(__NR_vgettimeofday))); - BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); - BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); - BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); + BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); + on_each_cpu(cpu_vsyscall_init, NULL, 1); /* notifier priority > KVM */ hotcpu_notifier(cpu_vsyscall_notifier, 30); + return 0; } - __initcall(vsyscall_init); diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S new file mode 100644 index 0000000..ffa845e --- /dev/null +++ b/arch/x86/kernel/vsyscall_emu_64.S @@ -0,0 +1,27 @@ +/* + * vsyscall_emu_64.S: Vsyscall emulation page + * + * Copyright (c) 2011 Andy Lutomirski + * + * Subject to the GNU General Public License, version 2 + */ + +#include +#include + +/* The unused parts of the page are filled with 0xcc by the linker script. */ + +.section .vsyscall_0, "a" +ENTRY(vsyscall_0) + int $VSYSCALL_EMU_VECTOR +END(vsyscall_0) + +.section .vsyscall_1, "a" +ENTRY(vsyscall_1) + int $VSYSCALL_EMU_VECTOR +END(vsyscall_1) + +.section .vsyscall_2, "a" +ENTRY(vsyscall_2) + int $VSYSCALL_EMU_VECTOR +END(vsyscall_2) -- cgit v1.1 From 3d5fe5a65af9c0b609d6e26b8d63fe5923c4e512 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 11 Apr 2011 11:19:09 +1000 Subject: x86/devicetree: Use generic PCI <-> OF matching Instead of walking the whole PCI tree to update the of_node's for PCI busses and devices after the fact, enable the new generic core code for doing so by providing the proper device nodes for the PCI host bridges Signed-off-by: Benjamin Herrenschmidt Acked-by: Grant Likely Tested-by: Sebastian Andrzej Siewior --- arch/x86/kernel/devicetree.c | 60 +++++++++++++------------------------------- 1 file changed, 18 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 690bc84..d23f7af 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -123,6 +123,24 @@ static int __init add_bus_probe(void) module_init(add_bus_probe); #ifdef CONFIG_PCI +struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) +{ + struct device_node *np; + + for_each_node_by_type(np, "pci") { + const void *prop; + unsigned int bus_min; + + prop = of_get_property(np, "bus-range", NULL); + if (!prop) + continue; + bus_min = be32_to_cpup(prop); + if (bus->number == bus_min) + return np; + } + return NULL; +} + static int x86_of_pci_irq_enable(struct pci_dev *dev) { struct of_irq oirq; @@ -154,50 +172,8 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev) void __cpuinit x86_of_pci_init(void) { - struct device_node *np; - pcibios_enable_irq = x86_of_pci_irq_enable; pcibios_disable_irq = x86_of_pci_irq_disable; - - for_each_node_by_type(np, "pci") { - const void *prop; - struct pci_bus *bus; - unsigned int bus_min; - struct device_node *child; - - prop = of_get_property(np, "bus-range", NULL); - if (!prop) - continue; - bus_min = be32_to_cpup(prop); - - bus = pci_find_bus(0, bus_min); - if (!bus) { - printk(KERN_ERR "Can't find a node for bus %s.\n", - np->full_name); - continue; - } - - if (bus->self) - bus->self->dev.of_node = np; - else - bus->dev.of_node = np; - - for_each_child_of_node(np, child) { - struct pci_dev *dev; - u32 devfn; - - prop = of_get_property(child, "reg", NULL); - if (!prop) - continue; - - devfn = (be32_to_cpup(prop) >> 8) & 0xff; - dev = pci_get_slot(bus, devfn); - if (!dev) - continue; - dev->dev.of_node = child; - pci_dev_put(dev); - } - } } #endif -- cgit v1.1 From 64099d981c9916ec4a485b3ffbb89fa877fc595f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 7 Apr 2011 13:09:47 +1000 Subject: pci/of: Consolidate pci_device_to_OF_node() All archs do more or less the same thing now, move it into a single generic place. I chose pci.h rather than of_pci.h to avoid having to change all call-sites to include the later. Signed-off-by: Benjamin Herrenschmidt Acked-by: Michal Simek Acked-by: Grant Likely Acked-by: Jesse Barnes --- arch/x86/include/asm/prom.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 971e0b4..dd6066a 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -31,11 +31,6 @@ extern void x86_add_irq_domains(void); void __cpuinit x86_of_pci_init(void); void x86_dtb_init(void); -static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev) -{ - return pdev ? pdev->dev.of_node : NULL; -} - static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus) { return pci_device_to_OF_node(bus->self); -- cgit v1.1 From ef3b4f8cc20e80c767e47b848fb7512770ab80d7 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 11 Apr 2011 11:34:33 +1000 Subject: pci/of: Consolidate pci_bus_to_OF_node() The generic code always get the device-node in the right place now so a single implementation will work for all archs Signed-off-by: Benjamin Herrenschmidt Acked-by: Grant Likely Acked-by: Michal Simek Acked-by: Jesse Barnes --- arch/x86/include/asm/prom.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index dd6066a..df12870 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -30,12 +30,6 @@ extern void add_dtb(u64 data); extern void x86_add_irq_domains(void); void __cpuinit x86_of_pci_init(void); void x86_dtb_init(void); - -static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus) -{ - return pci_device_to_OF_node(bus->self); -} - #else static inline void add_dtb(u64 data) { } static inline void x86_add_irq_domains(void) { } -- cgit v1.1 From 334955ef964bee9d3b1e20966847eee28cfd05f6 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 1 Jun 2011 19:04:57 +0100 Subject: i8253: Create linux/i8253.h and use it in all 8253 related files Signed-off-by: Ralf Baechle Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/20110601180610.054254048@duck.linux-mips.net Signed-off-by: Thomas Gleixner arch/arm/mach-footbridge/isa-timer.c | 2 +- arch/mips/cobalt/time.c | 2 +- arch/mips/jazz/irq.c | 2 +- arch/mips/kernel/i8253.c | 2 +- arch/mips/mti-malta/malta-time.c | 2 +- arch/mips/sgi-ip22/ip22-time.c | 2 +- arch/mips/sni/time.c | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/i8253.c | 2 +- arch/x86/kernel/time.c | 2 +- drivers/block/hd.c | 2 +- drivers/clocksource/i8253.c | 2 +- drivers/input/gameport/gameport.c | 2 +- drivers/input/joystick/analog.c | 2 +- drivers/input/misc/pcspkr.c | 2 +- include/linux/i8253.h | 11 +++++++++++ sound/drivers/pcsp/pcsp.h | 2 +- 19 files changed, 29 insertions(+), 18 deletions(-) --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/i8253.c | 2 +- arch/x86/kernel/time.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b961af8..f3c3784 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -39,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 965a766..a30740e 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -229,11 +229,11 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 6781765..85b8a8a 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -12,7 +13,6 @@ #include #include -#include #include #define HPET_MASK CLOCKSOURCE_MASK(32) diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index fb66dc9..3235550 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -9,10 +9,10 @@ #include #include #include +#include #include #include -#include #include #include diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 00cbb27..5a64d05 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -11,13 +11,13 @@ #include #include +#include #include #include #include #include #include -#include #include #include #include -- cgit v1.1 From cb2455aa274b780802c593fecf115240a655d809 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 1 Jun 2011 19:04:58 +0100 Subject: i8253: Unify all kernel declarations of i8253_lock Signed-off-by: Ralf Baechle Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/20110601180610.134151920@duck.linux-mips.net Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/i8253.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h index 65aaa91..20480ce 100644 --- a/arch/x86/include/asm/i8253.h +++ b/arch/x86/include/asm/i8253.h @@ -1,20 +1,10 @@ #ifndef _ASM_X86_I8253_H #define _ASM_X86_I8253_H -/* i8253A PIT registers */ -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 -#define PIT_CH2 0x42 - #define PIT_LATCH LATCH -extern raw_spinlock_t i8253_lock; - extern struct clock_event_device *global_clock_event; extern void setup_pit_timer(void); -#define inb_pit inb_p -#define outb_pit outb_p - #endif /* _ASM_X86_I8253_H */ -- cgit v1.1 From 15f304b664c0d0a3e76ed3a9ce3615a86908babe Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 1 Jun 2011 19:04:59 +0100 Subject: i8253: Consolidate all kernel definitions of i8253_lock Move them to drivers/clocksource/i8253.c and remove the implementations in arch/ [ tglx: Avoid the extra file in lib - folded arch patches in. The export will become conditional in a later step ] Signed-off-by: Ralf Baechle Link: http://lkml.kernel.org/r/20110601180610.221426078@duck.linux-mips.net Cc: Russell King Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 + arch/x86/kernel/i8253.c | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da34972..e6060a1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -70,6 +70,7 @@ config X86 select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if (X86_64 && NET) + select I8253_LOCK config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 3235550..9c92b6f 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -16,9 +16,6 @@ #include #include -DEFINE_RAW_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); - /* * HPET replaces the PIT, when enabled. So we need to know, which of * the two timers is used -- cgit v1.1 From 16f871bc30f86560017b9d34520593a28e08f373 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 1 Jun 2011 19:05:06 +0100 Subject: x86: i8253: Consolidate definitions of global_clock_event There are multiple declarations of global_clock_event in header files specific to particular clock event implementations. Consolidate them in and make sure all users include that header. Signed-off-by: Ralf Baechle Cc: Venkatesh Pallipadi (Venki) Link: http://lkml.kernel.org/r/20110601180610.762763451@duck.linux-mips.net Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apb_timer.h | 1 - arch/x86/include/asm/i8253.h | 2 -- arch/x86/include/asm/time.h | 6 ++++-- arch/x86/kernel/apb_timer.c | 1 + arch/x86/kernel/apic/apic.c | 1 + arch/x86/kernel/hpet.c | 1 + arch/x86/kernel/i8253.c | 1 + 7 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index 2fefa50..963f7d4 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -50,7 +50,6 @@ #define APBT_DEV_USED 1 extern void apbt_time_init(void); -extern struct clock_event_device *global_clock_event; extern unsigned long apbt_quick_calibrate(void); extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); extern void apbt_setup_secondary_clock(void); diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h index 20480ce..3d5f5ee 100644 --- a/arch/x86/include/asm/i8253.h +++ b/arch/x86/include/asm/i8253.h @@ -3,8 +3,6 @@ #define PIT_LATCH LATCH -extern struct clock_event_device *global_clock_event; - extern void setup_pit_timer(void); #endif /* _ASM_X86_I8253_H */ diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 7bdec4e..92b8aec 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -1,10 +1,12 @@ #ifndef _ASM_X86_TIME_H #define _ASM_X86_TIME_H -extern void hpet_time_init(void); - +#include #include +extern void hpet_time_init(void); extern void time_init(void); +extern struct clock_event_device *global_clock_event; + #endif /* _ASM_X86_TIME_H */ diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 289e928..2b6630d 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -44,6 +44,7 @@ #include #include #include +#include #define APBT_MASK CLOCKSOURCE_MASK(32) #define APBT_SHIFT 22 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f3c3784..9da0dc0 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 85b8a8a..0f4b065 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -14,6 +14,7 @@ #include #include +#include #define HPET_MASK CLOCKSOURCE_MASK(32) diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 9c92b6f..5783e6d 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -14,6 +14,7 @@ #include #include +#include #include /* -- cgit v1.1 From 49cf3f29a1ef2e59f1be823e376c1fbac586eb8d Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 1 Jun 2011 19:05:07 +0100 Subject: i8253: Consolidate definitions of PIT_LATCH x86 defines PIT_LATCH as LATCH which in is defined as ((CLOCK_TICK_RATE + HZ/2) / HZ) and again defines CLOCK_TICK_RATE as PIT_TICK_RATE. MIPS defines PIT_LATCH as LATCH which in is defined as ((CLOCK_TICK_RATE + HZ/2) / HZ) and again defines CLOCK_TICK_RATE as 1193182. ARM defines PITCH_LATCH as ((PIT_TICK_RATE + HZ / 2) / HZ) - and that's the sanest thing and equivalent to above definitions so use that as the new definition in . Signed-off-by: Ralf Baechle Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/20110601180610.832810002@duck.linux-mips.net Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/i8253.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h index 3d5f5ee..0049dc9 100644 --- a/arch/x86/include/asm/i8253.h +++ b/arch/x86/include/asm/i8253.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_I8253_H #define _ASM_X86_I8253_H -#define PIT_LATCH LATCH - extern void setup_pit_timer(void); #endif /* _ASM_X86_I8253_H */ -- cgit v1.1 From 850492760cfd38d14d64a5fc6d636091464f755e Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 1 Jun 2011 19:05:08 +0100 Subject: i8253: Move remaining content and delete asm/i8253.h Move setup_pit_timer() declaration to the common header file and remove the arch specific ones. [ tglx: Move it to linux/i8253.h instead of asm/mips and asm/x86 ] Signed-off-by: Ralf Baechle Cc: linux-arm-kernel@lists.infradead.org Cc: Russell King Cc: linux-mips@linux-mips.org Cc: Sergei Shtylyov --- arch/x86/include/asm/i8253.h | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 arch/x86/include/asm/i8253.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h deleted file mode 100644 index 0049dc9..0000000 --- a/arch/x86/include/asm/i8253.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_X86_I8253_H -#define _ASM_X86_I8253_H - -extern void setup_pit_timer(void); - -#endif /* _ASM_X86_I8253_H */ -- cgit v1.1 From 8761f1ab717cac0616f8cf7ec589ae021c739113 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 1 Jun 2011 19:05:09 +0100 Subject: pcspkr: Cleanup Kconfig dependencies Lenghty lists of the kind "depends on ARCH1 || ARCH2 ... || ARCH123" are usually either wrong or too coarse grained. Or plain an ugly sin. [ tglx: Fixed up amigaone ] Signed-off-by: Ralf Baechle Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Acked-by: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: linux-alpha@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linuxppc-dev@lists.ozlabs.org Cc: Gerhard Pircher Link: http://lkml.kernel.org/r/20110601180610.984881988@duck.linux-mips.net Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e6060a1..2dd95b55 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -20,6 +20,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE + select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_IRQ_WORK select HAVE_IOREMAP_PROT -- cgit v1.1 From 28f65c11f2ffb3957259dece647a24f8ad2e241b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 9 Jun 2011 09:13:32 -0700 Subject: treewide: Convert uses of struct resource to resource_size(ptr) Several fixes as well where the +1 was missing. Done via coccinelle scripts like: @@ struct resource *ptr; @@ - ptr->end - ptr->start + 1 + resource_size(ptr) and some grep and typing. Mostly uncompiled, no cross-compilers. Signed-off-by: Joe Perches Signed-off-by: Jiri Kosina --- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/probe_roms.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index e8c33a3..726494b 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1553,7 +1553,7 @@ static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) continue; /* cover the whole region */ - npages = (r->end - r->start) >> PAGE_SHIFT; + npages = resource_size(r) >> PAGE_SHIFT; npages++; iommu_range_reserve(tbl, r->start, npages); diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index ba0a4cc..6322803 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -234,7 +234,7 @@ void __init probe_roms(void) /* check for extension rom (ignore length byte!) */ rom = isa_bus_to_virt(extension_rom_resource.start); if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; + length = resource_size(&extension_rom_resource); if (romchecksum(rom, length)) { request_resource(&iomem_resource, &extension_rom_resource); upper = extension_rom_resource.start; -- cgit v1.1 From 39c555460cbc6d35656878c89d4664fbd6c81551 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 9 Jun 2011 11:52:27 +0200 Subject: x86/amd-iommu: Remove redundant device_flush_dte() calls Remove these function calls from places where the function has already been called by another function. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 7c3a95e..cc6f6da6 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1798,7 +1798,6 @@ static int device_change_notifier(struct notifier_block *nb, goto out; } - device_flush_dte(dev); iommu_completion_wait(iommu); out: @@ -2605,7 +2604,6 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, if (!iommu) return; - device_flush_dte(dev); iommu_completion_wait(iommu); } -- cgit v1.1 From 8fa5f802abf3cd374b5f07418cea72c5d9d204cc Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 9 Jun 2011 12:24:45 +0200 Subject: x86/amd-iommu: Introduce global dev_data_list This list keeps all allocated iommu_dev_data structs in a list together. This is needed for instances that have no associated device. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 1 + arch/x86/kernel/amd_iommu.c | 56 ++++++++++++++++++++++++++++------ 2 files changed, 48 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 4c99829..35520e3 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -310,6 +310,7 @@ struct protection_domain { */ struct iommu_dev_data { struct list_head list; /* For domain->dev_list */ + struct list_head dev_data_list; /* For global dev_data_list */ struct device *dev; /* Device this data belong to */ struct device *alias; /* The Alias Device */ struct protection_domain *domain; /* Domain the device is bound to */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index cc6f6da6..8b84890 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -45,6 +45,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); static LIST_HEAD(iommu_pd_list); static DEFINE_SPINLOCK(iommu_pd_list_lock); +/* List of all available dev_data structures */ +static LIST_HEAD(dev_data_list); +static DEFINE_SPINLOCK(dev_data_list_lock); + /* * Domain for untranslated devices - only allocated * if iommu=pt passed on kernel cmd line. @@ -68,6 +72,35 @@ static void update_domain(struct protection_domain *domain); * ****************************************************************************/ +static struct iommu_dev_data *alloc_dev_data(void) +{ + struct iommu_dev_data *dev_data; + unsigned long flags; + + dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return NULL; + + atomic_set(&dev_data->bind, 0); + + spin_lock_irqsave(&dev_data_list_lock, flags); + list_add_tail(&dev_data->dev_data_list, &dev_data_list); + spin_unlock_irqrestore(&dev_data_list_lock, flags); + + return dev_data; +} + +static void free_dev_data(struct iommu_dev_data *dev_data) +{ + unsigned long flags; + + spin_lock_irqsave(&dev_data_list_lock, flags); + list_del(&dev_data->dev_data_list); + spin_unlock_irqrestore(&dev_data_list_lock, flags); + + kfree(dev_data); +} + static inline u16 get_device_id(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); @@ -139,32 +172,28 @@ static int iommu_init_device(struct device *dev) { struct iommu_dev_data *dev_data; struct pci_dev *pdev; - u16 devid, alias; + u16 alias; if (dev->archdata.iommu) return 0; - dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); + dev_data = alloc_dev_data(); if (!dev_data) return -ENOMEM; dev_data->dev = dev; - devid = get_device_id(dev); - alias = amd_iommu_alias_table[devid]; + alias = amd_iommu_alias_table[get_device_id(dev)]; pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); if (pdev) dev_data->alias = &pdev->dev; else { - kfree(dev_data); + free_dev_data(dev_data); return -ENOTSUPP; } - atomic_set(&dev_data->bind, 0); - dev->archdata.iommu = dev_data; - return 0; } @@ -184,11 +213,16 @@ static void iommu_ignore_device(struct device *dev) static void iommu_uninit_device(struct device *dev) { - kfree(dev->archdata.iommu); + /* + * Nothing to do here - we keep dev_data around for unplugged devices + * and reuse it when the device is re-plugged - not doing so would + * introduce a ton of races. + */ } void __init amd_iommu_uninit_devices(void) { + struct iommu_dev_data *dev_data, *n; struct pci_dev *pdev = NULL; for_each_pci_dev(pdev) { @@ -198,6 +232,10 @@ void __init amd_iommu_uninit_devices(void) iommu_uninit_device(&pdev->dev); } + + /* Free all of our dev_data structures */ + list_for_each_entry_safe(dev_data, n, &dev_data_list, dev_data_list) + free_dev_data(dev_data); } int __init amd_iommu_init_devices(void) -- cgit v1.1 From f62dda66b5601396c28b563cc18b481af04a91b3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 9 Jun 2011 12:55:35 +0200 Subject: x86/amd-iommu: Store devid in dev_data This allows to use dev_data independent of struct device later. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 1 + arch/x86/kernel/amd_iommu.c | 39 +++++++++++++--------------------- 2 files changed, 16 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 35520e3..2833862 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -315,6 +315,7 @@ struct iommu_dev_data { struct device *alias; /* The Alias Device */ struct protection_domain *domain; /* Domain the device is bound to */ atomic_t bind; /* Domain attach reverent count */ + u16 devid; /* PCI Device ID */ }; /* diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8b84890..4e8b176 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -72,7 +72,7 @@ static void update_domain(struct protection_domain *domain); * ****************************************************************************/ -static struct iommu_dev_data *alloc_dev_data(void) +static struct iommu_dev_data *alloc_dev_data(u16 devid) { struct iommu_dev_data *dev_data; unsigned long flags; @@ -81,6 +81,7 @@ static struct iommu_dev_data *alloc_dev_data(void) if (!dev_data) return NULL; + dev_data->devid = devid; atomic_set(&dev_data->bind, 0); spin_lock_irqsave(&dev_data_list_lock, flags); @@ -177,13 +178,13 @@ static int iommu_init_device(struct device *dev) if (dev->archdata.iommu) return 0; - dev_data = alloc_dev_data(); + dev_data = alloc_dev_data(get_device_id(dev)); if (!dev_data) return -ENOMEM; dev_data->dev = dev; - alias = amd_iommu_alias_table[get_device_id(dev)]; + alias = amd_iommu_alias_table[dev_data->devid]; pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); if (pdev) dev_data->alias = &pdev->dev; @@ -714,16 +715,16 @@ static int device_flush_iotlb(struct device *dev, u64 address, size_t size) */ static int device_flush_dte(struct device *dev) { + struct iommu_dev_data *dev_data; struct amd_iommu *iommu; struct pci_dev *pdev; - u16 devid; int ret; - pdev = to_pci_dev(dev); - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; + pdev = to_pci_dev(dev); + dev_data = get_dev_data(dev); + iommu = amd_iommu_rlookup_table[dev_data->devid]; - ret = iommu_flush_dte(iommu, devid); + ret = iommu_flush_dte(iommu, dev_data->devid); if (ret) return ret; @@ -1570,11 +1571,9 @@ static void do_attach(struct device *dev, struct protection_domain *domain) struct amd_iommu *iommu; struct pci_dev *pdev; bool ats = false; - u16 devid; - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; dev_data = get_dev_data(dev); + iommu = amd_iommu_rlookup_table[dev_data->devid]; pdev = to_pci_dev(dev); if (amd_iommu_iotlb_sup) @@ -1583,7 +1582,7 @@ static void do_attach(struct device *dev, struct protection_domain *domain) /* Update data structures */ dev_data->domain = domain; list_add(&dev_data->list, &domain->dev_list); - set_dte_entry(devid, domain, ats); + set_dte_entry(dev_data->devid, domain, ats); /* Do reference counting */ domain->dev_iommu[iommu->index] += 1; @@ -1597,11 +1596,9 @@ static void do_detach(struct device *dev) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; - u16 devid; - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; dev_data = get_dev_data(dev); + iommu = amd_iommu_rlookup_table[dev_data->devid]; /* decrease reference counters */ dev_data->domain->dev_iommu[iommu->index] -= 1; @@ -1610,7 +1607,7 @@ static void do_detach(struct device *dev) /* Update data structures */ dev_data->domain = NULL; list_del(&dev_data->list); - clear_dte_entry(devid); + clear_dte_entry(dev_data->devid); /* Flush the DTE entry */ device_flush_dte(dev); @@ -1760,9 +1757,7 @@ static struct protection_domain *domain_for_device(struct device *dev) struct protection_domain *dom; struct iommu_dev_data *dev_data, *alias_data; unsigned long flags; - u16 devid; - devid = get_device_id(dev); dev_data = get_dev_data(dev); alias_data = get_dev_data(dev_data->alias); if (!alias_data) @@ -1897,8 +1892,7 @@ static void update_device_table(struct protection_domain *domain) list_for_each_entry(dev_data, &domain->dev_list, list) { struct pci_dev *pdev = to_pci_dev(dev_data->dev); - u16 devid = get_device_id(dev_data->dev); - set_dte_entry(devid, domain, pci_ats_enabled(pdev)); + set_dte_entry(dev_data->devid, domain, pci_ats_enabled(pdev)); } } @@ -2652,16 +2646,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct iommu_dev_data *dev_data; struct amd_iommu *iommu; int ret; - u16 devid; if (!check_device(dev)) return -EINVAL; dev_data = dev->archdata.iommu; - devid = get_device_id(dev); - - iommu = amd_iommu_rlookup_table[devid]; + iommu = amd_iommu_rlookup_table[dev_data->devid]; if (!iommu) return -EINVAL; -- cgit v1.1 From ea61cddb9dcb9da9bcabd40f1620c24abaf645c9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 9 Jun 2011 12:56:30 +0200 Subject: x86/amd-iommu: Store ATS state in dev_data This allows the low-level functions to operate on dev_data exclusivly later. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 4 ++++ arch/x86/kernel/amd_iommu.c | 44 ++++++++++++++++++---------------- 2 files changed, 28 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 2833862..9fc045e 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -316,6 +316,10 @@ struct iommu_dev_data { struct protection_domain *domain; /* Domain the device is bound to */ atomic_t bind; /* Domain attach reverent count */ u16 devid; /* PCI Device ID */ + struct { + bool enabled; + int qdep; + } ats; /* ATS state */ }; /* diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 4e8b176..9e07ef6 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -695,17 +695,16 @@ void iommu_flush_all_caches(struct amd_iommu *iommu) */ static int device_flush_iotlb(struct device *dev, u64 address, size_t size) { - struct pci_dev *pdev = to_pci_dev(dev); + struct iommu_dev_data *dev_data; struct amd_iommu *iommu; struct iommu_cmd cmd; - u16 devid; int qdep; - qdep = pci_ats_queue_depth(pdev); - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); + qdep = dev_data->ats.qdep; + iommu = amd_iommu_rlookup_table[dev_data->devid]; - build_inv_iotlb_pages(&cmd, devid, qdep, address, size); + build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size); return iommu_queue_command(iommu, &cmd); } @@ -728,7 +727,7 @@ static int device_flush_dte(struct device *dev) if (ret) return ret; - if (pci_ats_enabled(pdev)) + if (dev_data->ats.enabled) ret = device_flush_iotlb(dev, 0, ~0UL); return ret; @@ -760,9 +759,8 @@ static void __domain_flush_pages(struct protection_domain *domain, } list_for_each_entry(dev_data, &domain->dev_list, list) { - struct pci_dev *pdev = to_pci_dev(dev_data->dev); - if (!pci_ats_enabled(pdev)) + if (!dev_data->ats.enabled) continue; ret |= device_flush_iotlb(dev_data->dev, address, size); @@ -1576,8 +1574,7 @@ static void do_attach(struct device *dev, struct protection_domain *domain) iommu = amd_iommu_rlookup_table[dev_data->devid]; pdev = to_pci_dev(dev); - if (amd_iommu_iotlb_sup) - ats = pci_ats_enabled(pdev); + ats = dev_data->ats.enabled; /* Update data structures */ dev_data->domain = domain; @@ -1674,11 +1671,16 @@ static int attach_device(struct device *dev, struct protection_domain *domain) { struct pci_dev *pdev = to_pci_dev(dev); + struct iommu_dev_data *dev_data; unsigned long flags; int ret; - if (amd_iommu_iotlb_sup) - pci_enable_ats(pdev, PAGE_SHIFT); + dev_data = get_dev_data(dev); + + if (amd_iommu_iotlb_sup && pci_enable_ats(pdev, PAGE_SHIFT) == 0) { + dev_data->ats.enabled = true; + dev_data->ats.qdep = pci_ats_queue_depth(pdev); + } write_lock_irqsave(&amd_iommu_devtable_lock, flags); ret = __attach_device(dev, domain); @@ -1736,7 +1738,7 @@ static void __detach_device(struct device *dev) */ static void detach_device(struct device *dev) { - struct pci_dev *pdev = to_pci_dev(dev); + struct iommu_dev_data *dev_data; unsigned long flags; /* lock device table */ @@ -1744,8 +1746,12 @@ static void detach_device(struct device *dev) __detach_device(dev); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev)) - pci_disable_ats(pdev); + dev_data = get_dev_data(dev); + + if (dev_data->ats.enabled) { + pci_disable_ats(to_pci_dev(dev)); + dev_data->ats.enabled = false; + } } /* @@ -1890,10 +1896,8 @@ static void update_device_table(struct protection_domain *domain) { struct iommu_dev_data *dev_data; - list_for_each_entry(dev_data, &domain->dev_list, list) { - struct pci_dev *pdev = to_pci_dev(dev_data->dev); - set_dte_entry(dev_data->devid, domain, pci_ats_enabled(pdev)); - } + list_for_each_entry(dev_data, &domain->dev_list, list) + set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled); } static void update_domain(struct protection_domain *domain) -- cgit v1.1 From 6c542047937ad66db1d31e290fd2a3b290424ffa Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 9 Jun 2011 17:07:31 +0200 Subject: x86/amd-iommu: Use only dev_data for dte and iotlb flushing routines This patch make the functions flushing the DTE and IOTLBs only take the dev_data structure instead of the struct device directly. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 9e07ef6..0a5b465 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -693,14 +693,13 @@ void iommu_flush_all_caches(struct amd_iommu *iommu) /* * Command send function for flushing on-device TLB */ -static int device_flush_iotlb(struct device *dev, u64 address, size_t size) +static int device_flush_iotlb(struct iommu_dev_data *dev_data, + u64 address, size_t size) { - struct iommu_dev_data *dev_data; struct amd_iommu *iommu; struct iommu_cmd cmd; int qdep; - dev_data = get_dev_data(dev); qdep = dev_data->ats.qdep; iommu = amd_iommu_rlookup_table[dev_data->devid]; @@ -712,23 +711,19 @@ static int device_flush_iotlb(struct device *dev, u64 address, size_t size) /* * Command send function for invalidating a device table entry */ -static int device_flush_dte(struct device *dev) +static int device_flush_dte(struct iommu_dev_data *dev_data) { - struct iommu_dev_data *dev_data; struct amd_iommu *iommu; - struct pci_dev *pdev; int ret; - pdev = to_pci_dev(dev); - dev_data = get_dev_data(dev); - iommu = amd_iommu_rlookup_table[dev_data->devid]; + iommu = amd_iommu_rlookup_table[dev_data->devid]; ret = iommu_flush_dte(iommu, dev_data->devid); if (ret) return ret; if (dev_data->ats.enabled) - ret = device_flush_iotlb(dev, 0, ~0UL); + ret = device_flush_iotlb(dev_data, 0, ~0UL); return ret; } @@ -763,7 +758,7 @@ static void __domain_flush_pages(struct protection_domain *domain, if (!dev_data->ats.enabled) continue; - ret |= device_flush_iotlb(dev_data->dev, address, size); + ret |= device_flush_iotlb(dev_data, address, size); } WARN_ON(ret); @@ -815,7 +810,7 @@ static void domain_flush_devices(struct protection_domain *domain) spin_lock_irqsave(&domain->lock, flags); list_for_each_entry(dev_data, &domain->dev_list, list) - device_flush_dte(dev_data->dev); + device_flush_dte(dev_data); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1586,7 +1581,7 @@ static void do_attach(struct device *dev, struct protection_domain *domain) domain->dev_cnt += 1; /* Flush the DTE entry */ - device_flush_dte(dev); + device_flush_dte(dev_data); } static void do_detach(struct device *dev) @@ -1607,7 +1602,7 @@ static void do_detach(struct device *dev) clear_dte_entry(dev_data->devid); /* Flush the DTE entry */ - device_flush_dte(dev); + device_flush_dte(dev_data); } /* -- cgit v1.1 From ec9e79ef062272a119994c4fdd5e434f3e1bf352 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 9 Jun 2011 17:25:50 +0200 Subject: x86/amd-iommu: Use only dev_data in low-level domain attach/detach functions With this patch the low-level attach/detach functions only work on dev_data structures. This allows to remove the dev_data->dev pointer. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 1 - arch/x86/kernel/amd_iommu.c | 61 +++++++++++++--------------------- 2 files changed, 24 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 9fc045e..fffbe3c 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -311,7 +311,6 @@ struct protection_domain { struct iommu_dev_data { struct list_head list; /* For domain->dev_list */ struct list_head dev_data_list; /* For global dev_data_list */ - struct device *dev; /* Device this data belong to */ struct device *alias; /* The Alias Device */ struct protection_domain *domain; /* Domain the device is bound to */ atomic_t bind; /* Domain attach reverent count */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0a5b465..a15a172 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -182,8 +182,6 @@ static int iommu_init_device(struct device *dev) if (!dev_data) return -ENOMEM; - dev_data->dev = dev; - alias = amd_iommu_alias_table[dev_data->devid]; pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); if (pdev) @@ -1558,18 +1556,14 @@ static void clear_dte_entry(u16 devid) amd_iommu_apply_erratum_63(devid); } -static void do_attach(struct device *dev, struct protection_domain *domain) +static void do_attach(struct iommu_dev_data *dev_data, + struct protection_domain *domain) { - struct iommu_dev_data *dev_data; struct amd_iommu *iommu; - struct pci_dev *pdev; - bool ats = false; - - dev_data = get_dev_data(dev); - iommu = amd_iommu_rlookup_table[dev_data->devid]; - pdev = to_pci_dev(dev); + bool ats; - ats = dev_data->ats.enabled; + iommu = amd_iommu_rlookup_table[dev_data->devid]; + ats = dev_data->ats.enabled; /* Update data structures */ dev_data->domain = domain; @@ -1584,13 +1578,11 @@ static void do_attach(struct device *dev, struct protection_domain *domain) device_flush_dte(dev_data); } -static void do_detach(struct device *dev) +static void do_detach(struct iommu_dev_data *dev_data) { - struct iommu_dev_data *dev_data; struct amd_iommu *iommu; - dev_data = get_dev_data(dev); - iommu = amd_iommu_rlookup_table[dev_data->devid]; + iommu = amd_iommu_rlookup_table[dev_data->devid]; /* decrease reference counters */ dev_data->domain->dev_iommu[iommu->index] -= 1; @@ -1609,13 +1601,12 @@ static void do_detach(struct device *dev) * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static int __attach_device(struct device *dev, +static int __attach_device(struct iommu_dev_data *dev_data, struct protection_domain *domain) { - struct iommu_dev_data *dev_data, *alias_data; + struct iommu_dev_data *alias_data; int ret; - dev_data = get_dev_data(dev); alias_data = get_dev_data(dev_data->alias); if (!alias_data) @@ -1635,16 +1626,15 @@ static int __attach_device(struct device *dev, goto out_unlock; /* Do real assignment */ - if (dev_data->alias != dev) { - alias_data = get_dev_data(dev_data->alias); + if (dev_data->devid != alias_data->devid) { if (alias_data->domain == NULL) - do_attach(dev_data->alias, domain); + do_attach(alias_data, domain); atomic_inc(&alias_data->bind); } if (dev_data->domain == NULL) - do_attach(dev, domain); + do_attach(dev_data, domain); atomic_inc(&dev_data->bind); @@ -1678,7 +1668,7 @@ static int attach_device(struct device *dev, } write_lock_irqsave(&amd_iommu_devtable_lock, flags); - ret = __attach_device(dev, domain); + ret = __attach_device(dev_data, domain); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); /* @@ -1694,9 +1684,8 @@ static int attach_device(struct device *dev, /* * Removes a device from a protection domain (unlocked) */ -static void __detach_device(struct device *dev) +static void __detach_device(struct iommu_dev_data *dev_data) { - struct iommu_dev_data *dev_data = get_dev_data(dev); struct iommu_dev_data *alias_data; struct protection_domain *domain; unsigned long flags; @@ -1707,14 +1696,14 @@ static void __detach_device(struct device *dev) spin_lock_irqsave(&domain->lock, flags); - if (dev_data->alias != dev) { - alias_data = get_dev_data(dev_data->alias); + alias_data = get_dev_data(dev_data->alias); + if (dev_data->devid != alias_data->devid) { if (atomic_dec_and_test(&alias_data->bind)) - do_detach(dev_data->alias); + do_detach(alias_data); } if (atomic_dec_and_test(&dev_data->bind)) - do_detach(dev); + do_detach(dev_data); spin_unlock_irqrestore(&domain->lock, flags); @@ -1725,7 +1714,7 @@ static void __detach_device(struct device *dev) */ if (iommu_pass_through && (dev_data->domain == NULL && domain != pt_domain)) - __attach_device(dev, pt_domain); + __attach_device(dev_data, pt_domain); } /* @@ -1736,13 +1725,13 @@ static void detach_device(struct device *dev) struct iommu_dev_data *dev_data; unsigned long flags; + dev_data = get_dev_data(dev); + /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __detach_device(dev); + __detach_device(dev_data); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - dev_data = get_dev_data(dev); - if (dev_data->ats.enabled) { pci_disable_ats(to_pci_dev(dev)); dev_data->ats.enabled = false; @@ -1768,7 +1757,7 @@ static struct protection_domain *domain_for_device(struct device *dev) dom = dev_data->domain; if (dom == NULL && alias_data->domain != NULL) { - __attach_device(dev, alias_data->domain); + __attach_device(dev_data, alias_data->domain); dom = alias_data->domain; } @@ -2527,9 +2516,7 @@ static void cleanup_domain(struct protection_domain *domain) write_lock_irqsave(&amd_iommu_devtable_lock, flags); list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { - struct device *dev = dev_data->dev; - - __detach_device(dev); + __detach_device(dev_data); atomic_set(&dev_data->bind, 0); } -- cgit v1.1 From 2b02b091ab3d1685926bfc76486bf7a22d92979a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 9 Jun 2011 17:48:39 +0200 Subject: x86/amd-iommu: Allow dev_data->alias to be NULL Let dev_data->alias be just NULL if the device has no alias. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 77 ++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a15a172..2d914a4 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -183,12 +183,14 @@ static int iommu_init_device(struct device *dev) return -ENOMEM; alias = amd_iommu_alias_table[dev_data->devid]; - pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); - if (pdev) - dev_data->alias = &pdev->dev; - else { - free_dev_data(dev_data); - return -ENOTSUPP; + if (alias != dev_data->devid) { + pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); + if (pdev) + dev_data->alias = &pdev->dev; + else { + free_dev_data(dev_data); + return -ENOTSUPP; + } } dev->archdata.iommu = dev_data; @@ -1604,29 +1606,31 @@ static void do_detach(struct iommu_dev_data *dev_data) static int __attach_device(struct iommu_dev_data *dev_data, struct protection_domain *domain) { - struct iommu_dev_data *alias_data; int ret; - alias_data = get_dev_data(dev_data->alias); - - if (!alias_data) - return -EINVAL; - /* lock domain */ spin_lock(&domain->lock); - /* Some sanity checks */ - ret = -EBUSY; - if (alias_data->domain != NULL && - alias_data->domain != domain) - goto out_unlock; + if (dev_data->alias != NULL) { + struct iommu_dev_data *alias_data; + + alias_data = get_dev_data(dev_data->alias); + + ret = -EINVAL; + if (!alias_data) + goto out_unlock; - if (dev_data->domain != NULL && - dev_data->domain != domain) - goto out_unlock; + /* Some sanity checks */ + ret = -EBUSY; + if (alias_data->domain != NULL && + alias_data->domain != domain) + goto out_unlock; - /* Do real assignment */ - if (dev_data->devid != alias_data->devid) { + if (dev_data->domain != NULL && + dev_data->domain != domain) + goto out_unlock; + + /* Do real assignment */ if (alias_data->domain == NULL) do_attach(alias_data, domain); @@ -1696,8 +1700,8 @@ static void __detach_device(struct iommu_dev_data *dev_data) spin_lock_irqsave(&domain->lock, flags); - alias_data = get_dev_data(dev_data->alias); - if (dev_data->devid != alias_data->devid) { + if (dev_data->alias) { + alias_data = get_dev_data(dev_data->alias); if (atomic_dec_and_test(&alias_data->bind)) do_detach(alias_data); } @@ -1744,24 +1748,25 @@ static void detach_device(struct device *dev) */ static struct protection_domain *domain_for_device(struct device *dev) { - struct protection_domain *dom; struct iommu_dev_data *dev_data, *alias_data; + struct protection_domain *dom = NULL; unsigned long flags; dev_data = get_dev_data(dev); - alias_data = get_dev_data(dev_data->alias); - if (!alias_data) - return NULL; - read_lock_irqsave(&amd_iommu_devtable_lock, flags); - dom = dev_data->domain; - if (dom == NULL && - alias_data->domain != NULL) { - __attach_device(dev_data, alias_data->domain); - dom = alias_data->domain; - } + if (dev_data->domain) + return dev_data->domain; - read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + if (dev_data->alias != NULL) { + alias_data = get_dev_data(dev_data->alias); + + read_lock_irqsave(&amd_iommu_devtable_lock, flags); + if (alias_data->domain != NULL) { + __attach_device(dev_data, alias_data->domain); + dom = alias_data->domain; + } + read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + } return dom; } -- cgit v1.1 From 3b03bb745eb9fcafc6ef80fcbd04f0f0512acdc9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 9 Jun 2011 18:53:25 +0200 Subject: x86/amd-iommu: Search for existind dev_data before allocting a new one Search for existing dev_data first will allow to switch dev_data->alias to just store dev_data instead of struct device. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 2d914a4..f647998 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -102,6 +102,37 @@ static void free_dev_data(struct iommu_dev_data *dev_data) kfree(dev_data); } +static struct iommu_dev_data *search_dev_data(u16 devid) +{ + struct iommu_dev_data *dev_data; + unsigned long flags; + + spin_lock_irqsave(&dev_data_list_lock, flags); + list_for_each_entry(dev_data, &dev_data_list, dev_data_list) { + if (dev_data->devid == devid) + goto out_unlock; + } + + dev_data = NULL; + +out_unlock: + spin_unlock_irqrestore(&dev_data_list_lock, flags); + + return dev_data; +} + +static struct iommu_dev_data *find_dev_data(u16 devid) +{ + struct iommu_dev_data *dev_data; + + dev_data = search_dev_data(devid); + + if (dev_data == NULL) + dev_data = alloc_dev_data(devid); + + return dev_data; +} + static inline u16 get_device_id(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); @@ -178,7 +209,7 @@ static int iommu_init_device(struct device *dev) if (dev->archdata.iommu) return 0; - dev_data = alloc_dev_data(get_device_id(dev)); + dev_data = find_dev_data(get_device_id(dev)); if (!dev_data) return -ENOMEM; -- cgit v1.1 From 71f77580906b60eb43daff0efb99792e76a3d06d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 9 Jun 2011 19:03:15 +0200 Subject: x86/amd-iommu: Store device alias as dev_data pointer This finally allows PCI-Device-IDs to be handled by the IOMMU driver that have no corresponding struct device present in the system. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 2 +- arch/x86/kernel/amd_iommu.c | 34 +++++++++++++++------------------- 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index fffbe3c..5b9c507 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -311,7 +311,7 @@ struct protection_domain { struct iommu_dev_data { struct list_head list; /* For domain->dev_list */ struct list_head dev_data_list; /* For global dev_data_list */ - struct device *alias; /* The Alias Device */ + struct iommu_dev_data *alias_data;/* The alias dev_data */ struct protection_domain *domain; /* Domain the device is bound to */ atomic_t bind; /* Domain attach reverent count */ u16 devid; /* PCI Device ID */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f647998..0ed1c94 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -203,7 +203,6 @@ static bool check_device(struct device *dev) static int iommu_init_device(struct device *dev) { struct iommu_dev_data *dev_data; - struct pci_dev *pdev; u16 alias; if (dev->archdata.iommu) @@ -215,13 +214,16 @@ static int iommu_init_device(struct device *dev) alias = amd_iommu_alias_table[dev_data->devid]; if (alias != dev_data->devid) { - pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); - if (pdev) - dev_data->alias = &pdev->dev; - else { + struct iommu_dev_data *alias_data; + + alias_data = find_dev_data(alias); + if (alias_data == NULL) { + pr_err("AMD-Vi: Warning: Unhandled device %s\n", + dev_name(dev)); free_dev_data(dev_data); return -ENOTSUPP; } + dev_data->alias_data = alias_data; } dev->archdata.iommu = dev_data; @@ -1642,14 +1644,8 @@ static int __attach_device(struct iommu_dev_data *dev_data, /* lock domain */ spin_lock(&domain->lock); - if (dev_data->alias != NULL) { - struct iommu_dev_data *alias_data; - - alias_data = get_dev_data(dev_data->alias); - - ret = -EINVAL; - if (!alias_data) - goto out_unlock; + if (dev_data->alias_data != NULL) { + struct iommu_dev_data *alias_data = dev_data->alias_data; /* Some sanity checks */ ret = -EBUSY; @@ -1721,7 +1717,6 @@ static int attach_device(struct device *dev, */ static void __detach_device(struct iommu_dev_data *dev_data) { - struct iommu_dev_data *alias_data; struct protection_domain *domain; unsigned long flags; @@ -1731,8 +1726,9 @@ static void __detach_device(struct iommu_dev_data *dev_data) spin_lock_irqsave(&domain->lock, flags); - if (dev_data->alias) { - alias_data = get_dev_data(dev_data->alias); + if (dev_data->alias_data != NULL) { + struct iommu_dev_data *alias_data = dev_data->alias_data; + if (atomic_dec_and_test(&alias_data->bind)) do_detach(alias_data); } @@ -1779,7 +1775,7 @@ static void detach_device(struct device *dev) */ static struct protection_domain *domain_for_device(struct device *dev) { - struct iommu_dev_data *dev_data, *alias_data; + struct iommu_dev_data *dev_data; struct protection_domain *dom = NULL; unsigned long flags; @@ -1788,8 +1784,8 @@ static struct protection_domain *domain_for_device(struct device *dev) if (dev_data->domain) return dev_data->domain; - if (dev_data->alias != NULL) { - alias_data = get_dev_data(dev_data->alias); + if (dev_data->alias_data != NULL) { + struct iommu_dev_data *alias_data = dev_data->alias_data; read_lock_irqsave(&amd_iommu_devtable_lock, flags); if (alias_data->domain != NULL) { -- cgit v1.1 From ab493a0f0f55d28636ac860ea682d57b84257f10 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Thu, 2 Jun 2011 02:48:05 +0300 Subject: drivers: iommu: move to a dedicated folder Create a dedicated folder for iommu drivers, and move the base iommu implementation over there. Grouping the various iommu drivers in a single location will help finding similar problems shared by different platforms, so they could be solved once, in the iommu framework, instead of solved differently (or duplicated) in each driver. Signed-off-by: Ohad Ben-Cohen Signed-off-by: Joerg Roedel --- arch/x86/Kconfig | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da34972..460d573 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -685,6 +685,7 @@ config AMD_IOMMU select SWIOTLB select PCI_MSI select PCI_IOV + select IOMMU_API depends on X86_64 && PCI && ACPI ---help--- With this option you can enable support for AMD IOMMU hardware in @@ -720,9 +721,6 @@ config SWIOTLB config IOMMU_HELPER def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) -config IOMMU_API - def_bool (AMD_IOMMU || DMAR) - config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL @@ -1945,6 +1943,7 @@ config PCI_CNB20LE_QUIRK config DMAR bool "Support for DMA Remapping Devices (EXPERIMENTAL)" depends on PCI_MSI && ACPI && EXPERIMENTAL + select IOMMU_API help DMA remapping (DMAR) devices support enables independent address translations for Direct Memory Access (DMA) from devices. -- cgit v1.1 From 395810627b6a43c8d0ec948884043946fa162308 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 8 Jun 2011 16:09:21 +0900 Subject: x86: Swap save_stack_trace_regs parameters Swap the 1st and 2nd parameters of save_stack_trace_regs() as same as the parameters of save_stack_trace_tsk(). Signed-off-by: Masami Hiramatsu Cc: yrl.pp-manager.tt@hitachi.com Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20110608070921.17777.31103.stgit@fedora15 Signed-off-by: Steven Rostedt --- arch/x86/kernel/stacktrace.c | 2 +- arch/x86/mm/kmemcheck/error.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 55d9bc0..fdd0c64 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -66,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace); -void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) +void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index 704a37c..dab4187 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state, e->trace.entries = e->trace_entries; e->trace.max_entries = ARRAY_SIZE(e->trace_entries); e->trace.skip = 0; - save_stack_trace_regs(&e->trace, regs); + save_stack_trace_regs(regs, &e->trace); /* Round address down to nearest 16 bytes */ shadow_copy = kmemcheck_shadow_lookup(address -- cgit v1.1 From a0e3e70243f5b270bc3eca718f0a9fa5e6b8262e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 3 Jun 2011 16:37:47 +0200 Subject: oprofile, x86: Fix nmi-unsafe callgraph support Current oprofile's x86 callgraph support may trigger page faults throwing the BUG_ON(in_nmi()) message below. This patch fixes this by using the same nmi-safe copy-from-user code as in perf. ------------[ cut here ]------------ kernel BUG at .../arch/x86/kernel/traps.c:436! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/devices/pci0000:00/0000:00:0a.0/0000:07:00.0/0000:08:04.0/net/eth0/broadcast CPU 5 Modules linked in: Pid: 8611, comm: opcontrol Not tainted 2.6.39-00007-gfe47ae7 #1 Advanced Micro Device Anaheim/Anaheim RIP: 0010:[] [] do_nmi+0x22/0x1ee RSP: 0000:ffff88042fd47f28 EFLAGS: 00010002 RAX: ffff88042c0a7fd8 RBX: 0000000000000001 RCX: 00000000c0000101 RDX: 00000000ffff8804 RSI: ffffffffffffffff RDI: ffff88042fd47f58 RBP: ffff88042fd47f48 R08: 0000000000000004 R09: 0000000000001484 R10: 0000000000000001 R11: 0000000000000000 R12: ffff88042fd47f58 R13: 0000000000000000 R14: ffff88042fd47d98 R15: 0000000000000020 FS: 00007fca25e56700(0000) GS:ffff88042fd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000074 CR3: 000000042d28b000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process opcontrol (pid: 8611, threadinfo ffff88042c0a6000, task ffff88042c532310) Stack: 0000000000000000 0000000000000001 ffff88042c0a7fd8 0000000000000000 ffff88042fd47de8 ffffffff813e897a 0000000000000020 ffff88042fd47d98 0000000000000000 ffff88042c0a7fd8 ffff88042fd47de8 0000000000000074 Call Trace: [] nmi+0x1a/0x20 [] ? bad_to_user+0x25/0x771 <> Code: ff 59 5b 41 5c 41 5d c9 c3 55 65 48 8b 04 25 88 b5 00 00 48 89 e5 41 55 41 54 49 89 fc 53 48 83 ec 08 f6 80 47 e0 ff ff 04 74 04 <0f> 0b eb fe 81 80 44 e0 ff ff 00 00 01 04 65 ff 04 25 c4 0f 01 RIP [] do_nmi+0x22/0x1ee RSP ---[ end trace ed6752185092104b ]--- Kernel panic - not syncing: Fatal exception in interrupt Pid: 8611, comm: opcontrol Tainted: G D 2.6.39-00007-gfe47ae7 #1 Call Trace: [] panic+0x8c/0x188 [] oops_end+0x81/0x8e [] die+0x55/0x5e [] do_trap+0x11c/0x12b [] do_invalid_op+0x91/0x9a [] ? do_nmi+0x22/0x1ee [] ? oprofile_add_sample+0x83/0x95 [] ? op_amd_check_ctrs+0x4f/0x2cf [] invalid_op+0x15/0x20 [] ? do_nmi+0x22/0x1ee [] ? do_nmi+0x67/0x1ee [] nmi+0x1a/0x20 [] ? bad_to_user+0x25/0x771 <> Cc: John Lumby Cc: Maynard Johnson Cc: # .37+ Signed-off-by: Robert Richter --- arch/x86/oprofile/backtrace.c | 56 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index a5b64ab..32f78eb 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -11,10 +11,12 @@ #include #include #include +#include +#include + #include #include #include -#include static int backtrace_stack(void *data, char *name) { @@ -36,17 +38,53 @@ static struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack, }; +/* from arch/x86/kernel/cpu/perf_event.c: */ + +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; + + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; + + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + + map = kmap_atomic(page); + memcpy(to, map+offset, size); + kunmap_atomic(map); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} + #ifdef CONFIG_COMPAT static struct stack_frame_ia32 * dump_user_backtrace_32(struct stack_frame_ia32 *head) { + /* Also check accessibility of one struct frame_head beyond: */ struct stack_frame_ia32 bufhead[2]; struct stack_frame_ia32 *fp; + unsigned long bytes; - /* Also check accessibility of one struct frame_head beyond */ - if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) - return NULL; - if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) + bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); + if (bytes != sizeof(bufhead)) return NULL; fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); @@ -87,12 +125,12 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) static struct stack_frame *dump_user_backtrace(struct stack_frame *head) { + /* Also check accessibility of one struct frame_head beyond: */ struct stack_frame bufhead[2]; + unsigned long bytes; - /* Also check accessibility of one struct stack_frame beyond */ - if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) - return NULL; - if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) + bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); + if (bytes != sizeof(bufhead)) return NULL; oprofile_add_trace(bufhead[0].return_address); -- cgit v1.1 From 86b445676d13f520ef9ab7aebe933aa6684ce84c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 22 Feb 2011 18:41:48 +0100 Subject: x86, microcode, AMD: Correct buf references Both the equivalence table and the microcode patch types are u32. Access them properly through the buf-ptr. Signed-off-by: Borislav Petkov --- arch/x86/kernel/microcode_amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index c561038..d30d67c 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -157,7 +157,7 @@ static int apply_microcode_amd(int cpu) static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) { struct cpuinfo_x86 *c = &cpu_data(cpu); - unsigned int max_size, actual_size; + u32 max_size, actual_size; #define F1XH_MPB_MAX_SIZE 2048 #define F14H_MPB_MAX_SIZE 1824 @@ -175,7 +175,7 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) break; } - actual_size = buf[4] + (buf[5] << 8); + actual_size = *(u32 *)(buf + 4); if (actual_size > size || actual_size > max_size) { pr_err("section size mismatch\n"); @@ -191,7 +191,7 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) struct microcode_header_amd *mc = NULL; unsigned int actual_size = 0; - if (buf[0] != UCODE_UCODE_TYPE) { + if (*(u32 *)buf != UCODE_UCODE_TYPE) { pr_err("invalid type field in container file section header\n"); goto out; } -- cgit v1.1 From 880a317abccf60b989951bf1515964cd73245970 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 7 Jun 2011 21:51:56 -0400 Subject: x86, mce, severity: Fix two severities table signatures The "Spurious not enabled" entry is redundant: the "Not enabled" entry earlier in the table will cover this case. The "Action required; unknown MCACOD" entry shouldn't specify MCACOD in the .mask field. Current code will only match for mcacod==0 rather than all AR=1 entries. Signed-off-by: Tony Luck Signed-off-by: Hidetoshi Seto Link: http://lkml.kernel.org/r/4DEED5BC.8030703@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 1e8d66c..352d16a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -69,8 +69,6 @@ static struct severity { MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", KERNEL), BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), - MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, - "Spurious not enabled", SER), /* ignore OVER for UCNA */ MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, @@ -82,7 +80,7 @@ static struct severity { /* AR add known MCACODs here */ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, "Action required with lost events", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR, PANIC, "Action required; unknown MCACOD", SER), /* known AO MCACODs: */ -- cgit v1.1 From 901d7691d3238ad68c80a567b88b1e5d614137fb Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Jun 2011 10:52:43 +0900 Subject: x86, mce, severity: Make formatting a bit more readable The table looks very complicated and hard to read for people other than skilled developers. So let's clean it up a bit. At first, change format to ease reading elements in the table. Signed-off-by: Hidetoshi Seto Acked-by: Tony Luck Link: http://lkml.kernel.org/r/4DEED5EB.6050400@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 118 ++++++++++++++++++++++-------- 1 file changed, 88 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 352d16a..eaf5a43 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -58,44 +58,102 @@ static struct severity { #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #define MCACOD 0xffff - BITCLR(MCI_STATUS_VAL, NO, "Invalid"), - BITCLR(MCI_STATUS_EN, NO, "Not enabled"), - BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), + BITCLR( + MCI_STATUS_VAL, + NO, "Invalid" + ), + BITCLR( + MCI_STATUS_EN, + NO, "Not enabled" + ), + BITSET( + MCI_STATUS_PCC, + PANIC, "Processor context corrupt" + ), /* When MCIP is not set something is very confused */ - MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), + MCGMASK( + MCG_STATUS_MCIP, 0, + PANIC, "MCIP not set in MCA handler" + ), /* Neither return not error IP -- no chance to recover -> PANIC */ - MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, - "Neither restart nor error IP"), - MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", - KERNEL), - BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), + MCGMASK( + MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, + PANIC, "Neither restart nor error IP" + ), + MCGMASK( + MCG_STATUS_RIPV, 0, + PANIC, "In kernel and no restart IP", + KERNEL + ), + BITCLR( + MCI_STATUS_UC, + KEEP, "Corrected error", + NOSER + ), /* ignore OVER for UCNA */ - MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, - "Uncorrected no action required", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, - "Illegal combination (UCNA with AR=1)", SER), - MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), + MASK( + MCI_UC_SAR, MCI_STATUS_UC, + KEEP, "Uncorrected no action required", + SER + ), + MASK( + MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, + PANIC, "Illegal combination (UCNA with AR=1)", + SER + ), + MASK( + MCI_STATUS_S, 0, + KEEP, "Non signalled machine check", + SER + ), /* AR add known MCACODs here */ - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, - "Action required with lost events", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR, PANIC, - "Action required; unknown MCACOD", SER), + MASK( + MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, + PANIC, "Action required with lost events", + SER + ), + MASK( + MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR, + PANIC, "Action required; unknown MCACOD", + SER + ), /* known AO MCACODs: */ - MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, - "Action optional: memory scrubbing error", SER), - MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, - "Action optional: last level cache writeback error", SER), - - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, - "Action optional unknown MCACOD", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, - "Action optional with lost events", SER), - BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), - BITSET(MCI_STATUS_UC, UC, "Uncorrected"), - BITSET(0, SOME, "No match") /* always matches. keep at end */ + MASK( + MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, + AO, "Action optional: memory scrubbing error", + SER + ), + MASK( + MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, + AO, "Action optional: last level cache writeback error", + SER + ), + + MASK( + MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, + SOME, "Action optional unknown MCACOD", + SER + ), + MASK( + MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, + SOME, "Action optional with lost events", + SER + ), + BITSET( + MCI_STATUS_UC|MCI_STATUS_OVER, + PANIC, "Overflowed uncorrected" + ), + BITSET( + MCI_STATUS_UC, + UC, "Uncorrected" + ), + BITSET( + 0, + SOME, "No match" + ) /* always matches. keep at end */ }; /* -- cgit v1.1 From a17957cdec69acb9e26319618b95a810a936e637 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Jun 2011 10:53:35 +0900 Subject: x86, mce, severity: Cleanup severity table The current format of an item in this table is: condition(param, ..., level, message [, condition2 ...]) So we have to check both an item's head and tail to find the conditions which match the item. Format them in a more straight forward manner: item(level, message, condition [, condition2 ...]) Signed-off-by: Hidetoshi Seto Acked-by: Tony Luck Link: http://lkml.kernel.org/r/4DEED61F.5010502@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 127 ++++++++++++++---------------- 1 file changed, 58 insertions(+), 69 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index eaf5a43..27e778e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -43,116 +43,105 @@ static struct severity { unsigned char covered; char *msg; } severities[] = { -#define KERNEL .context = IN_KERNEL -#define USER .context = IN_USER -#define SER .ser = SER_REQUIRED -#define NOSER .ser = NO_SER -#define SEV(s) .sev = MCE_ ## s ## _SEVERITY -#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } -#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } -#define MCGMASK(x, res, s, m, r...) \ - { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } -#define MASK(x, y, s, m, r...) \ - { .mask = x, .result = y, SEV(s), .msg = m, ## r } +#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER +#define BITCLR(x) .mask = x, .result = 0 +#define BITSET(x) .mask = x, .result = x +#define MCGMASK(x, y) .mcgmask = x, .mcgres = y +#define MASK(x, y) .mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #define MCACOD 0xffff - BITCLR( - MCI_STATUS_VAL, - NO, "Invalid" + MCESEV( + NO, "Invalid", + BITCLR(MCI_STATUS_VAL) ), - BITCLR( - MCI_STATUS_EN, - NO, "Not enabled" + MCESEV( + NO, "Not enabled", + BITCLR(MCI_STATUS_EN) ), - BITSET( - MCI_STATUS_PCC, - PANIC, "Processor context corrupt" + MCESEV( + PANIC, "Processor context corrupt", + BITSET(MCI_STATUS_PCC) ), /* When MCIP is not set something is very confused */ - MCGMASK( - MCG_STATUS_MCIP, 0, - PANIC, "MCIP not set in MCA handler" + MCESEV( + PANIC, "MCIP not set in MCA handler", + MCGMASK(MCG_STATUS_MCIP, 0) ), /* Neither return not error IP -- no chance to recover -> PANIC */ - MCGMASK( - MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, - PANIC, "Neither restart nor error IP" + MCESEV( + PANIC, "Neither restart nor error IP", + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) ), - MCGMASK( - MCG_STATUS_RIPV, 0, + MCESEV( PANIC, "In kernel and no restart IP", - KERNEL + KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) ), - BITCLR( - MCI_STATUS_UC, + MCESEV( KEEP, "Corrected error", - NOSER + NOSER, BITCLR(MCI_STATUS_UC) ), /* ignore OVER for UCNA */ - MASK( - MCI_UC_SAR, MCI_STATUS_UC, + MCESEV( KEEP, "Uncorrected no action required", - SER + SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) ), - MASK( - MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, + MCESEV( PANIC, "Illegal combination (UCNA with AR=1)", - SER + SER, + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) ), - MASK( - MCI_STATUS_S, 0, + MCESEV( KEEP, "Non signalled machine check", - SER + SER, MASK(MCI_STATUS_S, 0) ), /* AR add known MCACODs here */ - MASK( - MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, + MCESEV( PANIC, "Action required with lost events", - SER + SER, + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR) ), - MASK( - MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR, + MCESEV( PANIC, "Action required; unknown MCACOD", - SER + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) ), /* known AO MCACODs: */ - MASK( - MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, + MCESEV( AO, "Action optional: memory scrubbing error", - SER + SER, MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0) ), - MASK( - MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, + MCESEV( AO, "Action optional: last level cache writeback error", - SER + SER, MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a) ), - - MASK( - MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, + MCESEV( SOME, "Action optional unknown MCACOD", - SER + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) ), - MASK( - MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, + MCESEV( SOME, "Action optional with lost events", - SER + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER) ), - BITSET( - MCI_STATUS_UC|MCI_STATUS_OVER, - PANIC, "Overflowed uncorrected" + + MCESEV( + PANIC, "Overflowed uncorrected", + BITSET(MCI_STATUS_UC|MCI_STATUS_OVER) ), - BITSET( - MCI_STATUS_UC, - UC, "Uncorrected" + MCESEV( + UC, "Uncorrected", + BITSET(MCI_STATUS_UC) ), - BITSET( - 0, - SOME, "No match" + MCESEV( + SOME, "No match", + BITSET(0) ) /* always matches. keep at end */ }; -- cgit v1.1 From 7639bfc753f70321dbea83852e1cc47a45b681d7 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Jun 2011 10:55:05 +0900 Subject: x86, mce, severity: Clean up trivial coding style problems More specifically: - sort bits in the macros - use BITCLR/BITSET - coordinate message pattern - use m for struct mce - cleanup for severities_debugfs_init() No functional change. Signed-off-by: Hidetoshi Seto Acked-by: Tony Luck Link: http://lkml.kernel.org/r/4DEED679.9090503@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 39 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 27e778e..7395d5f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -99,41 +99,40 @@ static struct severity { ), MCESEV( KEEP, "Non signalled machine check", - SER, MASK(MCI_STATUS_S, 0) + SER, BITCLR(MCI_STATUS_S) ), /* AR add known MCACODs here */ MCESEV( PANIC, "Action required with lost events", - SER, - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR) + SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) ), MCESEV( - PANIC, "Action required; unknown MCACOD", + PANIC, "Action required: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) ), /* known AO MCACODs: */ MCESEV( AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0) + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0) ), MCESEV( AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a) + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a) ), MCESEV( - SOME, "Action optional unknown MCACOD", + SOME, "Action optional: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) ), MCESEV( SOME, "Action optional with lost events", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER) + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S) ), MCESEV( PANIC, "Overflowed uncorrected", - BITSET(MCI_STATUS_UC|MCI_STATUS_OVER) + BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) ), MCESEV( UC, "Uncorrected", @@ -157,15 +156,15 @@ static int error_context(struct mce *m) return IN_KERNEL; } -int mce_severity(struct mce *a, int tolerant, char **msg) +int mce_severity(struct mce *m, int tolerant, char **msg) { - enum context ctx = error_context(a); + enum context ctx = error_context(m); struct severity *s; for (s = severities;; s++) { - if ((a->status & s->mask) != s->result) + if ((m->status & s->mask) != s->result) continue; - if ((a->mcgstatus & s->mcgmask) != s->mcgres) + if ((m->mcgstatus & s->mcgmask) != s->mcgres) continue; if (s->ser == SER_REQUIRED && !mce_ser) continue; @@ -242,15 +241,15 @@ static const struct file_operations severities_coverage_fops = { static int __init severities_debugfs_init(void) { - struct dentry *dmce = NULL, *fseverities_coverage = NULL; + struct dentry *dmce, *fsev; dmce = mce_get_debugfs_dir(); - if (dmce == NULL) + if (!dmce) goto err_out; - fseverities_coverage = debugfs_create_file("severities-coverage", - 0444, dmce, NULL, - &severities_coverage_fops); - if (fseverities_coverage == NULL) + + fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL, + &severities_coverage_fops); + if (!fsev) goto err_out; return 0; @@ -259,4 +258,4 @@ err_out: return -ENOMEM; } late_initcall(severities_debugfs_init); -#endif +#endif /* CONFIG_DEBUG_FS */ -- cgit v1.1 From b77e70bf3535e0bd5472e0681f41cce4ae0598bb Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Jun 2011 10:56:02 +0900 Subject: x86, mce: Replace MCE_SELF_VECTOR by irq_work The MCE handler uses a special vector for self IPI to invoke post-emergency processing in an interrupt context, e.g. call an NMI-unsafe function, wakeup loggers, schedule time-consuming work for recovery, etc. This mechanism is now generalized by the following commit: > e360adbe29241a0194e10e20595360dd7b98a2b3 > Author: Peter Zijlstra > Date: Thu Oct 14 14:01:34 2010 +0800 > > irq_work: Add generic hardirq context callbacks > > Provide a mechanism that allows running code in IRQ context. It is > most useful for NMI code that needs to interact with the rest of the > system -- like wakeup a task to drain buffers. : So change to use provided generic mechanism. Signed-off-by: Hidetoshi Seto Acked-by: Tony Luck Link: http://lkml.kernel.org/r/4DEED6B2.6080005@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/entry_arch.h | 4 ---- arch/x86/include/asm/hw_irq.h | 1 - arch/x86/include/asm/irq_vectors.h | 5 ---- arch/x86/kernel/cpu/mcheck/mce.c | 47 +++++--------------------------------- arch/x86/kernel/entry_64.S | 5 ---- arch/x86/kernel/irqinit.c | 3 --- 6 files changed, 6 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 1cd6d26..0baa628 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -53,8 +53,4 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif -#ifdef CONFIG_X86_MCE -BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) -#endif - #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index bb9efe8..13f5504 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -34,7 +34,6 @@ extern void irq_work_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); extern void reschedule_interrupt(void); -extern void mce_self_interrupt(void); extern void invalidate_interrupt(void); extern void invalidate_interrupt0(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6e976ee..6665026 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -109,11 +109,6 @@ #define UV_BAU_MESSAGE 0xf5 -/* - * Self IPI vector for machine checks - */ -#define MCE_SELF_VECTOR 0xf4 - /* Xen vector callback to receive events in a HVM domain */ #define XEN_HVM_EVTCHN_CALLBACK 0xf3 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ff1ae9b..e81d48b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -38,12 +37,9 @@ #include #include #include +#include #include -#include -#include -#include -#include #include #include @@ -461,22 +457,13 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) m->ip = mce_rdmsrl(rip_msr); } -#ifdef CONFIG_X86_LOCAL_APIC -/* - * Called after interrupts have been reenabled again - * when a MCE happened during an interrupts off region - * in the kernel. - */ -asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) +DEFINE_PER_CPU(struct irq_work, mce_irq_work); + +static void mce_irq_work_cb(struct irq_work *entry) { - ack_APIC_irq(); - exit_idle(); - irq_enter(); mce_notify_irq(); mce_schedule_work(); - irq_exit(); } -#endif static void mce_report_event(struct pt_regs *regs) { @@ -492,29 +479,7 @@ static void mce_report_event(struct pt_regs *regs) return; } -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Without APIC do not notify. The event will be picked - * up eventually. - */ - if (!cpu_has_apic) - return; - - /* - * When interrupts are disabled we cannot use - * kernel services safely. Trigger an self interrupt - * through the APIC to instead do the notification - * after interrupts are reenabled again. - */ - apic->send_IPI_self(MCE_SELF_VECTOR); - - /* - * Wait for idle afterwards again so that we don't leave the - * APIC in a non idle state because the normal APIC writes - * cannot exclude us. - */ - apic_wait_icr_idle(); -#endif + irq_work_queue(&__get_cpu_var(mce_irq_work)); } DEFINE_PER_CPU(unsigned, mce_poll_count); @@ -1444,7 +1409,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); - + init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); } /* diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8a445a0..9fa6546 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -991,11 +991,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \ apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt -#ifdef CONFIG_X86_MCE -apicinterrupt MCE_SELF_VECTOR \ - mce_self_interrupt smp_mce_self_interrupt -#endif - #ifdef CONFIG_SMP apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ call_function_single_interrupt smp_call_function_single_interrupt diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f470e4e..f09d4bb 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -272,9 +272,6 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_MCE_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) - alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); -#endif #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ -- cgit v1.1 From 2b90e77eaee8809073db5cf43ac9795cc2054dc0 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Jun 2011 10:56:56 +0900 Subject: x86, mce: Replace MCM_ with MCI_MISC_ Follow other MCi register defines. Plus define MCI_MISC_ADDR_LSB() and MCI_MISC_ADDR_MODE(). Signed-off-by: Hidetoshi Seto Acked-by: Tony Luck Link: http://lkml.kernel.org/r/4DEED6E8.9090509@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/mce.h | 17 +++++++++++------ arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 021979a6..bbb328f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -8,6 +8,7 @@ * Machine Check support for x86 */ +/* MCG_CAP register defines */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ #define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ @@ -17,10 +18,12 @@ #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) #define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ +/* MCG_STATUS register defines */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ +/* MCi_STATUS register defines */ #define MCI_STATUS_VAL (1ULL<<63) /* valid error */ #define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */ #define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */ @@ -31,12 +34,14 @@ #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ -/* MISC register defines */ -#define MCM_ADDR_SEGOFF 0 /* segment offset */ -#define MCM_ADDR_LINEAR 1 /* linear address */ -#define MCM_ADDR_PHYS 2 /* physical address */ -#define MCM_ADDR_MEM 3 /* memory address */ -#define MCM_ADDR_GENERIC 7 /* generic */ +/* MCi_MISC register defines */ +#define MCI_MISC_ADDR_LSB(m) ((m) & 0x3f) +#define MCI_MISC_ADDR_MODE(m) (((m) >> 6) & 7) +#define MCI_MISC_ADDR_SEGOFF 0 /* segment offset */ +#define MCI_MISC_ADDR_LINEAR 1 /* linear address */ +#define MCI_MISC_ADDR_PHYS 2 /* physical address */ +#define MCI_MISC_ADDR_MEM 3 /* memory address */ +#define MCI_MISC_ADDR_GENERIC 7 /* generic */ /* CTL2 register defines */ #define MCI_CTL2_CMCI_EN (1ULL << 30) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e81d48b..e807508 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -844,9 +844,9 @@ static int mce_usable_address(struct mce *m) { if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) return 0; - if ((m->misc & 0x3f) > PAGE_SHIFT) + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) return 0; - if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) return 0; return 1; } -- cgit v1.1 From b8325c5b110d7ff460b79588e7e9afdcc73d5c3c Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Jun 2011 10:57:46 +0900 Subject: x86, mce: Introduce mce_gather_info() This patch introduces mce_gather_info() which is to be called at the beginning of error handling and gathers minimum error information from proper error registers (and saved registers). As the result of mce_get_rip() is integrated, unnecessary zeroing is removed. This also takes care of saving RIP which is required to make some decision about error severity for SRAR errors, instead of retrieving it later in the handler. Signed-off-by: Hidetoshi Seto Acked-by: Tony Luck Link: http://lkml.kernel.org/r/4DEED71A.1060906@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 50 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e807508..a182875 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -369,6 +369,31 @@ static void mce_wrmsrl(u32 msr, u64 v) } /* + * Collect all global (w.r.t. this processor) status about this machine + * check into our "mce" struct so that we can use it later to assess + * the severity of the problem as we read per-bank specific details. + */ +static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) +{ + mce_setup(m); + + m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (regs) { + /* + * Get the address of the instruction at the time of + * the machine check error. + */ + if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { + m->ip = regs->ip; + m->cs = regs->cs; + } + /* Use accurate RIP reporting if available. */ + if (rip_msr) + m->ip = mce_rdmsrl(rip_msr); + } +} + +/* * Simple lockless ring to communicate PFNs from the exception handler with the * process context work function. This is vastly simplified because there's * only a single reader and a single writer. @@ -439,24 +464,6 @@ static void mce_schedule_work(void) } } -/* - * Get the address of the instruction at the time of the machine check - * error. - */ -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ - - if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { - m->ip = regs->ip; - m->cs = regs->cs; - } else { - m->ip = 0; - m->cs = 0; - } - if (rip_msr) - m->ip = mce_rdmsrl(rip_msr); -} - DEFINE_PER_CPU(struct irq_work, mce_irq_work); static void mce_irq_work_cb(struct irq_work *entry) @@ -506,9 +513,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) percpu_inc(mce_poll_count); - mce_setup(&m); + mce_gather_info(&m, NULL); - m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -907,9 +913,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (!banks) goto out; - mce_setup(&m); + mce_gather_info(&m, regs); - m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); final = &__get_cpu_var(mces_seen); *final = m; @@ -993,7 +998,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) mce_ring_add(m.addr >> PAGE_SHIFT); - mce_get_rip(&m, regs); mce_log(&m); if (severity > worst) { -- cgit v1.1 From 3a97fc34130326da87b20de5d0259c35406707ce Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Jun 2011 10:58:35 +0900 Subject: x86, mce: Check the result of ancient_init() Because "ancient CPUs" like p5 and winchip don't have X86_FEATURE_MCA (I suppose so), mcheck_cpu_init() on such CPUs will return at check of mce_available() after __mcheck_cpu_ancient_init(). It is hard to know this implicit behavior without knowing the CPUs well. So make it clear that we leave mcheck_cpu_init() when the CPU is initialized in __mcheck_cpu_ancient_init(). Signed-off-by: Hidetoshi Seto Acked-by: Tony Luck Link: http://lkml.kernel.org/r/4DEED74B.20502@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a182875..2cc98d5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1332,18 +1332,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) return 0; } -static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) - return; + return 0; + switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); + return 1; break; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); + return 1; break; } + + return 0; } static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) @@ -1397,7 +1402,8 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) if (mce_disabled) return; - __mcheck_cpu_ancient_init(c); + if (__mcheck_cpu_ancient_init(c)) + return; if (!mce_available(c)) return; -- cgit v1.1 From f6783c4234e65bd6f85596d97745ccdbf041cb63 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Jun 2011 10:59:19 +0900 Subject: x86, mce: Cleanup mce_create()/remove_device() Use temporary local variable sysdev to simplify the code. No change in logic. Signed-off-by: Hidetoshi Seto Acked-by: Tony Luck Link: http://lkml.kernel.org/r/4DEED777.7080205@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2cc98d5..f3f648c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1925,28 +1925,28 @@ static cpumask_var_t mce_dev_initialized; /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ static __cpuinit int mce_create_device(unsigned int cpu) { + struct sys_device *sysdev = &per_cpu(mce_dev, cpu); int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(mce_dev, cpu).id = cpu; - per_cpu(mce_dev, cpu).cls = &mce_sysclass; + memset(&sysdev->kobj, 0, sizeof(struct kobject)); + sysdev->id = cpu; + sysdev->cls = &mce_sysclass; - err = sysdev_register(&per_cpu(mce_dev, cpu)); + err = sysdev_register(sysdev); if (err) return err; for (i = 0; mce_attrs[i]; i++) { - err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + err = sysdev_create_file(sysdev, mce_attrs[i]); if (err) goto error; } for (j = 0; j < banks; j++) { - err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &mce_banks[j].attr); + err = sysdev_create_file(sysdev, &mce_banks[j].attr); if (err) goto error2; } @@ -1955,30 +1955,31 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); + sysdev_remove_file(sysdev, &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + sysdev_remove_file(sysdev, mce_attrs[i]); - sysdev_unregister(&per_cpu(mce_dev, cpu)); + sysdev_unregister(sysdev); return err; } static __cpuinit void mce_remove_device(unsigned int cpu) { + struct sys_device *sysdev = &per_cpu(mce_dev, cpu); int i; if (!cpumask_test_cpu(cpu, mce_dev_initialized)) return; for (i = 0; mce_attrs[i]; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + sysdev_remove_file(sysdev, mce_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); + sysdev_remove_file(sysdev, &mce_banks[i].attr); - sysdev_unregister(&per_cpu(mce_dev, cpu)); + sysdev_unregister(sysdev); cpumask_clear_cpu(cpu, mce_dev_initialized); } -- cgit v1.1 From 559faa6be143b8aa7a07b12f618d29fbc1c8eb0d Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Jun 2011 11:00:08 +0900 Subject: x86, mce: Cleanup mce_read() Use a temporary local variable m to simplify the code. No change in logic. Signed-off-by: Hidetoshi Seto Acked-by: Tony Luck Link: http://lkml.kernel.org/r/4DEED7A8.8020307@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f3f648c..54360e8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1537,19 +1537,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, do { for (i = prev; i < next; i++) { unsigned long start = jiffies; + struct mce *m = &mcelog.entry[i]; - while (!mcelog.entry[i].finished) { + while (!m->finished) { if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i, 0, - sizeof(struct mce)); + memset(m, 0, sizeof(*m)); goto timeout; } cpu_relax(); } smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, - sizeof(struct mce)); - buf += sizeof(struct mce); + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); timeout: ; } @@ -1569,13 +1568,13 @@ timeout: on_each_cpu(collect_tscs, cpu_tsc, 1); for (i = next; i < MCE_LOG_LEN; i++) { - if (mcelog.entry[i].finished && - mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { - err |= copy_to_user(buf, mcelog.entry+i, - sizeof(struct mce)); + struct mce *m = &mcelog.entry[i]; + + if (m->finished && m->tsc < cpu_tsc[m->cpu]) { + err |= copy_to_user(buf, m, sizeof(*m)); smp_rmb(); - buf += sizeof(struct mce); - memset(&mcelog.entry[i], 0, sizeof(struct mce)); + buf += sizeof(*m); + memset(m, 0, sizeof(*m)); } } -- cgit v1.1 From 93b62c3cf59d44850cbe9f04d58da08930e3fb0b Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Jun 2011 11:00:45 +0900 Subject: x86, mce: Use mce_chrdev_ prefix to group functions There are many functions named mce_* so use a new prefix for the subset of functions dealing with the character device /dev/mcelog. This change doesn't impact the mce-inject module because the exported symbol mce_chrdev_ops already has the prefix, therefore it is left unchanged. Before: After: mce_wait mce_chrdev_wait mce_state_lock mce_chrdev_state_lock open_count mce_chrdev_open_count open_exclu mce_chrdev_open_exclu mce_open mce_chrdev_open mce_release mce_chrdev_release mce_read_mutex mce_chrdev_read_mutex mce_read mce_chrdev_read mce_poll mce_chrdev_poll mce_ioctl mce_chrdev_ioctl mce_log_device mce_chrdev_device Signed-off-by: Hidetoshi Seto Acked-by: Tony Luck Link: http://lkml.kernel.org/r/4DEED7CD.3040500@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 77 +++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 54360e8..9a73516 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -45,12 +45,12 @@ #include "mce-internal.h" -static DEFINE_MUTEX(mce_read_mutex); +static DEFINE_MUTEX(mce_chrdev_read_mutex); #define rcu_dereference_check_mce(p) \ rcu_dereference_index_check((p), \ rcu_read_lock_sched_held() || \ - lockdep_is_held(&mce_read_mutex)) + lockdep_is_held(&mce_chrdev_read_mutex)) #define CREATE_TRACE_POINTS #include @@ -90,7 +90,8 @@ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; @@ -1159,7 +1160,8 @@ int mce_notify_irq(void) clear_thread_flag(TIF_MCE_NOTIFY); if (test_and_clear_bit(0, &mce_need_notify)) { - wake_up_interruptible(&mce_wait); + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); /* * There is no risk of missing notifications because @@ -1423,40 +1425,41 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) } /* - * Character device to read and clear the MCE log. + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. */ -static DEFINE_SPINLOCK(mce_state_lock); -static int open_count; /* #times opened */ -static int open_exclu; /* already open exclusive? */ +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? */ -static int mce_open(struct inode *inode, struct file *file) +static int mce_chrdev_open(struct inode *inode, struct file *file) { - spin_lock(&mce_state_lock); + spin_lock(&mce_chrdev_state_lock); - if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_state_lock); + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); return -EBUSY; } if (file->f_flags & O_EXCL) - open_exclu = 1; - open_count++; + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; - spin_unlock(&mce_state_lock); + spin_unlock(&mce_chrdev_state_lock); return nonseekable_open(inode, file); } -static int mce_release(struct inode *inode, struct file *file) +static int mce_chrdev_release(struct inode *inode, struct file *file) { - spin_lock(&mce_state_lock); + spin_lock(&mce_chrdev_state_lock); - open_count--; - open_exclu = 0; + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; - spin_unlock(&mce_state_lock); + spin_unlock(&mce_chrdev_state_lock); return 0; } @@ -1505,8 +1508,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize) return 0; } -static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, - loff_t *off) +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) { char __user *buf = ubuf; unsigned long *cpu_tsc; @@ -1517,7 +1520,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, if (!cpu_tsc) return -ENOMEM; - mutex_lock(&mce_read_mutex); + mutex_lock(&mce_chrdev_read_mutex); if (!mce_apei_read_done) { err = __mce_read_apei(&buf, usize); @@ -1582,15 +1585,15 @@ timeout: err = -EFAULT; out: - mutex_unlock(&mce_read_mutex); + mutex_unlock(&mce_chrdev_read_mutex); kfree(cpu_tsc); return err ? err : buf - ubuf; } -static unsigned int mce_poll(struct file *file, poll_table *wait) +static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) { - poll_wait(file, &mce_wait, wait); + poll_wait(file, &mce_chrdev_wait, wait); if (rcu_access_index(mcelog.next)) return POLLIN | POLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) @@ -1598,7 +1601,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait) return 0; } -static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) { int __user *p = (int __user *)arg; @@ -1626,16 +1630,16 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) /* Modified in mce-inject.c, so not static or const */ struct file_operations mce_chrdev_ops = { - .open = mce_open, - .release = mce_release, - .read = mce_read, - .poll = mce_poll, - .unlocked_ioctl = mce_ioctl, - .llseek = no_llseek, + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .llseek = no_llseek, }; EXPORT_SYMBOL_GPL(mce_chrdev_ops); -static struct miscdevice mce_log_device = { +static struct miscdevice mce_chrdev_device = { MISC_MCELOG_MINOR, "mcelog", &mce_chrdev_ops, @@ -2107,11 +2111,12 @@ static __init int mcheck_init_device(void) register_syscore_ops(&mce_syscore_ops); register_hotcpu_notifier(&mce_cpu_notifier); - misc_register(&mce_log_device); + + /* register character device /dev/mcelog */ + misc_register(&mce_chrdev_device); return err; } - device_initcall(mcheck_init_device); /* -- cgit v1.1 From c7cece89f1b00b56276303942f96ec67cf206e1e Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 8 Jun 2011 11:02:03 +0900 Subject: x86, mce: Use mce_sysdev_ prefix to group functions There are many functions named mce_* so use a new prefix for the subset of functions related to sysfs support. And since f3c6ea1b06c71b43f751b36bd99345369fe911af introduces syscore_ops, use the prefix mce_syscore for some functions related to power management which were in sysdev_class before. Before: After: mce_device mce_sysdev mce_sysclass mce_sysdev_class mce_attrs mce_sysdev_attrs mce_dev_initialized mce_sysdev_initialized mce_create_device mce_sysdev_create mce_remove_device mce_sysdev_remove mce_suspend mce_syscore_suspend mce_shutdown mce_syscore_shutdown mce_resume mce_syscore_resume Signed-off-by: Hidetoshi Seto Acked-by: Tony Luck Link: http://lkml.kernel.org/r/4DEED81B.8020506@jp.fujitsu.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 62 +++++++++++++++++++----------------- arch/x86/kernel/cpu/mcheck/mce_amd.c | 10 +++--- 3 files changed, 39 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index bbb328f..716b48a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -149,7 +149,7 @@ static inline void enable_p5_mce(void) {} void mce_setup(struct mce *m); void mce_log(struct mce *m); -DECLARE_PER_CPU(struct sys_device, mce_dev); +DECLARE_PER_CPU(struct sys_device, mce_sysdev); /* * Maximum banks number. diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9a73516..08363b0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1697,7 +1697,7 @@ int __init mcheck_init(void) } /* - * Sysfs support + * mce_syscore: PM support */ /* @@ -1717,12 +1717,12 @@ static int mce_disable_error_reporting(void) return 0; } -static int mce_suspend(void) +static int mce_syscore_suspend(void) { return mce_disable_error_reporting(); } -static void mce_shutdown(void) +static void mce_syscore_shutdown(void) { mce_disable_error_reporting(); } @@ -1732,18 +1732,22 @@ static void mce_shutdown(void) * Only one CPU is active at this time, the others get re-added later using * CPU hotplug: */ -static void mce_resume(void) +static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); } static struct syscore_ops mce_syscore_ops = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, + .suspend = mce_syscore_suspend, + .shutdown = mce_syscore_shutdown, + .resume = mce_syscore_resume, }; +/* + * mce_sysdev: Sysfs support + */ + static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); @@ -1779,11 +1783,11 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct sysdev_class mce_sysclass = { +static struct sysdev_class mce_sysdev_class = { .name = "machinecheck", }; -DEFINE_PER_CPU(struct sys_device, mce_dev); +DEFINE_PER_CPU(struct sys_device, mce_sysdev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -1912,7 +1916,7 @@ static struct sysdev_ext_attribute attr_cmci_disabled = { &mce_cmci_disabled }; -static struct sysdev_attribute *mce_attrs[] = { +static struct sysdev_attribute *mce_sysdev_attrs[] = { &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, @@ -1923,12 +1927,12 @@ static struct sysdev_attribute *mce_attrs[] = { NULL }; -static cpumask_var_t mce_dev_initialized; +static cpumask_var_t mce_sysdev_initialized; /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_create_device(unsigned int cpu) +static __cpuinit int mce_sysdev_create(unsigned int cpu) { - struct sys_device *sysdev = &per_cpu(mce_dev, cpu); + struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); int err; int i, j; @@ -1937,14 +1941,14 @@ static __cpuinit int mce_create_device(unsigned int cpu) memset(&sysdev->kobj, 0, sizeof(struct kobject)); sysdev->id = cpu; - sysdev->cls = &mce_sysclass; + sysdev->cls = &mce_sysdev_class; err = sysdev_register(sysdev); if (err) return err; - for (i = 0; mce_attrs[i]; i++) { - err = sysdev_create_file(sysdev, mce_attrs[i]); + for (i = 0; mce_sysdev_attrs[i]; i++) { + err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); if (err) goto error; } @@ -1953,7 +1957,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error2; } - cpumask_set_cpu(cpu, mce_dev_initialized); + cpumask_set_cpu(cpu, mce_sysdev_initialized); return 0; error2: @@ -1961,29 +1965,29 @@ error2: sysdev_remove_file(sysdev, &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(sysdev, mce_attrs[i]); + sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); sysdev_unregister(sysdev); return err; } -static __cpuinit void mce_remove_device(unsigned int cpu) +static __cpuinit void mce_sysdev_remove(unsigned int cpu) { - struct sys_device *sysdev = &per_cpu(mce_dev, cpu); + struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); int i; - if (!cpumask_test_cpu(cpu, mce_dev_initialized)) + if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) return; - for (i = 0; mce_attrs[i]; i++) - sysdev_remove_file(sysdev, mce_attrs[i]); + for (i = 0; mce_sysdev_attrs[i]; i++) + sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); for (i = 0; i < banks; i++) sysdev_remove_file(sysdev, &mce_banks[i].attr); sysdev_unregister(sysdev); - cpumask_clear_cpu(cpu, mce_dev_initialized); + cpumask_clear_cpu(cpu, mce_sysdev_initialized); } /* Make sure there are no machine checks on offlined CPUs. */ @@ -2033,7 +2037,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - mce_create_device(cpu); + mce_sysdev_create(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; @@ -2041,7 +2045,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); - mce_remove_device(cpu); + mce_sysdev_remove(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -2095,16 +2099,16 @@ static __init int mcheck_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); mce_init_banks(); - err = sysdev_class_register(&mce_sysclass); + err = sysdev_class_register(&mce_sysdev_class); if (err) return err; for_each_online_cpu(i) { - err = mce_create_device(i); + err = mce_sysdev_create(i); if (err) return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index bb0adad..f547421 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -548,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, + err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj, b->kobj, name); if (err) goto out; @@ -571,7 +571,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); + b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj); if (!b->kobj) goto out_free; @@ -591,7 +591,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, + err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj, b->kobj, name); if (err) goto out; @@ -669,7 +669,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); + sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -681,7 +681,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); + sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } -- cgit v1.1 From 40b7f3dfcc5ab211a0b8d916751bb22ac2290806 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 15 Jun 2011 15:34:57 +0200 Subject: x86, microcode, AMD: Fix section header size check The ucode size check has to take the section header size into account too when sanity checking the section length. Shorten and clarify define names, while at it. Caught-by: Ben Hutchings Link: http://lkml.kernel.org/r/1302752223.5282.674.camel@localhost Signed-off-by: Borislav Petkov --- arch/x86/kernel/microcode_amd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index d30d67c..591be0e 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -66,8 +66,8 @@ struct microcode_amd { unsigned int mpb[0]; }; -#define UCODE_CONTAINER_SECTION_HDR 8 -#define UCODE_CONTAINER_HEADER_SIZE 12 +#define SECTION_HDR_SIZE 8 +#define CONTAINER_HDR_SZ 12 static struct equiv_cpu_entry *equiv_cpu_table; @@ -177,7 +177,7 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) actual_size = *(u32 *)(buf + 4); - if (actual_size > size || actual_size > max_size) { + if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) { pr_err("section size mismatch\n"); return 0; } @@ -204,8 +204,8 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) if (!mc) goto out; - get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size); - *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR; + get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size); + *mc_size = actual_size + SECTION_HDR_SIZE; out: return mc; @@ -229,9 +229,10 @@ static int install_equiv_cpu_table(const u8 *buf) return -ENOMEM; } - get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size); + get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); - return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ + /* add header length */ + return size + CONTAINER_HDR_SZ; } static void free_equiv_cpu_table(void) -- cgit v1.1 From e44ba033c5654dbfda53461c9b1f7dd9bd1d198f Mon Sep 17 00:00:00 2001 From: Vitaliy Ivanov Date: Mon, 20 Jun 2011 16:08:07 +0200 Subject: treewide: remove duplicate includes Many stupid corrections of duplicated includes based on the output of scripts/checkincludes.pl. Signed-off-by: Vitaliy Ivanov Signed-off-by: Jiri Kosina --- arch/x86/kvm/mmu.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bd14bb4..9b9f012 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -22,7 +22,6 @@ #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" -#include "x86.h" #include #include -- cgit v1.1 From 29b68415e335ba9e0eb6057f9405aa4d9c23efe4 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Sun, 5 Jun 2011 18:22:18 +0300 Subject: x86: amd_iommu: move to drivers/iommu/ This should ease finding similarities with different platforms, with the intention of solving problems once in a generic framework which everyone can use. Compile-tested on x86_64. Signed-off-by: Ohad Ben-Cohen Signed-off-by: Joerg Roedel --- arch/x86/Kconfig | 28 - arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/amd_iommu.c | 2764 ------------------------------------------- 3 files changed, 1 insertion(+), 2793 deletions(-) delete mode 100644 arch/x86/kernel/amd_iommu.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 460d573..1b6a2e2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -680,34 +680,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT Calgary anyway, pass 'iommu=calgary' on the kernel command line. If unsure, say Y. -config AMD_IOMMU - bool "AMD IOMMU support" - select SWIOTLB - select PCI_MSI - select PCI_IOV - select IOMMU_API - depends on X86_64 && PCI && ACPI - ---help--- - With this option you can enable support for AMD IOMMU hardware in - your system. An IOMMU is a hardware component which provides - remapping of DMA memory accesses from devices. With an AMD IOMMU you - can isolate the the DMA memory of different devices and protect the - system from misbehaving device drivers or hardware. - - You can find out if your system has an AMD IOMMU if you look into - your BIOS for an option to enable it or if you have an IVRS ACPI - table. - -config AMD_IOMMU_STATS - bool "Export AMD IOMMU statistics to debugfs" - depends on AMD_IOMMU - select DEBUG_FS - ---help--- - This option enables code in the AMD IOMMU driver to collect various - statistics about whats happening in the driver and exports that - information to userspace via debugfs. - If unsure, say N. - # need this always selected by IOMMU for the VIA workaround config SWIOTLB def_bool y if X86_64 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 90b06d4..aef0298 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -123,7 +123,7 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o - obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o + obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c deleted file mode 100644 index 7c3a95e..0000000 --- a/arch/x86/kernel/amd_iommu.c +++ /dev/null @@ -1,2764 +0,0 @@ -/* - * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. - * Author: Joerg Roedel - * Leo Duran - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) - -#define LOOP_TIMEOUT 100000 - -static DEFINE_RWLOCK(amd_iommu_devtable_lock); - -/* A list of preallocated protection domains */ -static LIST_HEAD(iommu_pd_list); -static DEFINE_SPINLOCK(iommu_pd_list_lock); - -/* - * Domain for untranslated devices - only allocated - * if iommu=pt passed on kernel cmd line. - */ -static struct protection_domain *pt_domain; - -static struct iommu_ops amd_iommu_ops; - -/* - * general struct to manage commands send to an IOMMU - */ -struct iommu_cmd { - u32 data[4]; -}; - -static void update_domain(struct protection_domain *domain); - -/**************************************************************************** - * - * Helper functions - * - ****************************************************************************/ - -static inline u16 get_device_id(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return calc_devid(pdev->bus->number, pdev->devfn); -} - -static struct iommu_dev_data *get_dev_data(struct device *dev) -{ - return dev->archdata.iommu; -} - -/* - * In this function the list of preallocated protection domains is traversed to - * find the domain for a specific device - */ -static struct dma_ops_domain *find_protection_domain(u16 devid) -{ - struct dma_ops_domain *entry, *ret = NULL; - unsigned long flags; - u16 alias = amd_iommu_alias_table[devid]; - - if (list_empty(&iommu_pd_list)) - return NULL; - - spin_lock_irqsave(&iommu_pd_list_lock, flags); - - list_for_each_entry(entry, &iommu_pd_list, list) { - if (entry->target_dev == devid || - entry->target_dev == alias) { - ret = entry; - break; - } - } - - spin_unlock_irqrestore(&iommu_pd_list_lock, flags); - - return ret; -} - -/* - * This function checks if the driver got a valid device from the caller to - * avoid dereferencing invalid pointers. - */ -static bool check_device(struct device *dev) -{ - u16 devid; - - if (!dev || !dev->dma_mask) - return false; - - /* No device or no PCI device */ - if (dev->bus != &pci_bus_type) - return false; - - devid = get_device_id(dev); - - /* Out of our scope? */ - if (devid > amd_iommu_last_bdf) - return false; - - if (amd_iommu_rlookup_table[devid] == NULL) - return false; - - return true; -} - -static int iommu_init_device(struct device *dev) -{ - struct iommu_dev_data *dev_data; - struct pci_dev *pdev; - u16 devid, alias; - - if (dev->archdata.iommu) - return 0; - - dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); - if (!dev_data) - return -ENOMEM; - - dev_data->dev = dev; - - devid = get_device_id(dev); - alias = amd_iommu_alias_table[devid]; - pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); - if (pdev) - dev_data->alias = &pdev->dev; - else { - kfree(dev_data); - return -ENOTSUPP; - } - - atomic_set(&dev_data->bind, 0); - - dev->archdata.iommu = dev_data; - - - return 0; -} - -static void iommu_ignore_device(struct device *dev) -{ - u16 devid, alias; - - devid = get_device_id(dev); - alias = amd_iommu_alias_table[devid]; - - memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry)); - memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry)); - - amd_iommu_rlookup_table[devid] = NULL; - amd_iommu_rlookup_table[alias] = NULL; -} - -static void iommu_uninit_device(struct device *dev) -{ - kfree(dev->archdata.iommu); -} - -void __init amd_iommu_uninit_devices(void) -{ - struct pci_dev *pdev = NULL; - - for_each_pci_dev(pdev) { - - if (!check_device(&pdev->dev)) - continue; - - iommu_uninit_device(&pdev->dev); - } -} - -int __init amd_iommu_init_devices(void) -{ - struct pci_dev *pdev = NULL; - int ret = 0; - - for_each_pci_dev(pdev) { - - if (!check_device(&pdev->dev)) - continue; - - ret = iommu_init_device(&pdev->dev); - if (ret == -ENOTSUPP) - iommu_ignore_device(&pdev->dev); - else if (ret) - goto out_free; - } - - return 0; - -out_free: - - amd_iommu_uninit_devices(); - - return ret; -} -#ifdef CONFIG_AMD_IOMMU_STATS - -/* - * Initialization code for statistics collection - */ - -DECLARE_STATS_COUNTER(compl_wait); -DECLARE_STATS_COUNTER(cnt_map_single); -DECLARE_STATS_COUNTER(cnt_unmap_single); -DECLARE_STATS_COUNTER(cnt_map_sg); -DECLARE_STATS_COUNTER(cnt_unmap_sg); -DECLARE_STATS_COUNTER(cnt_alloc_coherent); -DECLARE_STATS_COUNTER(cnt_free_coherent); -DECLARE_STATS_COUNTER(cross_page); -DECLARE_STATS_COUNTER(domain_flush_single); -DECLARE_STATS_COUNTER(domain_flush_all); -DECLARE_STATS_COUNTER(alloced_io_mem); -DECLARE_STATS_COUNTER(total_map_requests); - -static struct dentry *stats_dir; -static struct dentry *de_fflush; - -static void amd_iommu_stats_add(struct __iommu_counter *cnt) -{ - if (stats_dir == NULL) - return; - - cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir, - &cnt->value); -} - -static void amd_iommu_stats_init(void) -{ - stats_dir = debugfs_create_dir("amd-iommu", NULL); - if (stats_dir == NULL) - return; - - de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, - (u32 *)&amd_iommu_unmap_flush); - - amd_iommu_stats_add(&compl_wait); - amd_iommu_stats_add(&cnt_map_single); - amd_iommu_stats_add(&cnt_unmap_single); - amd_iommu_stats_add(&cnt_map_sg); - amd_iommu_stats_add(&cnt_unmap_sg); - amd_iommu_stats_add(&cnt_alloc_coherent); - amd_iommu_stats_add(&cnt_free_coherent); - amd_iommu_stats_add(&cross_page); - amd_iommu_stats_add(&domain_flush_single); - amd_iommu_stats_add(&domain_flush_all); - amd_iommu_stats_add(&alloced_io_mem); - amd_iommu_stats_add(&total_map_requests); -} - -#endif - -/**************************************************************************** - * - * Interrupt handling functions - * - ****************************************************************************/ - -static void dump_dte_entry(u16 devid) -{ - int i; - - for (i = 0; i < 8; ++i) - pr_err("AMD-Vi: DTE[%d]: %08x\n", i, - amd_iommu_dev_table[devid].data[i]); -} - -static void dump_command(unsigned long phys_addr) -{ - struct iommu_cmd *cmd = phys_to_virt(phys_addr); - int i; - - for (i = 0; i < 4; ++i) - pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]); -} - -static void iommu_print_event(struct amd_iommu *iommu, void *__evt) -{ - u32 *event = __evt; - int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; - int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; - int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK; - int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; - u64 address = (u64)(((u64)event[3]) << 32) | event[2]; - - printk(KERN_ERR "AMD-Vi: Event logged ["); - - switch (type) { - case EVENT_TYPE_ILL_DEV: - printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x " - "address=0x%016llx flags=0x%04x]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), - address, flags); - dump_dte_entry(devid); - break; - case EVENT_TYPE_IO_FAULT: - printk("IO_PAGE_FAULT device=%02x:%02x.%x " - "domain=0x%04x address=0x%016llx flags=0x%04x]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), - domid, address, flags); - break; - case EVENT_TYPE_DEV_TAB_ERR: - printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x " - "address=0x%016llx flags=0x%04x]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), - address, flags); - break; - case EVENT_TYPE_PAGE_TAB_ERR: - printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x " - "domain=0x%04x address=0x%016llx flags=0x%04x]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), - domid, address, flags); - break; - case EVENT_TYPE_ILL_CMD: - printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); - dump_command(address); - break; - case EVENT_TYPE_CMD_HARD_ERR: - printk("COMMAND_HARDWARE_ERROR address=0x%016llx " - "flags=0x%04x]\n", address, flags); - break; - case EVENT_TYPE_IOTLB_INV_TO: - printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x " - "address=0x%016llx]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), - address); - break; - case EVENT_TYPE_INV_DEV_REQ: - printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x " - "address=0x%016llx flags=0x%04x]\n", - PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), - address, flags); - break; - default: - printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type); - } -} - -static void iommu_poll_events(struct amd_iommu *iommu) -{ - u32 head, tail; - unsigned long flags; - - spin_lock_irqsave(&iommu->lock, flags); - - head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); - tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); - - while (head != tail) { - iommu_print_event(iommu, iommu->evt_buf + head); - head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; - } - - writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); - - spin_unlock_irqrestore(&iommu->lock, flags); -} - -irqreturn_t amd_iommu_int_thread(int irq, void *data) -{ - struct amd_iommu *iommu; - - for_each_iommu(iommu) - iommu_poll_events(iommu); - - return IRQ_HANDLED; -} - -irqreturn_t amd_iommu_int_handler(int irq, void *data) -{ - return IRQ_WAKE_THREAD; -} - -/**************************************************************************** - * - * IOMMU command queuing functions - * - ****************************************************************************/ - -static int wait_on_sem(volatile u64 *sem) -{ - int i = 0; - - while (*sem == 0 && i < LOOP_TIMEOUT) { - udelay(1); - i += 1; - } - - if (i == LOOP_TIMEOUT) { - pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); - return -EIO; - } - - return 0; -} - -static void copy_cmd_to_buffer(struct amd_iommu *iommu, - struct iommu_cmd *cmd, - u32 tail) -{ - u8 *target; - - target = iommu->cmd_buf + tail; - tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; - - /* Copy command to buffer */ - memcpy(target, cmd, sizeof(*cmd)); - - /* Tell the IOMMU about it */ - writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); -} - -static void build_completion_wait(struct iommu_cmd *cmd, u64 address) -{ - WARN_ON(address & 0x7ULL); - - memset(cmd, 0, sizeof(*cmd)); - cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK; - cmd->data[1] = upper_32_bits(__pa(address)); - cmd->data[2] = 1; - CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); -} - -static void build_inv_dte(struct iommu_cmd *cmd, u16 devid) -{ - memset(cmd, 0, sizeof(*cmd)); - cmd->data[0] = devid; - CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY); -} - -static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, - size_t size, u16 domid, int pde) -{ - u64 pages; - int s; - - pages = iommu_num_pages(address, size, PAGE_SIZE); - s = 0; - - if (pages > 1) { - /* - * If we have to flush more than one page, flush all - * TLB entries for this domain - */ - address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - s = 1; - } - - address &= PAGE_MASK; - - memset(cmd, 0, sizeof(*cmd)); - cmd->data[1] |= domid; - cmd->data[2] = lower_32_bits(address); - cmd->data[3] = upper_32_bits(address); - CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); - if (s) /* size bit - we flush more than one 4kb page */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; -} - -static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, - u64 address, size_t size) -{ - u64 pages; - int s; - - pages = iommu_num_pages(address, size, PAGE_SIZE); - s = 0; - - if (pages > 1) { - /* - * If we have to flush more than one page, flush all - * TLB entries for this domain - */ - address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - s = 1; - } - - address &= PAGE_MASK; - - memset(cmd, 0, sizeof(*cmd)); - cmd->data[0] = devid; - cmd->data[0] |= (qdep & 0xff) << 24; - cmd->data[1] = devid; - cmd->data[2] = lower_32_bits(address); - cmd->data[3] = upper_32_bits(address); - CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); - if (s) - cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; -} - -static void build_inv_all(struct iommu_cmd *cmd) -{ - memset(cmd, 0, sizeof(*cmd)); - CMD_SET_TYPE(cmd, CMD_INV_ALL); -} - -/* - * Writes the command to the IOMMUs command buffer and informs the - * hardware about the new command. - */ -static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) -{ - u32 left, tail, head, next_tail; - unsigned long flags; - - WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); - -again: - spin_lock_irqsave(&iommu->lock, flags); - - head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); - tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; - left = (head - next_tail) % iommu->cmd_buf_size; - - if (left <= 2) { - struct iommu_cmd sync_cmd; - volatile u64 sem = 0; - int ret; - - build_completion_wait(&sync_cmd, (u64)&sem); - copy_cmd_to_buffer(iommu, &sync_cmd, tail); - - spin_unlock_irqrestore(&iommu->lock, flags); - - if ((ret = wait_on_sem(&sem)) != 0) - return ret; - - goto again; - } - - copy_cmd_to_buffer(iommu, cmd, tail); - - /* We need to sync now to make sure all commands are processed */ - iommu->need_sync = true; - - spin_unlock_irqrestore(&iommu->lock, flags); - - return 0; -} - -/* - * This function queues a completion wait command into the command - * buffer of an IOMMU - */ -static int iommu_completion_wait(struct amd_iommu *iommu) -{ - struct iommu_cmd cmd; - volatile u64 sem = 0; - int ret; - - if (!iommu->need_sync) - return 0; - - build_completion_wait(&cmd, (u64)&sem); - - ret = iommu_queue_command(iommu, &cmd); - if (ret) - return ret; - - return wait_on_sem(&sem); -} - -static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) -{ - struct iommu_cmd cmd; - - build_inv_dte(&cmd, devid); - - return iommu_queue_command(iommu, &cmd); -} - -static void iommu_flush_dte_all(struct amd_iommu *iommu) -{ - u32 devid; - - for (devid = 0; devid <= 0xffff; ++devid) - iommu_flush_dte(iommu, devid); - - iommu_completion_wait(iommu); -} - -/* - * This function uses heavy locking and may disable irqs for some time. But - * this is no issue because it is only called during resume. - */ -static void iommu_flush_tlb_all(struct amd_iommu *iommu) -{ - u32 dom_id; - - for (dom_id = 0; dom_id <= 0xffff; ++dom_id) { - struct iommu_cmd cmd; - build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - dom_id, 1); - iommu_queue_command(iommu, &cmd); - } - - iommu_completion_wait(iommu); -} - -static void iommu_flush_all(struct amd_iommu *iommu) -{ - struct iommu_cmd cmd; - - build_inv_all(&cmd); - - iommu_queue_command(iommu, &cmd); - iommu_completion_wait(iommu); -} - -void iommu_flush_all_caches(struct amd_iommu *iommu) -{ - if (iommu_feature(iommu, FEATURE_IA)) { - iommu_flush_all(iommu); - } else { - iommu_flush_dte_all(iommu); - iommu_flush_tlb_all(iommu); - } -} - -/* - * Command send function for flushing on-device TLB - */ -static int device_flush_iotlb(struct device *dev, u64 address, size_t size) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct amd_iommu *iommu; - struct iommu_cmd cmd; - u16 devid; - int qdep; - - qdep = pci_ats_queue_depth(pdev); - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; - - build_inv_iotlb_pages(&cmd, devid, qdep, address, size); - - return iommu_queue_command(iommu, &cmd); -} - -/* - * Command send function for invalidating a device table entry - */ -static int device_flush_dte(struct device *dev) -{ - struct amd_iommu *iommu; - struct pci_dev *pdev; - u16 devid; - int ret; - - pdev = to_pci_dev(dev); - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; - - ret = iommu_flush_dte(iommu, devid); - if (ret) - return ret; - - if (pci_ats_enabled(pdev)) - ret = device_flush_iotlb(dev, 0, ~0UL); - - return ret; -} - -/* - * TLB invalidation function which is called from the mapping functions. - * It invalidates a single PTE if the range to flush is within a single - * page. Otherwise it flushes the whole TLB of the IOMMU. - */ -static void __domain_flush_pages(struct protection_domain *domain, - u64 address, size_t size, int pde) -{ - struct iommu_dev_data *dev_data; - struct iommu_cmd cmd; - int ret = 0, i; - - build_inv_iommu_pages(&cmd, address, size, domain->id, pde); - - for (i = 0; i < amd_iommus_present; ++i) { - if (!domain->dev_iommu[i]) - continue; - - /* - * Devices of this domain are behind this IOMMU - * We need a TLB flush - */ - ret |= iommu_queue_command(amd_iommus[i], &cmd); - } - - list_for_each_entry(dev_data, &domain->dev_list, list) { - struct pci_dev *pdev = to_pci_dev(dev_data->dev); - - if (!pci_ats_enabled(pdev)) - continue; - - ret |= device_flush_iotlb(dev_data->dev, address, size); - } - - WARN_ON(ret); -} - -static void domain_flush_pages(struct protection_domain *domain, - u64 address, size_t size) -{ - __domain_flush_pages(domain, address, size, 0); -} - -/* Flush the whole IO/TLB for a given protection domain */ -static void domain_flush_tlb(struct protection_domain *domain) -{ - __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); -} - -/* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void domain_flush_tlb_pde(struct protection_domain *domain) -{ - __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); -} - -static void domain_flush_complete(struct protection_domain *domain) -{ - int i; - - for (i = 0; i < amd_iommus_present; ++i) { - if (!domain->dev_iommu[i]) - continue; - - /* - * Devices of this domain are behind this IOMMU - * We need to wait for completion of all commands. - */ - iommu_completion_wait(amd_iommus[i]); - } -} - - -/* - * This function flushes the DTEs for all devices in domain - */ -static void domain_flush_devices(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - - list_for_each_entry(dev_data, &domain->dev_list, list) - device_flush_dte(dev_data->dev); - - spin_unlock_irqrestore(&domain->lock, flags); -} - -/**************************************************************************** - * - * The functions below are used the create the page table mappings for - * unity mapped regions. - * - ****************************************************************************/ - -/* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. - */ -static bool increase_address_space(struct protection_domain *domain, - gfp_t gfp) -{ - u64 *pte; - - if (domain->mode == PAGE_MODE_6_LEVEL) - /* address space already 64 bit large */ - return false; - - pte = (void *)get_zeroed_page(gfp); - if (!pte) - return false; - - *pte = PM_LEVEL_PDE(domain->mode, - virt_to_phys(domain->pt_root)); - domain->pt_root = pte; - domain->mode += 1; - domain->updated = true; - - return true; -} - -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, - unsigned long page_size, - u64 **pte_page, - gfp_t gfp) -{ - int level, end_lvl; - u64 *pte, *page; - - BUG_ON(!is_power_of_2(page_size)); - - while (address > PM_LEVEL_SIZE(domain->mode)) - increase_address_space(domain, gfp); - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - address = PAGE_SIZE_ALIGN(address, page_size); - end_lvl = PAGE_SIZE_LEVEL(page_size); - - while (level > end_lvl) { - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); - } - - /* No level skipping support yet */ - if (PM_PTE_LEVEL(*pte) != level) - return NULL; - - level -= 1; - - pte = IOMMU_PTE_PAGE(*pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct protection_domain *domain, unsigned long address) -{ - int level; - u64 *pte; - - if (address > PM_LEVEL_SIZE(domain->mode)) - return NULL; - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > 0) { - - /* Not Present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Large PTE */ - if (PM_PTE_LEVEL(*pte) == 0x07) { - unsigned long pte_mask, __pte; - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - pte_mask = PTE_PAGE_SIZE(*pte); - pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1); - __pte = ((unsigned long)pte) & pte_mask; - - return (u64 *)__pte; - } - - /* No level skipping support yet */ - if (PM_PTE_LEVEL(*pte) != level) - return NULL; - - level -= 1; - - /* Walk to the next level */ - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* - * Generic mapping functions. It maps a physical address into a DMA - * address space. It allocates the page table pages if necessary. - * In the future it can be extended to a generic mapping function - * supporting all features of AMD IOMMU page tables like level skipping - * and full 64 bit address spaces. - */ -static int iommu_map_page(struct protection_domain *dom, - unsigned long bus_addr, - unsigned long phys_addr, - int prot, - unsigned long page_size) -{ - u64 __pte, *pte; - int i, count; - - if (!(prot & IOMMU_PROT_MASK)) - return -EINVAL; - - bus_addr = PAGE_ALIGN(bus_addr); - phys_addr = PAGE_ALIGN(phys_addr); - count = PAGE_SIZE_PTE_COUNT(page_size); - pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL); - - for (i = 0; i < count; ++i) - if (IOMMU_PTE_PRESENT(pte[i])) - return -EBUSY; - - if (page_size > PAGE_SIZE) { - __pte = PAGE_SIZE_PTE(phys_addr, page_size); - __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC; - } else - __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC; - - if (prot & IOMMU_PROT_IR) - __pte |= IOMMU_PTE_IR; - if (prot & IOMMU_PROT_IW) - __pte |= IOMMU_PTE_IW; - - for (i = 0; i < count; ++i) - pte[i] = __pte; - - update_domain(dom); - - return 0; -} - -static unsigned long iommu_unmap_page(struct protection_domain *dom, - unsigned long bus_addr, - unsigned long page_size) -{ - unsigned long long unmap_size, unmapped; - u64 *pte; - - BUG_ON(!is_power_of_2(page_size)); - - unmapped = 0; - - while (unmapped < page_size) { - - pte = fetch_pte(dom, bus_addr); - - if (!pte) { - /* - * No PTE for this address - * move forward in 4kb steps - */ - unmap_size = PAGE_SIZE; - } else if (PM_PTE_LEVEL(*pte) == 0) { - /* 4kb PTE found for this address */ - unmap_size = PAGE_SIZE; - *pte = 0ULL; - } else { - int count, i; - - /* Large PTE found which maps this address */ - unmap_size = PTE_PAGE_SIZE(*pte); - count = PAGE_SIZE_PTE_COUNT(unmap_size); - for (i = 0; i < count; i++) - pte[i] = 0ULL; - } - - bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - BUG_ON(!is_power_of_2(unmapped)); - - return unmapped; -} - -/* - * This function checks if a specific unity mapping entry is needed for - * this specific IOMMU. - */ -static int iommu_for_unity_map(struct amd_iommu *iommu, - struct unity_map_entry *entry) -{ - u16 bdf, i; - - for (i = entry->devid_start; i <= entry->devid_end; ++i) { - bdf = amd_iommu_alias_table[i]; - if (amd_iommu_rlookup_table[bdf] == iommu) - return 1; - } - - return 0; -} - -/* - * This function actually applies the mapping to the page table of the - * dma_ops domain. - */ -static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, - struct unity_map_entry *e) -{ - u64 addr; - int ret; - - for (addr = e->address_start; addr < e->address_end; - addr += PAGE_SIZE) { - ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, - PAGE_SIZE); - if (ret) - return ret; - /* - * if unity mapping is in aperture range mark the page - * as allocated in the aperture - */ - if (addr < dma_dom->aperture_size) - __set_bit(addr >> PAGE_SHIFT, - dma_dom->aperture[0]->bitmap); - } - - return 0; -} - -/* - * Init the unity mappings for a specific IOMMU in the system - * - * Basically iterates over all unity mapping entries and applies them to - * the default domain DMA of that IOMMU if necessary. - */ -static int iommu_init_unity_mappings(struct amd_iommu *iommu) -{ - struct unity_map_entry *entry; - int ret; - - list_for_each_entry(entry, &amd_iommu_unity_map, list) { - if (!iommu_for_unity_map(iommu, entry)) - continue; - ret = dma_ops_unity_map(iommu->default_dom, entry); - if (ret) - return ret; - } - - return 0; -} - -/* - * Inits the unity mappings required for a specific device - */ -static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, - u16 devid) -{ - struct unity_map_entry *e; - int ret; - - list_for_each_entry(e, &amd_iommu_unity_map, list) { - if (!(devid >= e->devid_start && devid <= e->devid_end)) - continue; - ret = dma_ops_unity_map(dma_dom, e); - if (ret) - return ret; - } - - return 0; -} - -/**************************************************************************** - * - * The next functions belong to the address allocator for the dma_ops - * interface functions. They work like the allocators in the other IOMMU - * drivers. Its basically a bitmap which marks the allocated pages in - * the aperture. Maybe it could be enhanced in the future to a more - * efficient allocator. - * - ****************************************************************************/ - -/* - * The address allocator core functions. - * - * called with domain->lock held - */ - -/* - * Used to reserve address ranges in the aperture (e.g. for exclusion - * ranges. - */ -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages) -{ - unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; - - if (start_page + pages > last_page) - pages = last_page - start_page; - - for (i = start_page; i < start_page + pages; ++i) { - int index = i / APERTURE_RANGE_PAGES; - int page = i % APERTURE_RANGE_PAGES; - __set_bit(page, dom->aperture[index]->bitmap); - } -} - -/* - * This function is used to add a new aperture range to an existing - * aperture in case of dma_ops domain allocation or address allocation - * failure. - */ -static int alloc_new_range(struct dma_ops_domain *dma_dom, - bool populate, gfp_t gfp) -{ - int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; - struct amd_iommu *iommu; - unsigned long i; - -#ifdef CONFIG_IOMMU_STRESS - populate = false; -#endif - - if (index >= APERTURE_MAX_RANGES) - return -ENOMEM; - - dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp); - if (!dma_dom->aperture[index]) - return -ENOMEM; - - dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp); - if (!dma_dom->aperture[index]->bitmap) - goto out_free; - - dma_dom->aperture[index]->offset = dma_dom->aperture_size; - - if (populate) { - unsigned long address = dma_dom->aperture_size; - int i, num_ptes = APERTURE_RANGE_PAGES / 512; - u64 *pte, *pte_page; - - for (i = 0; i < num_ptes; ++i) { - pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE, - &pte_page, gfp); - if (!pte) - goto out_free; - - dma_dom->aperture[index]->pte_pages[i] = pte_page; - - address += APERTURE_RANGE_SIZE / 64; - } - } - - dma_dom->aperture_size += APERTURE_RANGE_SIZE; - - /* Initialize the exclusion range if necessary */ - for_each_iommu(iommu) { - if (iommu->exclusion_start && - iommu->exclusion_start >= dma_dom->aperture[index]->offset - && iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - startpage = iommu->exclusion_start >> PAGE_SHIFT; - dma_ops_reserve_addresses(dma_dom, startpage, pages); - } - } - - /* - * Check for areas already mapped as present in the new aperture - * range and mark those pages as reserved in the allocator. Such - * mappings may already exist as a result of requested unity - * mappings for devices. - */ - for (i = dma_dom->aperture[index]->offset; - i < dma_dom->aperture_size; - i += PAGE_SIZE) { - u64 *pte = fetch_pte(&dma_dom->domain, i); - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - continue; - - dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); - } - - update_domain(&dma_dom->domain); - - return 0; - -out_free: - update_domain(&dma_dom->domain); - - free_page((unsigned long)dma_dom->aperture[index]->bitmap); - - kfree(dma_dom->aperture[index]); - dma_dom->aperture[index] = NULL; - - return -ENOMEM; -} - -static unsigned long dma_ops_area_alloc(struct device *dev, - struct dma_ops_domain *dom, - unsigned int pages, - unsigned long align_mask, - u64 dma_mask, - unsigned long start) -{ - unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE; - int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; - int i = start >> APERTURE_RANGE_SHIFT; - unsigned long boundary_size; - unsigned long address = -1; - unsigned long limit; - - next_bit >>= PAGE_SHIFT; - - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - - for (;i < max_index; ++i) { - unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT; - - if (dom->aperture[i]->offset >= dma_mask) - break; - - limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset, - dma_mask >> PAGE_SHIFT); - - address = iommu_area_alloc(dom->aperture[i]->bitmap, - limit, next_bit, pages, 0, - boundary_size, align_mask); - if (address != -1) { - address = dom->aperture[i]->offset + - (address << PAGE_SHIFT); - dom->next_address = address + (pages << PAGE_SHIFT); - break; - } - - next_bit = 0; - } - - return address; -} - -static unsigned long dma_ops_alloc_addresses(struct device *dev, - struct dma_ops_domain *dom, - unsigned int pages, - unsigned long align_mask, - u64 dma_mask) -{ - unsigned long address; - -#ifdef CONFIG_IOMMU_STRESS - dom->next_address = 0; - dom->need_flush = true; -#endif - - address = dma_ops_area_alloc(dev, dom, pages, align_mask, - dma_mask, dom->next_address); - - if (address == -1) { - dom->next_address = 0; - address = dma_ops_area_alloc(dev, dom, pages, align_mask, - dma_mask, 0); - dom->need_flush = true; - } - - if (unlikely(address == -1)) - address = DMA_ERROR_CODE; - - WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); - - return address; -} - -/* - * The address free function. - * - * called with domain->lock held - */ -static void dma_ops_free_addresses(struct dma_ops_domain *dom, - unsigned long address, - unsigned int pages) -{ - unsigned i = address >> APERTURE_RANGE_SHIFT; - struct aperture_range *range = dom->aperture[i]; - - BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); - -#ifdef CONFIG_IOMMU_STRESS - if (i < 4) - return; -#endif - - if (address >= dom->next_address) - dom->need_flush = true; - - address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; - - bitmap_clear(range->bitmap, address, pages); - -} - -/**************************************************************************** - * - * The next functions belong to the domain allocation. A domain is - * allocated for every IOMMU as the default domain. If device isolation - * is enabled, every device get its own domain. The most important thing - * about domains is the page table mapping the DMA address space they - * contain. - * - ****************************************************************************/ - -/* - * This function adds a protection domain to the global protection domain list - */ -static void add_domain_to_list(struct protection_domain *domain) -{ - unsigned long flags; - - spin_lock_irqsave(&amd_iommu_pd_lock, flags); - list_add(&domain->list, &amd_iommu_pd_list); - spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); -} - -/* - * This function removes a protection domain to the global - * protection domain list - */ -static void del_domain_from_list(struct protection_domain *domain) -{ - unsigned long flags; - - spin_lock_irqsave(&amd_iommu_pd_lock, flags); - list_del(&domain->list); - spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); -} - -static u16 domain_id_alloc(void) -{ - unsigned long flags; - int id; - - write_lock_irqsave(&amd_iommu_devtable_lock, flags); - id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); - BUG_ON(id == 0); - if (id > 0 && id < MAX_DOMAIN_ID) - __set_bit(id, amd_iommu_pd_alloc_bitmap); - else - id = 0; - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - - return id; -} - -static void domain_id_free(int id) -{ - unsigned long flags; - - write_lock_irqsave(&amd_iommu_devtable_lock, flags); - if (id > 0 && id < MAX_DOMAIN_ID) - __clear_bit(id, amd_iommu_pd_alloc_bitmap); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); -} - -static void free_pagetable(struct protection_domain *domain) -{ - int i, j; - u64 *p1, *p2, *p3; - - p1 = domain->pt_root; - - if (!p1) - return; - - for (i = 0; i < 512; ++i) { - if (!IOMMU_PTE_PRESENT(p1[i])) - continue; - - p2 = IOMMU_PTE_PAGE(p1[i]); - for (j = 0; j < 512; ++j) { - if (!IOMMU_PTE_PRESENT(p2[j])) - continue; - p3 = IOMMU_PTE_PAGE(p2[j]); - free_page((unsigned long)p3); - } - - free_page((unsigned long)p2); - } - - free_page((unsigned long)p1); - - domain->pt_root = NULL; -} - -/* - * Free a domain, only used if something went wrong in the - * allocation path and we need to free an already allocated page table - */ -static void dma_ops_domain_free(struct dma_ops_domain *dom) -{ - int i; - - if (!dom) - return; - - del_domain_from_list(&dom->domain); - - free_pagetable(&dom->domain); - - for (i = 0; i < APERTURE_MAX_RANGES; ++i) { - if (!dom->aperture[i]) - continue; - free_page((unsigned long)dom->aperture[i]->bitmap); - kfree(dom->aperture[i]); - } - - kfree(dom); -} - -/* - * Allocates a new protection domain usable for the dma_ops functions. - * It also initializes the page table and the address allocator data - * structures required for the dma_ops interface - */ -static struct dma_ops_domain *dma_ops_domain_alloc(void) -{ - struct dma_ops_domain *dma_dom; - - dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); - if (!dma_dom) - return NULL; - - spin_lock_init(&dma_dom->domain.lock); - - dma_dom->domain.id = domain_id_alloc(); - if (dma_dom->domain.id == 0) - goto free_dma_dom; - INIT_LIST_HEAD(&dma_dom->domain.dev_list); - dma_dom->domain.mode = PAGE_MODE_2_LEVEL; - dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); - dma_dom->domain.flags = PD_DMA_OPS_MASK; - dma_dom->domain.priv = dma_dom; - if (!dma_dom->domain.pt_root) - goto free_dma_dom; - - dma_dom->need_flush = false; - dma_dom->target_dev = 0xffff; - - add_domain_to_list(&dma_dom->domain); - - if (alloc_new_range(dma_dom, true, GFP_KERNEL)) - goto free_dma_dom; - - /* - * mark the first page as allocated so we never return 0 as - * a valid dma-address. So we can use 0 as error value - */ - dma_dom->aperture[0]->bitmap[0] = 1; - dma_dom->next_address = 0; - - - return dma_dom; - -free_dma_dom: - dma_ops_domain_free(dma_dom); - - return NULL; -} - -/* - * little helper function to check whether a given protection domain is a - * dma_ops domain - */ -static bool dma_ops_domain(struct protection_domain *domain) -{ - return domain->flags & PD_DMA_OPS_MASK; -} - -static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) -{ - u64 pte_root = virt_to_phys(domain->pt_root); - u32 flags = 0; - - pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) - << DEV_ENTRY_MODE_SHIFT; - pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; - - if (ats) - flags |= DTE_FLAG_IOTLB; - - amd_iommu_dev_table[devid].data[3] |= flags; - amd_iommu_dev_table[devid].data[2] = domain->id; - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); -} - -static void clear_dte_entry(u16 devid) -{ - /* remove entry from the device table seen by the hardware */ - amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[1] = 0; - amd_iommu_dev_table[devid].data[2] = 0; - - amd_iommu_apply_erratum_63(devid); -} - -static void do_attach(struct device *dev, struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - struct amd_iommu *iommu; - struct pci_dev *pdev; - bool ats = false; - u16 devid; - - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; - dev_data = get_dev_data(dev); - pdev = to_pci_dev(dev); - - if (amd_iommu_iotlb_sup) - ats = pci_ats_enabled(pdev); - - /* Update data structures */ - dev_data->domain = domain; - list_add(&dev_data->list, &domain->dev_list); - set_dte_entry(devid, domain, ats); - - /* Do reference counting */ - domain->dev_iommu[iommu->index] += 1; - domain->dev_cnt += 1; - - /* Flush the DTE entry */ - device_flush_dte(dev); -} - -static void do_detach(struct device *dev) -{ - struct iommu_dev_data *dev_data; - struct amd_iommu *iommu; - u16 devid; - - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; - dev_data = get_dev_data(dev); - - /* decrease reference counters */ - dev_data->domain->dev_iommu[iommu->index] -= 1; - dev_data->domain->dev_cnt -= 1; - - /* Update data structures */ - dev_data->domain = NULL; - list_del(&dev_data->list); - clear_dte_entry(devid); - - /* Flush the DTE entry */ - device_flush_dte(dev); -} - -/* - * If a device is not yet associated with a domain, this function does - * assigns it visible for the hardware - */ -static int __attach_device(struct device *dev, - struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data, *alias_data; - int ret; - - dev_data = get_dev_data(dev); - alias_data = get_dev_data(dev_data->alias); - - if (!alias_data) - return -EINVAL; - - /* lock domain */ - spin_lock(&domain->lock); - - /* Some sanity checks */ - ret = -EBUSY; - if (alias_data->domain != NULL && - alias_data->domain != domain) - goto out_unlock; - - if (dev_data->domain != NULL && - dev_data->domain != domain) - goto out_unlock; - - /* Do real assignment */ - if (dev_data->alias != dev) { - alias_data = get_dev_data(dev_data->alias); - if (alias_data->domain == NULL) - do_attach(dev_data->alias, domain); - - atomic_inc(&alias_data->bind); - } - - if (dev_data->domain == NULL) - do_attach(dev, domain); - - atomic_inc(&dev_data->bind); - - ret = 0; - -out_unlock: - - /* ready */ - spin_unlock(&domain->lock); - - return ret; -} - -/* - * If a device is not yet associated with a domain, this function does - * assigns it visible for the hardware - */ -static int attach_device(struct device *dev, - struct protection_domain *domain) -{ - struct pci_dev *pdev = to_pci_dev(dev); - unsigned long flags; - int ret; - - if (amd_iommu_iotlb_sup) - pci_enable_ats(pdev, PAGE_SHIFT); - - write_lock_irqsave(&amd_iommu_devtable_lock, flags); - ret = __attach_device(dev, domain); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - - /* - * We might boot into a crash-kernel here. The crashed kernel - * left the caches in the IOMMU dirty. So we have to flush - * here to evict all dirty stuff. - */ - domain_flush_tlb_pde(domain); - - return ret; -} - -/* - * Removes a device from a protection domain (unlocked) - */ -static void __detach_device(struct device *dev) -{ - struct iommu_dev_data *dev_data = get_dev_data(dev); - struct iommu_dev_data *alias_data; - struct protection_domain *domain; - unsigned long flags; - - BUG_ON(!dev_data->domain); - - domain = dev_data->domain; - - spin_lock_irqsave(&domain->lock, flags); - - if (dev_data->alias != dev) { - alias_data = get_dev_data(dev_data->alias); - if (atomic_dec_and_test(&alias_data->bind)) - do_detach(dev_data->alias); - } - - if (atomic_dec_and_test(&dev_data->bind)) - do_detach(dev); - - spin_unlock_irqrestore(&domain->lock, flags); - - /* - * If we run in passthrough mode the device must be assigned to the - * passthrough domain if it is detached from any other domain. - * Make sure we can deassign from the pt_domain itself. - */ - if (iommu_pass_through && - (dev_data->domain == NULL && domain != pt_domain)) - __attach_device(dev, pt_domain); -} - -/* - * Removes a device from a protection domain (with devtable_lock held) - */ -static void detach_device(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - unsigned long flags; - - /* lock device table */ - write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __detach_device(dev); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - - if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev)) - pci_disable_ats(pdev); -} - -/* - * Find out the protection domain structure for a given PCI device. This - * will give us the pointer to the page table root for example. - */ -static struct protection_domain *domain_for_device(struct device *dev) -{ - struct protection_domain *dom; - struct iommu_dev_data *dev_data, *alias_data; - unsigned long flags; - u16 devid; - - devid = get_device_id(dev); - dev_data = get_dev_data(dev); - alias_data = get_dev_data(dev_data->alias); - if (!alias_data) - return NULL; - - read_lock_irqsave(&amd_iommu_devtable_lock, flags); - dom = dev_data->domain; - if (dom == NULL && - alias_data->domain != NULL) { - __attach_device(dev, alias_data->domain); - dom = alias_data->domain; - } - - read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - - return dom; -} - -static int device_change_notifier(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct device *dev = data; - u16 devid; - struct protection_domain *domain; - struct dma_ops_domain *dma_domain; - struct amd_iommu *iommu; - unsigned long flags; - - if (!check_device(dev)) - return 0; - - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; - - switch (action) { - case BUS_NOTIFY_UNBOUND_DRIVER: - - domain = domain_for_device(dev); - - if (!domain) - goto out; - if (iommu_pass_through) - break; - detach_device(dev); - break; - case BUS_NOTIFY_ADD_DEVICE: - - iommu_init_device(dev); - - domain = domain_for_device(dev); - - /* allocate a protection domain if a device is added */ - dma_domain = find_protection_domain(devid); - if (dma_domain) - goto out; - dma_domain = dma_ops_domain_alloc(); - if (!dma_domain) - goto out; - dma_domain->target_dev = devid; - - spin_lock_irqsave(&iommu_pd_list_lock, flags); - list_add_tail(&dma_domain->list, &iommu_pd_list); - spin_unlock_irqrestore(&iommu_pd_list_lock, flags); - - break; - case BUS_NOTIFY_DEL_DEVICE: - - iommu_uninit_device(dev); - - default: - goto out; - } - - device_flush_dte(dev); - iommu_completion_wait(iommu); - -out: - return 0; -} - -static struct notifier_block device_nb = { - .notifier_call = device_change_notifier, -}; - -void amd_iommu_init_notifier(void) -{ - bus_register_notifier(&pci_bus_type, &device_nb); -} - -/***************************************************************************** - * - * The next functions belong to the dma_ops mapping/unmapping code. - * - *****************************************************************************/ - -/* - * In the dma_ops path we only have the struct device. This function - * finds the corresponding IOMMU, the protection domain and the - * requestor id for a given device. - * If the device is not yet associated with a domain this is also done - * in this function. - */ -static struct protection_domain *get_domain(struct device *dev) -{ - struct protection_domain *domain; - struct dma_ops_domain *dma_dom; - u16 devid = get_device_id(dev); - - if (!check_device(dev)) - return ERR_PTR(-EINVAL); - - domain = domain_for_device(dev); - if (domain != NULL && !dma_ops_domain(domain)) - return ERR_PTR(-EBUSY); - - if (domain != NULL) - return domain; - - /* Device not bount yet - bind it */ - dma_dom = find_protection_domain(devid); - if (!dma_dom) - dma_dom = amd_iommu_rlookup_table[devid]->default_dom; - attach_device(dev, &dma_dom->domain); - DUMP_printk("Using protection domain %d for device %s\n", - dma_dom->domain.id, dev_name(dev)); - - return &dma_dom->domain; -} - -static void update_device_table(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - - list_for_each_entry(dev_data, &domain->dev_list, list) { - struct pci_dev *pdev = to_pci_dev(dev_data->dev); - u16 devid = get_device_id(dev_data->dev); - set_dte_entry(devid, domain, pci_ats_enabled(pdev)); - } -} - -static void update_domain(struct protection_domain *domain) -{ - if (!domain->updated) - return; - - update_device_table(domain); - - domain_flush_devices(domain); - domain_flush_tlb_pde(domain); - - domain->updated = false; -} - -/* - * This function fetches the PTE for a given address in the aperture - */ -static u64* dma_ops_get_pte(struct dma_ops_domain *dom, - unsigned long address) -{ - struct aperture_range *aperture; - u64 *pte, *pte_page; - - aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; - if (!aperture) - return NULL; - - pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; - if (!pte) { - pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page, - GFP_ATOMIC); - aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; - } else - pte += PM_LEVEL_INDEX(0, address); - - update_domain(&dom->domain); - - return pte; -} - -/* - * This is the generic map function. It maps one 4kb page at paddr to - * the given address in the DMA address space for the domain. - */ -static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom, - unsigned long address, - phys_addr_t paddr, - int direction) -{ - u64 *pte, __pte; - - WARN_ON(address > dom->aperture_size); - - paddr &= PAGE_MASK; - - pte = dma_ops_get_pte(dom, address); - if (!pte) - return DMA_ERROR_CODE; - - __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; - - if (direction == DMA_TO_DEVICE) - __pte |= IOMMU_PTE_IR; - else if (direction == DMA_FROM_DEVICE) - __pte |= IOMMU_PTE_IW; - else if (direction == DMA_BIDIRECTIONAL) - __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW; - - WARN_ON(*pte); - - *pte = __pte; - - return (dma_addr_t)address; -} - -/* - * The generic unmapping function for on page in the DMA address space. - */ -static void dma_ops_domain_unmap(struct dma_ops_domain *dom, - unsigned long address) -{ - struct aperture_range *aperture; - u64 *pte; - - if (address >= dom->aperture_size) - return; - - aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; - if (!aperture) - return; - - pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; - if (!pte) - return; - - pte += PM_LEVEL_INDEX(0, address); - - WARN_ON(!*pte); - - *pte = 0ULL; -} - -/* - * This function contains common code for mapping of a physically - * contiguous memory region into DMA address space. It is used by all - * mapping functions provided with this IOMMU driver. - * Must be called with the domain lock held. - */ -static dma_addr_t __map_single(struct device *dev, - struct dma_ops_domain *dma_dom, - phys_addr_t paddr, - size_t size, - int dir, - bool align, - u64 dma_mask) -{ - dma_addr_t offset = paddr & ~PAGE_MASK; - dma_addr_t address, start, ret; - unsigned int pages; - unsigned long align_mask = 0; - int i; - - pages = iommu_num_pages(paddr, size, PAGE_SIZE); - paddr &= PAGE_MASK; - - INC_STATS_COUNTER(total_map_requests); - - if (pages > 1) - INC_STATS_COUNTER(cross_page); - - if (align) - align_mask = (1UL << get_order(size)) - 1; - -retry: - address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, - dma_mask); - if (unlikely(address == DMA_ERROR_CODE)) { - /* - * setting next_address here will let the address - * allocator only scan the new allocated range in the - * first run. This is a small optimization. - */ - dma_dom->next_address = dma_dom->aperture_size; - - if (alloc_new_range(dma_dom, false, GFP_ATOMIC)) - goto out; - - /* - * aperture was successfully enlarged by 128 MB, try - * allocation again - */ - goto retry; - } - - start = address; - for (i = 0; i < pages; ++i) { - ret = dma_ops_domain_map(dma_dom, start, paddr, dir); - if (ret == DMA_ERROR_CODE) - goto out_unmap; - - paddr += PAGE_SIZE; - start += PAGE_SIZE; - } - address += offset; - - ADD_STATS_COUNTER(alloced_io_mem, size); - - if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { - domain_flush_tlb(&dma_dom->domain); - dma_dom->need_flush = false; - } else if (unlikely(amd_iommu_np_cache)) - domain_flush_pages(&dma_dom->domain, address, size); - -out: - return address; - -out_unmap: - - for (--i; i >= 0; --i) { - start -= PAGE_SIZE; - dma_ops_domain_unmap(dma_dom, start); - } - - dma_ops_free_addresses(dma_dom, address, pages); - - return DMA_ERROR_CODE; -} - -/* - * Does the reverse of the __map_single function. Must be called with - * the domain lock held too - */ -static void __unmap_single(struct dma_ops_domain *dma_dom, - dma_addr_t dma_addr, - size_t size, - int dir) -{ - dma_addr_t flush_addr; - dma_addr_t i, start; - unsigned int pages; - - if ((dma_addr == DMA_ERROR_CODE) || - (dma_addr + size > dma_dom->aperture_size)) - return; - - flush_addr = dma_addr; - pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - dma_addr &= PAGE_MASK; - start = dma_addr; - - for (i = 0; i < pages; ++i) { - dma_ops_domain_unmap(dma_dom, start); - start += PAGE_SIZE; - } - - SUB_STATS_COUNTER(alloced_io_mem, size); - - dma_ops_free_addresses(dma_dom, dma_addr, pages); - - if (amd_iommu_unmap_flush || dma_dom->need_flush) { - domain_flush_pages(&dma_dom->domain, flush_addr, size); - dma_dom->need_flush = false; - } -} - -/* - * The exported map_single function for dma_ops. - */ -static dma_addr_t map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - unsigned long flags; - struct protection_domain *domain; - dma_addr_t addr; - u64 dma_mask; - phys_addr_t paddr = page_to_phys(page) + offset; - - INC_STATS_COUNTER(cnt_map_single); - - domain = get_domain(dev); - if (PTR_ERR(domain) == -EINVAL) - return (dma_addr_t)paddr; - else if (IS_ERR(domain)) - return DMA_ERROR_CODE; - - dma_mask = *dev->dma_mask; - - spin_lock_irqsave(&domain->lock, flags); - - addr = __map_single(dev, domain->priv, paddr, size, dir, false, - dma_mask); - if (addr == DMA_ERROR_CODE) - goto out; - - domain_flush_complete(domain); - -out: - spin_unlock_irqrestore(&domain->lock, flags); - - return addr; -} - -/* - * The exported unmap_single function for dma_ops. - */ -static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction dir, struct dma_attrs *attrs) -{ - unsigned long flags; - struct protection_domain *domain; - - INC_STATS_COUNTER(cnt_unmap_single); - - domain = get_domain(dev); - if (IS_ERR(domain)) - return; - - spin_lock_irqsave(&domain->lock, flags); - - __unmap_single(domain->priv, dma_addr, size, dir); - - domain_flush_complete(domain); - - spin_unlock_irqrestore(&domain->lock, flags); -} - -/* - * This is a special map_sg function which is used if we should map a - * device which is not handled by an AMD IOMMU in the system. - */ -static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, - int nelems, int dir) -{ - struct scatterlist *s; - int i; - - for_each_sg(sglist, s, nelems, i) { - s->dma_address = (dma_addr_t)sg_phys(s); - s->dma_length = s->length; - } - - return nelems; -} - -/* - * The exported map_sg function for dma_ops (handles scatter-gather - * lists). - */ -static int map_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - unsigned long flags; - struct protection_domain *domain; - int i; - struct scatterlist *s; - phys_addr_t paddr; - int mapped_elems = 0; - u64 dma_mask; - - INC_STATS_COUNTER(cnt_map_sg); - - domain = get_domain(dev); - if (PTR_ERR(domain) == -EINVAL) - return map_sg_no_iommu(dev, sglist, nelems, dir); - else if (IS_ERR(domain)) - return 0; - - dma_mask = *dev->dma_mask; - - spin_lock_irqsave(&domain->lock, flags); - - for_each_sg(sglist, s, nelems, i) { - paddr = sg_phys(s); - - s->dma_address = __map_single(dev, domain->priv, - paddr, s->length, dir, false, - dma_mask); - - if (s->dma_address) { - s->dma_length = s->length; - mapped_elems++; - } else - goto unmap; - } - - domain_flush_complete(domain); - -out: - spin_unlock_irqrestore(&domain->lock, flags); - - return mapped_elems; -unmap: - for_each_sg(sglist, s, mapped_elems, i) { - if (s->dma_address) - __unmap_single(domain->priv, s->dma_address, - s->dma_length, dir); - s->dma_address = s->dma_length = 0; - } - - mapped_elems = 0; - - goto out; -} - -/* - * The exported map_sg function for dma_ops (handles scatter-gather - * lists). - */ -static void unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - unsigned long flags; - struct protection_domain *domain; - struct scatterlist *s; - int i; - - INC_STATS_COUNTER(cnt_unmap_sg); - - domain = get_domain(dev); - if (IS_ERR(domain)) - return; - - spin_lock_irqsave(&domain->lock, flags); - - for_each_sg(sglist, s, nelems, i) { - __unmap_single(domain->priv, s->dma_address, - s->dma_length, dir); - s->dma_address = s->dma_length = 0; - } - - domain_flush_complete(domain); - - spin_unlock_irqrestore(&domain->lock, flags); -} - -/* - * The exported alloc_coherent function for dma_ops. - */ -static void *alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag) -{ - unsigned long flags; - void *virt_addr; - struct protection_domain *domain; - phys_addr_t paddr; - u64 dma_mask = dev->coherent_dma_mask; - - INC_STATS_COUNTER(cnt_alloc_coherent); - - domain = get_domain(dev); - if (PTR_ERR(domain) == -EINVAL) { - virt_addr = (void *)__get_free_pages(flag, get_order(size)); - *dma_addr = __pa(virt_addr); - return virt_addr; - } else if (IS_ERR(domain)) - return NULL; - - dma_mask = dev->coherent_dma_mask; - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - flag |= __GFP_ZERO; - - virt_addr = (void *)__get_free_pages(flag, get_order(size)); - if (!virt_addr) - return NULL; - - paddr = virt_to_phys(virt_addr); - - if (!dma_mask) - dma_mask = *dev->dma_mask; - - spin_lock_irqsave(&domain->lock, flags); - - *dma_addr = __map_single(dev, domain->priv, paddr, - size, DMA_BIDIRECTIONAL, true, dma_mask); - - if (*dma_addr == DMA_ERROR_CODE) { - spin_unlock_irqrestore(&domain->lock, flags); - goto out_free; - } - - domain_flush_complete(domain); - - spin_unlock_irqrestore(&domain->lock, flags); - - return virt_addr; - -out_free: - - free_pages((unsigned long)virt_addr, get_order(size)); - - return NULL; -} - -/* - * The exported free_coherent function for dma_ops. - */ -static void free_coherent(struct device *dev, size_t size, - void *virt_addr, dma_addr_t dma_addr) -{ - unsigned long flags; - struct protection_domain *domain; - - INC_STATS_COUNTER(cnt_free_coherent); - - domain = get_domain(dev); - if (IS_ERR(domain)) - goto free_mem; - - spin_lock_irqsave(&domain->lock, flags); - - __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - - domain_flush_complete(domain); - - spin_unlock_irqrestore(&domain->lock, flags); - -free_mem: - free_pages((unsigned long)virt_addr, get_order(size)); -} - -/* - * This function is called by the DMA layer to find out if we can handle a - * particular device. It is part of the dma_ops. - */ -static int amd_iommu_dma_supported(struct device *dev, u64 mask) -{ - return check_device(dev); -} - -/* - * The function for pre-allocating protection domains. - * - * If the driver core informs the DMA layer if a driver grabs a device - * we don't need to preallocate the protection domains anymore. - * For now we have to. - */ -static void prealloc_protection_domains(void) -{ - struct pci_dev *dev = NULL; - struct dma_ops_domain *dma_dom; - u16 devid; - - for_each_pci_dev(dev) { - - /* Do we handle this device? */ - if (!check_device(&dev->dev)) - continue; - - /* Is there already any domain for it? */ - if (domain_for_device(&dev->dev)) - continue; - - devid = get_device_id(&dev->dev); - - dma_dom = dma_ops_domain_alloc(); - if (!dma_dom) - continue; - init_unity_mappings_for_device(dma_dom, devid); - dma_dom->target_dev = devid; - - attach_device(&dev->dev, &dma_dom->domain); - - list_add_tail(&dma_dom->list, &iommu_pd_list); - } -} - -static struct dma_map_ops amd_iommu_dma_ops = { - .alloc_coherent = alloc_coherent, - .free_coherent = free_coherent, - .map_page = map_page, - .unmap_page = unmap_page, - .map_sg = map_sg, - .unmap_sg = unmap_sg, - .dma_supported = amd_iommu_dma_supported, -}; - -static unsigned device_dma_ops_init(void) -{ - struct pci_dev *pdev = NULL; - unsigned unhandled = 0; - - for_each_pci_dev(pdev) { - if (!check_device(&pdev->dev)) { - unhandled += 1; - continue; - } - - pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops; - } - - return unhandled; -} - -/* - * The function which clues the AMD IOMMU driver into dma_ops. - */ - -void __init amd_iommu_init_api(void) -{ - register_iommu(&amd_iommu_ops); -} - -int __init amd_iommu_init_dma_ops(void) -{ - struct amd_iommu *iommu; - int ret, unhandled; - - /* - * first allocate a default protection domain for every IOMMU we - * found in the system. Devices not assigned to any other - * protection domain will be assigned to the default one. - */ - for_each_iommu(iommu) { - iommu->default_dom = dma_ops_domain_alloc(); - if (iommu->default_dom == NULL) - return -ENOMEM; - iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; - ret = iommu_init_unity_mappings(iommu); - if (ret) - goto free_domains; - } - - /* - * Pre-allocate the protection domains for each device. - */ - prealloc_protection_domains(); - - iommu_detected = 1; - swiotlb = 0; - - /* Make the driver finally visible to the drivers */ - unhandled = device_dma_ops_init(); - if (unhandled && max_pfn > MAX_DMA32_PFN) { - /* There are unhandled devices - initialize swiotlb for them */ - swiotlb = 1; - } - - amd_iommu_stats_init(); - - return 0; - -free_domains: - - for_each_iommu(iommu) { - if (iommu->default_dom) - dma_ops_domain_free(iommu->default_dom); - } - - return ret; -} - -/***************************************************************************** - * - * The following functions belong to the exported interface of AMD IOMMU - * - * This interface allows access to lower level functions of the IOMMU - * like protection domain handling and assignement of devices to domains - * which is not possible with the dma_ops interface. - * - *****************************************************************************/ - -static void cleanup_domain(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data, *next; - unsigned long flags; - - write_lock_irqsave(&amd_iommu_devtable_lock, flags); - - list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { - struct device *dev = dev_data->dev; - - __detach_device(dev); - atomic_set(&dev_data->bind, 0); - } - - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); -} - -static void protection_domain_free(struct protection_domain *domain) -{ - if (!domain) - return; - - del_domain_from_list(domain); - - if (domain->id) - domain_id_free(domain->id); - - kfree(domain); -} - -static struct protection_domain *protection_domain_alloc(void) -{ - struct protection_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - - spin_lock_init(&domain->lock); - mutex_init(&domain->api_lock); - domain->id = domain_id_alloc(); - if (!domain->id) - goto out_err; - INIT_LIST_HEAD(&domain->dev_list); - - add_domain_to_list(domain); - - return domain; - -out_err: - kfree(domain); - - return NULL; -} - -static int amd_iommu_domain_init(struct iommu_domain *dom) -{ - struct protection_domain *domain; - - domain = protection_domain_alloc(); - if (!domain) - goto out_free; - - domain->mode = PAGE_MODE_3_LEVEL; - domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); - if (!domain->pt_root) - goto out_free; - - dom->priv = domain; - - return 0; - -out_free: - protection_domain_free(domain); - - return -ENOMEM; -} - -static void amd_iommu_domain_destroy(struct iommu_domain *dom) -{ - struct protection_domain *domain = dom->priv; - - if (!domain) - return; - - if (domain->dev_cnt > 0) - cleanup_domain(domain); - - BUG_ON(domain->dev_cnt != 0); - - free_pagetable(domain); - - protection_domain_free(domain); - - dom->priv = NULL; -} - -static void amd_iommu_detach_device(struct iommu_domain *dom, - struct device *dev) -{ - struct iommu_dev_data *dev_data = dev->archdata.iommu; - struct amd_iommu *iommu; - u16 devid; - - if (!check_device(dev)) - return; - - devid = get_device_id(dev); - - if (dev_data->domain != NULL) - detach_device(dev); - - iommu = amd_iommu_rlookup_table[devid]; - if (!iommu) - return; - - device_flush_dte(dev); - iommu_completion_wait(iommu); -} - -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev) -{ - struct protection_domain *domain = dom->priv; - struct iommu_dev_data *dev_data; - struct amd_iommu *iommu; - int ret; - u16 devid; - - if (!check_device(dev)) - return -EINVAL; - - dev_data = dev->archdata.iommu; - - devid = get_device_id(dev); - - iommu = amd_iommu_rlookup_table[devid]; - if (!iommu) - return -EINVAL; - - if (dev_data->domain) - detach_device(dev); - - ret = attach_device(dev, domain); - - iommu_completion_wait(iommu); - - return ret; -} - -static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, - phys_addr_t paddr, int gfp_order, int iommu_prot) -{ - unsigned long page_size = 0x1000UL << gfp_order; - struct protection_domain *domain = dom->priv; - int prot = 0; - int ret; - - if (iommu_prot & IOMMU_READ) - prot |= IOMMU_PROT_IR; - if (iommu_prot & IOMMU_WRITE) - prot |= IOMMU_PROT_IW; - - mutex_lock(&domain->api_lock); - ret = iommu_map_page(domain, iova, paddr, prot, page_size); - mutex_unlock(&domain->api_lock); - - return ret; -} - -static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, - int gfp_order) -{ - struct protection_domain *domain = dom->priv; - unsigned long page_size, unmap_size; - - page_size = 0x1000UL << gfp_order; - - mutex_lock(&domain->api_lock); - unmap_size = iommu_unmap_page(domain, iova, page_size); - mutex_unlock(&domain->api_lock); - - domain_flush_tlb_pde(domain); - - return get_order(unmap_size); -} - -static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, - unsigned long iova) -{ - struct protection_domain *domain = dom->priv; - unsigned long offset_mask; - phys_addr_t paddr; - u64 *pte, __pte; - - pte = fetch_pte(domain, iova); - - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - if (PM_PTE_LEVEL(*pte) == 0) - offset_mask = PAGE_SIZE - 1; - else - offset_mask = PTE_PAGE_SIZE(*pte) - 1; - - __pte = *pte & PM_ADDR_MASK; - paddr = (__pte & ~offset_mask) | (iova & offset_mask); - - return paddr; -} - -static int amd_iommu_domain_has_cap(struct iommu_domain *domain, - unsigned long cap) -{ - switch (cap) { - case IOMMU_CAP_CACHE_COHERENCY: - return 1; - } - - return 0; -} - -static struct iommu_ops amd_iommu_ops = { - .domain_init = amd_iommu_domain_init, - .domain_destroy = amd_iommu_domain_destroy, - .attach_dev = amd_iommu_attach_device, - .detach_dev = amd_iommu_detach_device, - .map = amd_iommu_map, - .unmap = amd_iommu_unmap, - .iova_to_phys = amd_iommu_iova_to_phys, - .domain_has_cap = amd_iommu_domain_has_cap, -}; - -/***************************************************************************** - * - * The next functions do a basic initialization of IOMMU for pass through - * mode - * - * In passthrough mode the IOMMU is initialized and enabled but not used for - * DMA-API translation. - * - *****************************************************************************/ - -int __init amd_iommu_init_passthrough(void) -{ - struct amd_iommu *iommu; - struct pci_dev *dev = NULL; - u16 devid; - - /* allocate passthrough domain */ - pt_domain = protection_domain_alloc(); - if (!pt_domain) - return -ENOMEM; - - pt_domain->mode |= PAGE_MODE_NONE; - - for_each_pci_dev(dev) { - if (!check_device(&dev->dev)) - continue; - - devid = get_device_id(&dev->dev); - - iommu = amd_iommu_rlookup_table[devid]; - if (!iommu) - continue; - - attach_device(&dev->dev, pt_domain); - } - - pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); - - return 0; -} -- cgit v1.1 From 166e9278a3f98bab29ebb3d685a81cfb11b98be0 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Fri, 10 Jun 2011 21:42:27 +0300 Subject: x86/ia64: intel-iommu: move to drivers/iommu/ This should ease finding similarities with different platforms, with the intention of solving problems once in a generic framework which everyone can use. Note: to move intel-iommu.c, the declaration of pci_find_upstream_pcie_bridge() has to move from drivers/pci/pci.h to include/linux/pci.h. This is handled in this patch, too. As suggested, also drop DMAR's EXPERIMENTAL tag while we're at it. Compile-tested on x86_64. Signed-off-by: Ohad Ben-Cohen Signed-off-by: Joerg Roedel --- arch/x86/Kconfig | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1b6a2e2..a169573 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1912,56 +1912,6 @@ config PCI_CNB20LE_QUIRK You should say N unless you know you need this. -config DMAR - bool "Support for DMA Remapping Devices (EXPERIMENTAL)" - depends on PCI_MSI && ACPI && EXPERIMENTAL - select IOMMU_API - help - DMA remapping (DMAR) devices support enables independent address - translations for Direct Memory Access (DMA) from devices. - These DMA remapping devices are reported via ACPI tables - and include PCI device scope covered by these DMA - remapping devices. - -config DMAR_DEFAULT_ON - def_bool y - prompt "Enable DMA Remapping Devices by default" - depends on DMAR - help - Selecting this option will enable a DMAR device at boot time if - one is found. If this option is not selected, DMAR support can - be enabled by passing intel_iommu=on to the kernel. It is - recommended you say N here while the DMAR code remains - experimental. - -config DMAR_BROKEN_GFX_WA - bool "Workaround broken graphics drivers (going away soon)" - depends on DMAR && BROKEN - ---help--- - Current Graphics drivers tend to use physical address - for DMA and avoid using DMA APIs. Setting this config - option permits the IOMMU driver to set a unity map for - all the OS-visible memory. Hence the driver can continue - to use physical addresses for DMA, at least until this - option is removed in the 2.6.32 kernel. - -config DMAR_FLOPPY_WA - def_bool y - depends on DMAR - ---help--- - Floppy disk drivers are known to bypass DMA API calls - thereby failing to work when IOMMU is enabled. This - workaround will setup a 1:1 mapping for the first - 16MiB to make floppy (an ISA device) work. - -config INTR_REMAP - bool "Support for Interrupt Remapping (EXPERIMENTAL)" - depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL - ---help--- - Supports Interrupt remapping for IO-APIC and MSI devices. - To use x2apic mode in the CPU's which support x2APIC enhancements or - to support platforms with CPU's having > 8 bit APIC ID, say Y. - source "drivers/pci/pcie/Kconfig" source "drivers/pci/Kconfig" -- cgit v1.1 From 403f81d8ee532c976d50a5e1051f14ec78ae8db3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 14 Jun 2011 16:44:25 +0200 Subject: iommu/amd: Move missing parts to drivers/iommu A few parts of the driver were missing in drivers/iommu. Move them there to have the complete driver in that directory. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu.h | 35 - arch/x86/include/asm/amd_iommu_proto.h | 54 -- arch/x86/include/asm/amd_iommu_types.h | 580 ------------ arch/x86/kernel/Makefile | 1 - arch/x86/kernel/amd_iommu_init.c | 1572 -------------------------------- 5 files changed, 2242 deletions(-) delete mode 100644 arch/x86/include/asm/amd_iommu.h delete mode 100644 arch/x86/include/asm/amd_iommu_proto.h delete mode 100644 arch/x86/include/asm/amd_iommu_types.h delete mode 100644 arch/x86/kernel/amd_iommu_init.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h deleted file mode 100644 index a6863a2..0000000 --- a/arch/x86/include/asm/amd_iommu.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. - * Author: Joerg Roedel - * Leo Duran - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _ASM_X86_AMD_IOMMU_H -#define _ASM_X86_AMD_IOMMU_H - -#include - -#ifdef CONFIG_AMD_IOMMU - -extern int amd_iommu_detect(void); - -#else - -static inline int amd_iommu_detect(void) { return -ENODEV; } - -#endif - -#endif /* _ASM_X86_AMD_IOMMU_H */ diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h deleted file mode 100644 index 55d95eb..0000000 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2009-2010 Advanced Micro Devices, Inc. - * Author: Joerg Roedel - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _ASM_X86_AMD_IOMMU_PROTO_H -#define _ASM_X86_AMD_IOMMU_PROTO_H - -#include - -extern int amd_iommu_init_dma_ops(void); -extern int amd_iommu_init_passthrough(void); -extern irqreturn_t amd_iommu_int_thread(int irq, void *data); -extern irqreturn_t amd_iommu_int_handler(int irq, void *data); -extern void amd_iommu_apply_erratum_63(u16 devid); -extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); -extern int amd_iommu_init_devices(void); -extern void amd_iommu_uninit_devices(void); -extern void amd_iommu_init_notifier(void); -extern void amd_iommu_init_api(void); -#ifndef CONFIG_AMD_IOMMU_STATS - -static inline void amd_iommu_stats_init(void) { } - -#endif /* !CONFIG_AMD_IOMMU_STATS */ - -static inline bool is_rd890_iommu(struct pci_dev *pdev) -{ - return (pdev->vendor == PCI_VENDOR_ID_ATI) && - (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); -} - -static inline bool iommu_feature(struct amd_iommu *iommu, u64 f) -{ - if (!(iommu->cap & (1 << IOMMU_CAP_EFR))) - return false; - - return !!(iommu->features & f); -} - -#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h deleted file mode 100644 index 4c99829..0000000 --- a/arch/x86/include/asm/amd_iommu_types.h +++ /dev/null @@ -1,580 +0,0 @@ -/* - * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. - * Author: Joerg Roedel - * Leo Duran - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _ASM_X86_AMD_IOMMU_TYPES_H -#define _ASM_X86_AMD_IOMMU_TYPES_H - -#include -#include -#include -#include - -/* - * Maximum number of IOMMUs supported - */ -#define MAX_IOMMUS 32 - -/* - * some size calculation constants - */ -#define DEV_TABLE_ENTRY_SIZE 32 -#define ALIAS_TABLE_ENTRY_SIZE 2 -#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *)) - -/* Length of the MMIO region for the AMD IOMMU */ -#define MMIO_REGION_LENGTH 0x4000 - -/* Capability offsets used by the driver */ -#define MMIO_CAP_HDR_OFFSET 0x00 -#define MMIO_RANGE_OFFSET 0x0c -#define MMIO_MISC_OFFSET 0x10 - -/* Masks, shifts and macros to parse the device range capability */ -#define MMIO_RANGE_LD_MASK 0xff000000 -#define MMIO_RANGE_FD_MASK 0x00ff0000 -#define MMIO_RANGE_BUS_MASK 0x0000ff00 -#define MMIO_RANGE_LD_SHIFT 24 -#define MMIO_RANGE_FD_SHIFT 16 -#define MMIO_RANGE_BUS_SHIFT 8 -#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT) -#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT) -#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT) -#define MMIO_MSI_NUM(x) ((x) & 0x1f) - -/* Flag masks for the AMD IOMMU exclusion range */ -#define MMIO_EXCL_ENABLE_MASK 0x01ULL -#define MMIO_EXCL_ALLOW_MASK 0x02ULL - -/* Used offsets into the MMIO space */ -#define MMIO_DEV_TABLE_OFFSET 0x0000 -#define MMIO_CMD_BUF_OFFSET 0x0008 -#define MMIO_EVT_BUF_OFFSET 0x0010 -#define MMIO_CONTROL_OFFSET 0x0018 -#define MMIO_EXCL_BASE_OFFSET 0x0020 -#define MMIO_EXCL_LIMIT_OFFSET 0x0028 -#define MMIO_EXT_FEATURES 0x0030 -#define MMIO_CMD_HEAD_OFFSET 0x2000 -#define MMIO_CMD_TAIL_OFFSET 0x2008 -#define MMIO_EVT_HEAD_OFFSET 0x2010 -#define MMIO_EVT_TAIL_OFFSET 0x2018 -#define MMIO_STATUS_OFFSET 0x2020 - - -/* Extended Feature Bits */ -#define FEATURE_PREFETCH (1ULL<<0) -#define FEATURE_PPR (1ULL<<1) -#define FEATURE_X2APIC (1ULL<<2) -#define FEATURE_NX (1ULL<<3) -#define FEATURE_GT (1ULL<<4) -#define FEATURE_IA (1ULL<<6) -#define FEATURE_GA (1ULL<<7) -#define FEATURE_HE (1ULL<<8) -#define FEATURE_PC (1ULL<<9) - -/* MMIO status bits */ -#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 - -/* event logging constants */ -#define EVENT_ENTRY_SIZE 0x10 -#define EVENT_TYPE_SHIFT 28 -#define EVENT_TYPE_MASK 0xf -#define EVENT_TYPE_ILL_DEV 0x1 -#define EVENT_TYPE_IO_FAULT 0x2 -#define EVENT_TYPE_DEV_TAB_ERR 0x3 -#define EVENT_TYPE_PAGE_TAB_ERR 0x4 -#define EVENT_TYPE_ILL_CMD 0x5 -#define EVENT_TYPE_CMD_HARD_ERR 0x6 -#define EVENT_TYPE_IOTLB_INV_TO 0x7 -#define EVENT_TYPE_INV_DEV_REQ 0x8 -#define EVENT_DEVID_MASK 0xffff -#define EVENT_DEVID_SHIFT 0 -#define EVENT_DOMID_MASK 0xffff -#define EVENT_DOMID_SHIFT 0 -#define EVENT_FLAGS_MASK 0xfff -#define EVENT_FLAGS_SHIFT 0x10 - -/* feature control bits */ -#define CONTROL_IOMMU_EN 0x00ULL -#define CONTROL_HT_TUN_EN 0x01ULL -#define CONTROL_EVT_LOG_EN 0x02ULL -#define CONTROL_EVT_INT_EN 0x03ULL -#define CONTROL_COMWAIT_EN 0x04ULL -#define CONTROL_PASSPW_EN 0x08ULL -#define CONTROL_RESPASSPW_EN 0x09ULL -#define CONTROL_COHERENT_EN 0x0aULL -#define CONTROL_ISOC_EN 0x0bULL -#define CONTROL_CMDBUF_EN 0x0cULL -#define CONTROL_PPFLOG_EN 0x0dULL -#define CONTROL_PPFINT_EN 0x0eULL - -/* command specific defines */ -#define CMD_COMPL_WAIT 0x01 -#define CMD_INV_DEV_ENTRY 0x02 -#define CMD_INV_IOMMU_PAGES 0x03 -#define CMD_INV_IOTLB_PAGES 0x04 -#define CMD_INV_ALL 0x08 - -#define CMD_COMPL_WAIT_STORE_MASK 0x01 -#define CMD_COMPL_WAIT_INT_MASK 0x02 -#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01 -#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02 - -#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL - -/* macros and definitions for device table entries */ -#define DEV_ENTRY_VALID 0x00 -#define DEV_ENTRY_TRANSLATION 0x01 -#define DEV_ENTRY_IR 0x3d -#define DEV_ENTRY_IW 0x3e -#define DEV_ENTRY_NO_PAGE_FAULT 0x62 -#define DEV_ENTRY_EX 0x67 -#define DEV_ENTRY_SYSMGT1 0x68 -#define DEV_ENTRY_SYSMGT2 0x69 -#define DEV_ENTRY_INIT_PASS 0xb8 -#define DEV_ENTRY_EINT_PASS 0xb9 -#define DEV_ENTRY_NMI_PASS 0xba -#define DEV_ENTRY_LINT0_PASS 0xbe -#define DEV_ENTRY_LINT1_PASS 0xbf -#define DEV_ENTRY_MODE_MASK 0x07 -#define DEV_ENTRY_MODE_SHIFT 0x09 - -/* constants to configure the command buffer */ -#define CMD_BUFFER_SIZE 8192 -#define CMD_BUFFER_UNINITIALIZED 1 -#define CMD_BUFFER_ENTRIES 512 -#define MMIO_CMD_SIZE_SHIFT 56 -#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) - -/* constants for event buffer handling */ -#define EVT_BUFFER_SIZE 8192 /* 512 entries */ -#define EVT_LEN_MASK (0x9ULL << 56) - -#define PAGE_MODE_NONE 0x00 -#define PAGE_MODE_1_LEVEL 0x01 -#define PAGE_MODE_2_LEVEL 0x02 -#define PAGE_MODE_3_LEVEL 0x03 -#define PAGE_MODE_4_LEVEL 0x04 -#define PAGE_MODE_5_LEVEL 0x05 -#define PAGE_MODE_6_LEVEL 0x06 - -#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9)) -#define PM_LEVEL_SIZE(x) (((x) < 6) ? \ - ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \ - (0xffffffffffffffffULL)) -#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL) -#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL) -#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \ - IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) -#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL) - -#define PM_MAP_4k 0 -#define PM_ADDR_MASK 0x000ffffffffff000ULL -#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \ - (~((1ULL << (12 + ((lvl) * 9))) - 1))) -#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr)) - -/* - * Returns the page table level to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_LEVEL(pagesize) \ - ((__ffs(pagesize) - 12) / 9) -/* - * Returns the number of ptes to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_PTE_COUNT(pagesize) \ - (1ULL << ((__ffs(pagesize) - 12) % 9)) - -/* - * Aligns a given io-virtual address to a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_ALIGN(address, pagesize) \ - ((address) & ~((pagesize) - 1)) -/* - * Creates an IOMMU PTE for an address an a given pagesize - * The PTE has no permission bits set - * Pagesize is expected to be a power-of-two larger than 4096 - */ -#define PAGE_SIZE_PTE(address, pagesize) \ - (((address) | ((pagesize) - 1)) & \ - (~(pagesize >> 1)) & PM_ADDR_MASK) - -/* - * Takes a PTE value with mode=0x07 and returns the page size it maps - */ -#define PTE_PAGE_SIZE(pte) \ - (1ULL << (1 + ffz(((pte) | 0xfffULL)))) - -#define IOMMU_PTE_P (1ULL << 0) -#define IOMMU_PTE_TV (1ULL << 1) -#define IOMMU_PTE_U (1ULL << 59) -#define IOMMU_PTE_FC (1ULL << 60) -#define IOMMU_PTE_IR (1ULL << 61) -#define IOMMU_PTE_IW (1ULL << 62) - -#define DTE_FLAG_IOTLB 0x01 - -#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) -#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) -#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) -#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) - -#define IOMMU_PROT_MASK 0x03 -#define IOMMU_PROT_IR 0x01 -#define IOMMU_PROT_IW 0x02 - -/* IOMMU capabilities */ -#define IOMMU_CAP_IOTLB 24 -#define IOMMU_CAP_NPCACHE 26 -#define IOMMU_CAP_EFR 27 - -#define MAX_DOMAIN_ID 65536 - -/* FIXME: move this macro to */ -#define PCI_BUS(x) (((x) >> 8) & 0xff) - -/* Protection domain flags */ -#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ -#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops - domain for an IOMMU */ -#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page - translation */ - -extern bool amd_iommu_dump; -#define DUMP_printk(format, arg...) \ - do { \ - if (amd_iommu_dump) \ - printk(KERN_INFO "AMD-Vi: " format, ## arg); \ - } while(0); - -/* global flag if IOMMUs cache non-present entries */ -extern bool amd_iommu_np_cache; -/* Only true if all IOMMUs support device IOTLBs */ -extern bool amd_iommu_iotlb_sup; - -/* - * Make iterating over all IOMMUs easier - */ -#define for_each_iommu(iommu) \ - list_for_each_entry((iommu), &amd_iommu_list, list) -#define for_each_iommu_safe(iommu, next) \ - list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list) - -#define APERTURE_RANGE_SHIFT 27 /* 128 MB */ -#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT) -#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT) -#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */ -#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT) -#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL) - -/* - * This structure contains generic data for IOMMU protection domains - * independent of their use. - */ -struct protection_domain { - struct list_head list; /* for list of all protection domains */ - struct list_head dev_list; /* List of all devices in this domain */ - spinlock_t lock; /* mostly used to lock the page table*/ - struct mutex api_lock; /* protect page tables in the iommu-api path */ - u16 id; /* the domain id written to the device table */ - int mode; /* paging mode (0-6 levels) */ - u64 *pt_root; /* page table root pointer */ - unsigned long flags; /* flags to find out type of domain */ - bool updated; /* complete domain flush required */ - unsigned dev_cnt; /* devices assigned to this domain */ - unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ - void *priv; /* private data */ - -}; - -/* - * This struct contains device specific data for the IOMMU - */ -struct iommu_dev_data { - struct list_head list; /* For domain->dev_list */ - struct device *dev; /* Device this data belong to */ - struct device *alias; /* The Alias Device */ - struct protection_domain *domain; /* Domain the device is bound to */ - atomic_t bind; /* Domain attach reverent count */ -}; - -/* - * For dynamic growth the aperture size is split into ranges of 128MB of - * DMA address space each. This struct represents one such range. - */ -struct aperture_range { - - /* address allocation bitmap */ - unsigned long *bitmap; - - /* - * Array of PTE pages for the aperture. In this array we save all the - * leaf pages of the domain page table used for the aperture. This way - * we don't need to walk the page table to find a specific PTE. We can - * just calculate its address in constant time. - */ - u64 *pte_pages[64]; - - unsigned long offset; -}; - -/* - * Data container for a dma_ops specific protection domain - */ -struct dma_ops_domain { - struct list_head list; - - /* generic protection domain information */ - struct protection_domain domain; - - /* size of the aperture for the mappings */ - unsigned long aperture_size; - - /* address we start to search for free addresses */ - unsigned long next_address; - - /* address space relevant data */ - struct aperture_range *aperture[APERTURE_MAX_RANGES]; - - /* This will be set to true when TLB needs to be flushed */ - bool need_flush; - - /* - * if this is a preallocated domain, keep the device for which it was - * preallocated in this variable - */ - u16 target_dev; -}; - -/* - * Structure where we save information about one hardware AMD IOMMU in the - * system. - */ -struct amd_iommu { - struct list_head list; - - /* Index within the IOMMU array */ - int index; - - /* locks the accesses to the hardware */ - spinlock_t lock; - - /* Pointer to PCI device of this IOMMU */ - struct pci_dev *dev; - - /* physical address of MMIO space */ - u64 mmio_phys; - /* virtual address of MMIO space */ - u8 *mmio_base; - - /* capabilities of that IOMMU read from ACPI */ - u32 cap; - - /* flags read from acpi table */ - u8 acpi_flags; - - /* Extended features */ - u64 features; - - /* - * Capability pointer. There could be more than one IOMMU per PCI - * device function if there are more than one AMD IOMMU capability - * pointers. - */ - u16 cap_ptr; - - /* pci domain of this IOMMU */ - u16 pci_seg; - - /* first device this IOMMU handles. read from PCI */ - u16 first_device; - /* last device this IOMMU handles. read from PCI */ - u16 last_device; - - /* start of exclusion range of that IOMMU */ - u64 exclusion_start; - /* length of exclusion range of that IOMMU */ - u64 exclusion_length; - - /* command buffer virtual address */ - u8 *cmd_buf; - /* size of command buffer */ - u32 cmd_buf_size; - - /* size of event buffer */ - u32 evt_buf_size; - /* event buffer virtual address */ - u8 *evt_buf; - /* MSI number for event interrupt */ - u16 evt_msi_num; - - /* true if interrupts for this IOMMU are already enabled */ - bool int_enabled; - - /* if one, we need to send a completion wait command */ - bool need_sync; - - /* default dma_ops domain for that IOMMU */ - struct dma_ops_domain *default_dom; - - /* - * We can't rely on the BIOS to restore all values on reinit, so we - * need to stash them - */ - - /* The iommu BAR */ - u32 stored_addr_lo; - u32 stored_addr_hi; - - /* - * Each iommu has 6 l1s, each of which is documented as having 0x12 - * registers - */ - u32 stored_l1[6][0x12]; - - /* The l2 indirect registers */ - u32 stored_l2[0x83]; -}; - -/* - * List with all IOMMUs in the system. This list is not locked because it is - * only written and read at driver initialization or suspend time - */ -extern struct list_head amd_iommu_list; - -/* - * Array with pointers to each IOMMU struct - * The indices are referenced in the protection domains - */ -extern struct amd_iommu *amd_iommus[MAX_IOMMUS]; - -/* Number of IOMMUs present in the system */ -extern int amd_iommus_present; - -/* - * Declarations for the global list of all protection domains - */ -extern spinlock_t amd_iommu_pd_lock; -extern struct list_head amd_iommu_pd_list; - -/* - * Structure defining one entry in the device table - */ -struct dev_table_entry { - u32 data[8]; -}; - -/* - * One entry for unity mappings parsed out of the ACPI table. - */ -struct unity_map_entry { - struct list_head list; - - /* starting device id this entry is used for (including) */ - u16 devid_start; - /* end device id this entry is used for (including) */ - u16 devid_end; - - /* start address to unity map (including) */ - u64 address_start; - /* end address to unity map (including) */ - u64 address_end; - - /* required protection */ - int prot; -}; - -/* - * List of all unity mappings. It is not locked because as runtime it is only - * read. It is created at ACPI table parsing time. - */ -extern struct list_head amd_iommu_unity_map; - -/* - * Data structures for device handling - */ - -/* - * Device table used by hardware. Read and write accesses by software are - * locked with the amd_iommu_pd_table lock. - */ -extern struct dev_table_entry *amd_iommu_dev_table; - -/* - * Alias table to find requestor ids to device ids. Not locked because only - * read on runtime. - */ -extern u16 *amd_iommu_alias_table; - -/* - * Reverse lookup table to find the IOMMU which translates a specific device. - */ -extern struct amd_iommu **amd_iommu_rlookup_table; - -/* size of the dma_ops aperture as power of 2 */ -extern unsigned amd_iommu_aperture_order; - -/* largest PCI device id we expect translation requests for */ -extern u16 amd_iommu_last_bdf; - -/* allocation bitmap for domain ids */ -extern unsigned long *amd_iommu_pd_alloc_bitmap; - -/* - * If true, the addresses will be flushed on unmap time, not when - * they are reused - */ -extern bool amd_iommu_unmap_flush; - -/* takes bus and device/function and returns the device id - * FIXME: should that be in generic PCI code? */ -static inline u16 calc_devid(u8 bus, u8 devfn) -{ - return (((u16)bus) << 8) | devfn; -} - -#ifdef CONFIG_AMD_IOMMU_STATS - -struct __iommu_counter { - char *name; - struct dentry *dent; - u64 value; -}; - -#define DECLARE_STATS_COUNTER(nm) \ - static struct __iommu_counter nm = { \ - .name = #nm, \ - } - -#define INC_STATS_COUNTER(name) name.value += 1 -#define ADD_STATS_COUNTER(name, x) name.value += (x) -#define SUB_STATS_COUNTER(name, x) name.value -= (x) - -#else /* CONFIG_AMD_IOMMU_STATS */ - -#define DECLARE_STATS_COUNTER(name) -#define INC_STATS_COUNTER(name) -#define ADD_STATS_COUNTER(name, x) -#define SUB_STATS_COUNTER(name, x) - -#endif /* CONFIG_AMD_IOMMU_STATS */ - -#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index aef0298..11817ff 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -123,7 +123,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o - obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c deleted file mode 100644 index bfc8453..0000000 --- a/arch/x86/kernel/amd_iommu_init.c +++ /dev/null @@ -1,1572 +0,0 @@ -/* - * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. - * Author: Joerg Roedel - * Leo Duran - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -/* - * definitions for the ACPI scanning code - */ -#define IVRS_HEADER_LENGTH 48 - -#define ACPI_IVHD_TYPE 0x10 -#define ACPI_IVMD_TYPE_ALL 0x20 -#define ACPI_IVMD_TYPE 0x21 -#define ACPI_IVMD_TYPE_RANGE 0x22 - -#define IVHD_DEV_ALL 0x01 -#define IVHD_DEV_SELECT 0x02 -#define IVHD_DEV_SELECT_RANGE_START 0x03 -#define IVHD_DEV_RANGE_END 0x04 -#define IVHD_DEV_ALIAS 0x42 -#define IVHD_DEV_ALIAS_RANGE 0x43 -#define IVHD_DEV_EXT_SELECT 0x46 -#define IVHD_DEV_EXT_SELECT_RANGE 0x47 - -#define IVHD_FLAG_HT_TUN_EN_MASK 0x01 -#define IVHD_FLAG_PASSPW_EN_MASK 0x02 -#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04 -#define IVHD_FLAG_ISOC_EN_MASK 0x08 - -#define IVMD_FLAG_EXCL_RANGE 0x08 -#define IVMD_FLAG_UNITY_MAP 0x01 - -#define ACPI_DEVFLAG_INITPASS 0x01 -#define ACPI_DEVFLAG_EXTINT 0x02 -#define ACPI_DEVFLAG_NMI 0x04 -#define ACPI_DEVFLAG_SYSMGT1 0x10 -#define ACPI_DEVFLAG_SYSMGT2 0x20 -#define ACPI_DEVFLAG_LINT0 0x40 -#define ACPI_DEVFLAG_LINT1 0x80 -#define ACPI_DEVFLAG_ATSDIS 0x10000000 - -/* - * ACPI table definitions - * - * These data structures are laid over the table to parse the important values - * out of it. - */ - -/* - * structure describing one IOMMU in the ACPI table. Typically followed by one - * or more ivhd_entrys. - */ -struct ivhd_header { - u8 type; - u8 flags; - u16 length; - u16 devid; - u16 cap_ptr; - u64 mmio_phys; - u16 pci_seg; - u16 info; - u32 reserved; -} __attribute__((packed)); - -/* - * A device entry describing which devices a specific IOMMU translates and - * which requestor ids they use. - */ -struct ivhd_entry { - u8 type; - u16 devid; - u8 flags; - u32 ext; -} __attribute__((packed)); - -/* - * An AMD IOMMU memory definition structure. It defines things like exclusion - * ranges for devices and regions that should be unity mapped. - */ -struct ivmd_header { - u8 type; - u8 flags; - u16 length; - u16 devid; - u16 aux; - u64 resv; - u64 range_start; - u64 range_length; -} __attribute__((packed)); - -bool amd_iommu_dump; - -static int __initdata amd_iommu_detected; -static bool __initdata amd_iommu_disabled; - -u16 amd_iommu_last_bdf; /* largest PCI device id we have - to handle */ -LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings - we find in ACPI */ -bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ - -LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the - system */ - -/* Array to assign indices to IOMMUs*/ -struct amd_iommu *amd_iommus[MAX_IOMMUS]; -int amd_iommus_present; - -/* IOMMUs have a non-present cache? */ -bool amd_iommu_np_cache __read_mostly; -bool amd_iommu_iotlb_sup __read_mostly = true; - -/* - * The ACPI table parsing functions set this variable on an error - */ -static int __initdata amd_iommu_init_err; - -/* - * List of protection domains - used during resume - */ -LIST_HEAD(amd_iommu_pd_list); -spinlock_t amd_iommu_pd_lock; - -/* - * Pointer to the device table which is shared by all AMD IOMMUs - * it is indexed by the PCI device id or the HT unit id and contains - * information about the domain the device belongs to as well as the - * page table root pointer. - */ -struct dev_table_entry *amd_iommu_dev_table; - -/* - * The alias table is a driver specific data structure which contains the - * mappings of the PCI device ids to the actual requestor ids on the IOMMU. - * More than one device can share the same requestor id. - */ -u16 *amd_iommu_alias_table; - -/* - * The rlookup table is used to find the IOMMU which is responsible - * for a specific device. It is also indexed by the PCI device id. - */ -struct amd_iommu **amd_iommu_rlookup_table; - -/* - * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap - * to know which ones are already in use. - */ -unsigned long *amd_iommu_pd_alloc_bitmap; - -static u32 dev_table_size; /* size of the device table */ -static u32 alias_table_size; /* size of the alias table */ -static u32 rlookup_table_size; /* size if the rlookup table */ - -/* - * This function flushes all internal caches of - * the IOMMU used by this driver. - */ -extern void iommu_flush_all_caches(struct amd_iommu *iommu); - -static inline void update_last_devid(u16 devid) -{ - if (devid > amd_iommu_last_bdf) - amd_iommu_last_bdf = devid; -} - -static inline unsigned long tbl_size(int entry_size) -{ - unsigned shift = PAGE_SHIFT + - get_order(((int)amd_iommu_last_bdf + 1) * entry_size); - - return 1UL << shift; -} - -/* Access to l1 and l2 indexed register spaces */ - -static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address) -{ - u32 val; - - pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); - pci_read_config_dword(iommu->dev, 0xfc, &val); - return val; -} - -static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val) -{ - pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31)); - pci_write_config_dword(iommu->dev, 0xfc, val); - pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); -} - -static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address) -{ - u32 val; - - pci_write_config_dword(iommu->dev, 0xf0, address); - pci_read_config_dword(iommu->dev, 0xf4, &val); - return val; -} - -static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val) -{ - pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8)); - pci_write_config_dword(iommu->dev, 0xf4, val); -} - -/**************************************************************************** - * - * AMD IOMMU MMIO register space handling functions - * - * These functions are used to program the IOMMU device registers in - * MMIO space required for that driver. - * - ****************************************************************************/ - -/* - * This function set the exclusion range in the IOMMU. DMA accesses to the - * exclusion range are passed through untranslated - */ -static void iommu_set_exclusion_range(struct amd_iommu *iommu) -{ - u64 start = iommu->exclusion_start & PAGE_MASK; - u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; - u64 entry; - - if (!iommu->exclusion_start) - return; - - entry = start | MMIO_EXCL_ENABLE_MASK; - memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET, - &entry, sizeof(entry)); - - entry = limit; - memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET, - &entry, sizeof(entry)); -} - -/* Programs the physical address of the device table into the IOMMU hardware */ -static void __init iommu_set_device_table(struct amd_iommu *iommu) -{ - u64 entry; - - BUG_ON(iommu->mmio_base == NULL); - - entry = virt_to_phys(amd_iommu_dev_table); - entry |= (dev_table_size >> 12) - 1; - memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, - &entry, sizeof(entry)); -} - -/* Generic functions to enable/disable certain features of the IOMMU. */ -static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) -{ - u32 ctrl; - - ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); - ctrl |= (1 << bit); - writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); -} - -static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) -{ - u32 ctrl; - - ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); - ctrl &= ~(1 << bit); - writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); -} - -/* Function to enable the hardware */ -static void iommu_enable(struct amd_iommu *iommu) -{ - static const char * const feat_str[] = { - "PreF", "PPR", "X2APIC", "NX", "GT", "[5]", - "IA", "GA", "HE", "PC", NULL - }; - int i; - - printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx", - dev_name(&iommu->dev->dev), iommu->cap_ptr); - - if (iommu->cap & (1 << IOMMU_CAP_EFR)) { - printk(KERN_CONT " extended features: "); - for (i = 0; feat_str[i]; ++i) - if (iommu_feature(iommu, (1ULL << i))) - printk(KERN_CONT " %s", feat_str[i]); - } - printk(KERN_CONT "\n"); - - iommu_feature_enable(iommu, CONTROL_IOMMU_EN); -} - -static void iommu_disable(struct amd_iommu *iommu) -{ - /* Disable command buffer */ - iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); - - /* Disable event logging and event interrupts */ - iommu_feature_disable(iommu, CONTROL_EVT_INT_EN); - iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN); - - /* Disable IOMMU hardware itself */ - iommu_feature_disable(iommu, CONTROL_IOMMU_EN); -} - -/* - * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in - * the system has one. - */ -static u8 * __init iommu_map_mmio_space(u64 address) -{ - u8 *ret; - - if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) { - pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n", - address); - pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n"); - return NULL; - } - - ret = ioremap_nocache(address, MMIO_REGION_LENGTH); - if (ret != NULL) - return ret; - - release_mem_region(address, MMIO_REGION_LENGTH); - - return NULL; -} - -static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) -{ - if (iommu->mmio_base) - iounmap(iommu->mmio_base); - release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); -} - -/**************************************************************************** - * - * The functions below belong to the first pass of AMD IOMMU ACPI table - * parsing. In this pass we try to find out the highest device id this - * code has to handle. Upon this information the size of the shared data - * structures is determined later. - * - ****************************************************************************/ - -/* - * This function calculates the length of a given IVHD entry - */ -static inline int ivhd_entry_length(u8 *ivhd) -{ - return 0x04 << (*ivhd >> 6); -} - -/* - * This function reads the last device id the IOMMU has to handle from the PCI - * capability header for this IOMMU - */ -static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) -{ - u32 cap; - - cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); - - return 0; -} - -/* - * After reading the highest device id from the IOMMU PCI capability header - * this function looks if there is a higher device id defined in the ACPI table - */ -static int __init find_last_devid_from_ivhd(struct ivhd_header *h) -{ - u8 *p = (void *)h, *end = (void *)h; - struct ivhd_entry *dev; - - p += sizeof(*h); - end += h->length; - - find_last_devid_on_pci(PCI_BUS(h->devid), - PCI_SLOT(h->devid), - PCI_FUNC(h->devid), - h->cap_ptr); - - while (p < end) { - dev = (struct ivhd_entry *)p; - switch (dev->type) { - case IVHD_DEV_SELECT: - case IVHD_DEV_RANGE_END: - case IVHD_DEV_ALIAS: - case IVHD_DEV_EXT_SELECT: - /* all the above subfield types refer to device ids */ - update_last_devid(dev->devid); - break; - default: - break; - } - p += ivhd_entry_length(p); - } - - WARN_ON(p != end); - - return 0; -} - -/* - * Iterate over all IVHD entries in the ACPI table and find the highest device - * id which we need to handle. This is the first of three functions which parse - * the ACPI table. So we check the checksum here. - */ -static int __init find_last_devid_acpi(struct acpi_table_header *table) -{ - int i; - u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table; - struct ivhd_header *h; - - /* - * Validate checksum here so we don't need to do it when - * we actually parse the table - */ - for (i = 0; i < table->length; ++i) - checksum += p[i]; - if (checksum != 0) { - /* ACPI table corrupt */ - amd_iommu_init_err = -ENODEV; - return 0; - } - - p += IVRS_HEADER_LENGTH; - - end += table->length; - while (p < end) { - h = (struct ivhd_header *)p; - switch (h->type) { - case ACPI_IVHD_TYPE: - find_last_devid_from_ivhd(h); - break; - default: - break; - } - p += h->length; - } - WARN_ON(p != end); - - return 0; -} - -/**************************************************************************** - * - * The following functions belong the the code path which parses the ACPI table - * the second time. In this ACPI parsing iteration we allocate IOMMU specific - * data structures, initialize the device/alias/rlookup table and also - * basically initialize the hardware. - * - ****************************************************************************/ - -/* - * Allocates the command buffer. This buffer is per AMD IOMMU. We can - * write commands to that buffer later and the IOMMU will execute them - * asynchronously - */ -static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) -{ - u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(CMD_BUFFER_SIZE)); - - if (cmd_buf == NULL) - return NULL; - - iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED; - - return cmd_buf; -} - -/* - * This function resets the command buffer if the IOMMU stopped fetching - * commands from it. - */ -void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu) -{ - iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); - - writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); - writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - - iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); -} - -/* - * This function writes the command buffer address to the hardware and - * enables it. - */ -static void iommu_enable_command_buffer(struct amd_iommu *iommu) -{ - u64 entry; - - BUG_ON(iommu->cmd_buf == NULL); - - entry = (u64)virt_to_phys(iommu->cmd_buf); - entry |= MMIO_CMD_SIZE_512; - - memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, - &entry, sizeof(entry)); - - amd_iommu_reset_cmd_buffer(iommu); - iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED); -} - -static void __init free_command_buffer(struct amd_iommu *iommu) -{ - free_pages((unsigned long)iommu->cmd_buf, - get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED))); -} - -/* allocates the memory where the IOMMU will log its events to */ -static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) -{ - iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(EVT_BUFFER_SIZE)); - - if (iommu->evt_buf == NULL) - return NULL; - - iommu->evt_buf_size = EVT_BUFFER_SIZE; - - return iommu->evt_buf; -} - -static void iommu_enable_event_buffer(struct amd_iommu *iommu) -{ - u64 entry; - - BUG_ON(iommu->evt_buf == NULL); - - entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; - - memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, - &entry, sizeof(entry)); - - /* set head and tail to zero manually */ - writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); - writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); - - iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); -} - -static void __init free_event_buffer(struct amd_iommu *iommu) -{ - free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE)); -} - -/* sets a specific bit in the device table entry. */ -static void set_dev_entry_bit(u16 devid, u8 bit) -{ - int i = (bit >> 5) & 0x07; - int _bit = bit & 0x1f; - - amd_iommu_dev_table[devid].data[i] |= (1 << _bit); -} - -static int get_dev_entry_bit(u16 devid, u8 bit) -{ - int i = (bit >> 5) & 0x07; - int _bit = bit & 0x1f; - - return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit; -} - - -void amd_iommu_apply_erratum_63(u16 devid) -{ - int sysmgt; - - sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) | - (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1); - - if (sysmgt == 0x01) - set_dev_entry_bit(devid, DEV_ENTRY_IW); -} - -/* Writes the specific IOMMU for a device into the rlookup table */ -static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) -{ - amd_iommu_rlookup_table[devid] = iommu; -} - -/* - * This function takes the device specific flags read from the ACPI - * table and sets up the device table entry with that information - */ -static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, - u16 devid, u32 flags, u32 ext_flags) -{ - if (flags & ACPI_DEVFLAG_INITPASS) - set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); - if (flags & ACPI_DEVFLAG_EXTINT) - set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS); - if (flags & ACPI_DEVFLAG_NMI) - set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS); - if (flags & ACPI_DEVFLAG_SYSMGT1) - set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1); - if (flags & ACPI_DEVFLAG_SYSMGT2) - set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2); - if (flags & ACPI_DEVFLAG_LINT0) - set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); - if (flags & ACPI_DEVFLAG_LINT1) - set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); - - amd_iommu_apply_erratum_63(devid); - - set_iommu_for_device(iommu, devid); -} - -/* - * Reads the device exclusion range from ACPI and initialize IOMMU with - * it - */ -static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) -{ - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; - - if (!(m->flags & IVMD_FLAG_EXCL_RANGE)) - return; - - if (iommu) { - /* - * We only can configure exclusion ranges per IOMMU, not - * per device. But we can enable the exclusion range per - * device. This is done here - */ - set_dev_entry_bit(m->devid, DEV_ENTRY_EX); - iommu->exclusion_start = m->range_start; - iommu->exclusion_length = m->range_length; - } -} - -/* - * This function reads some important data from the IOMMU PCI space and - * initializes the driver data structure with it. It reads the hardware - * capabilities and the first/last device entries - */ -static void __init init_iommu_from_pci(struct amd_iommu *iommu) -{ - int cap_ptr = iommu->cap_ptr; - u32 range, misc, low, high; - int i, j; - - pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, - &iommu->cap); - pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET, - &range); - pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET, - &misc); - - iommu->first_device = calc_devid(MMIO_GET_BUS(range), - MMIO_GET_FD(range)); - iommu->last_device = calc_devid(MMIO_GET_BUS(range), - MMIO_GET_LD(range)); - iommu->evt_msi_num = MMIO_MSI_NUM(misc); - - if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB))) - amd_iommu_iotlb_sup = false; - - /* read extended feature bits */ - low = readl(iommu->mmio_base + MMIO_EXT_FEATURES); - high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4); - - iommu->features = ((u64)high << 32) | low; - - if (!is_rd890_iommu(iommu->dev)) - return; - - /* - * Some rd890 systems may not be fully reconfigured by the BIOS, so - * it's necessary for us to store this information so it can be - * reprogrammed on resume - */ - - pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4, - &iommu->stored_addr_lo); - pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8, - &iommu->stored_addr_hi); - - /* Low bit locks writes to configuration space */ - iommu->stored_addr_lo &= ~1; - - for (i = 0; i < 6; i++) - for (j = 0; j < 0x12; j++) - iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j); - - for (i = 0; i < 0x83; i++) - iommu->stored_l2[i] = iommu_read_l2(iommu, i); -} - -/* - * Takes a pointer to an AMD IOMMU entry in the ACPI table and - * initializes the hardware and our data structures with it. - */ -static void __init init_iommu_from_acpi(struct amd_iommu *iommu, - struct ivhd_header *h) -{ - u8 *p = (u8 *)h; - u8 *end = p, flags = 0; - u16 devid = 0, devid_start = 0, devid_to = 0; - u32 dev_i, ext_flags = 0; - bool alias = false; - struct ivhd_entry *e; - - /* - * First save the recommended feature enable bits from ACPI - */ - iommu->acpi_flags = h->flags; - - /* - * Done. Now parse the device entries - */ - p += sizeof(struct ivhd_header); - end += h->length; - - - while (p < end) { - e = (struct ivhd_entry *)p; - switch (e->type) { - case IVHD_DEV_ALL: - - DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x" - " last device %02x:%02x.%x flags: %02x\n", - PCI_BUS(iommu->first_device), - PCI_SLOT(iommu->first_device), - PCI_FUNC(iommu->first_device), - PCI_BUS(iommu->last_device), - PCI_SLOT(iommu->last_device), - PCI_FUNC(iommu->last_device), - e->flags); - - for (dev_i = iommu->first_device; - dev_i <= iommu->last_device; ++dev_i) - set_dev_entry_from_acpi(iommu, dev_i, - e->flags, 0); - break; - case IVHD_DEV_SELECT: - - DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x " - "flags: %02x\n", - PCI_BUS(e->devid), - PCI_SLOT(e->devid), - PCI_FUNC(e->devid), - e->flags); - - devid = e->devid; - set_dev_entry_from_acpi(iommu, devid, e->flags, 0); - break; - case IVHD_DEV_SELECT_RANGE_START: - - DUMP_printk(" DEV_SELECT_RANGE_START\t " - "devid: %02x:%02x.%x flags: %02x\n", - PCI_BUS(e->devid), - PCI_SLOT(e->devid), - PCI_FUNC(e->devid), - e->flags); - - devid_start = e->devid; - flags = e->flags; - ext_flags = 0; - alias = false; - break; - case IVHD_DEV_ALIAS: - - DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x " - "flags: %02x devid_to: %02x:%02x.%x\n", - PCI_BUS(e->devid), - PCI_SLOT(e->devid), - PCI_FUNC(e->devid), - e->flags, - PCI_BUS(e->ext >> 8), - PCI_SLOT(e->ext >> 8), - PCI_FUNC(e->ext >> 8)); - - devid = e->devid; - devid_to = e->ext >> 8; - set_dev_entry_from_acpi(iommu, devid , e->flags, 0); - set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); - amd_iommu_alias_table[devid] = devid_to; - break; - case IVHD_DEV_ALIAS_RANGE: - - DUMP_printk(" DEV_ALIAS_RANGE\t\t " - "devid: %02x:%02x.%x flags: %02x " - "devid_to: %02x:%02x.%x\n", - PCI_BUS(e->devid), - PCI_SLOT(e->devid), - PCI_FUNC(e->devid), - e->flags, - PCI_BUS(e->ext >> 8), - PCI_SLOT(e->ext >> 8), - PCI_FUNC(e->ext >> 8)); - - devid_start = e->devid; - flags = e->flags; - devid_to = e->ext >> 8; - ext_flags = 0; - alias = true; - break; - case IVHD_DEV_EXT_SELECT: - - DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x " - "flags: %02x ext: %08x\n", - PCI_BUS(e->devid), - PCI_SLOT(e->devid), - PCI_FUNC(e->devid), - e->flags, e->ext); - - devid = e->devid; - set_dev_entry_from_acpi(iommu, devid, e->flags, - e->ext); - break; - case IVHD_DEV_EXT_SELECT_RANGE: - - DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: " - "%02x:%02x.%x flags: %02x ext: %08x\n", - PCI_BUS(e->devid), - PCI_SLOT(e->devid), - PCI_FUNC(e->devid), - e->flags, e->ext); - - devid_start = e->devid; - flags = e->flags; - ext_flags = e->ext; - alias = false; - break; - case IVHD_DEV_RANGE_END: - - DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n", - PCI_BUS(e->devid), - PCI_SLOT(e->devid), - PCI_FUNC(e->devid)); - - devid = e->devid; - for (dev_i = devid_start; dev_i <= devid; ++dev_i) { - if (alias) { - amd_iommu_alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi(iommu, - devid_to, flags, ext_flags); - } - set_dev_entry_from_acpi(iommu, dev_i, - flags, ext_flags); - } - break; - default: - break; - } - - p += ivhd_entry_length(p); - } -} - -/* Initializes the device->iommu mapping for the driver */ -static int __init init_iommu_devices(struct amd_iommu *iommu) -{ - u32 i; - - for (i = iommu->first_device; i <= iommu->last_device; ++i) - set_iommu_for_device(iommu, i); - - return 0; -} - -static void __init free_iommu_one(struct amd_iommu *iommu) -{ - free_command_buffer(iommu); - free_event_buffer(iommu); - iommu_unmap_mmio_space(iommu); -} - -static void __init free_iommu_all(void) -{ - struct amd_iommu *iommu, *next; - - for_each_iommu_safe(iommu, next) { - list_del(&iommu->list); - free_iommu_one(iommu); - kfree(iommu); - } -} - -/* - * This function clues the initialization function for one IOMMU - * together and also allocates the command buffer and programs the - * hardware. It does NOT enable the IOMMU. This is done afterwards. - */ -static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) -{ - spin_lock_init(&iommu->lock); - - /* Add IOMMU to internal data structures */ - list_add_tail(&iommu->list, &amd_iommu_list); - iommu->index = amd_iommus_present++; - - if (unlikely(iommu->index >= MAX_IOMMUS)) { - WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n"); - return -ENOSYS; - } - - /* Index is fine - add IOMMU to the array */ - amd_iommus[iommu->index] = iommu; - - /* - * Copy data from ACPI table entry to the iommu struct - */ - iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff); - if (!iommu->dev) - return 1; - - iommu->cap_ptr = h->cap_ptr; - iommu->pci_seg = h->pci_seg; - iommu->mmio_phys = h->mmio_phys; - iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); - if (!iommu->mmio_base) - return -ENOMEM; - - iommu->cmd_buf = alloc_command_buffer(iommu); - if (!iommu->cmd_buf) - return -ENOMEM; - - iommu->evt_buf = alloc_event_buffer(iommu); - if (!iommu->evt_buf) - return -ENOMEM; - - iommu->int_enabled = false; - - init_iommu_from_pci(iommu); - init_iommu_from_acpi(iommu, h); - init_iommu_devices(iommu); - - if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) - amd_iommu_np_cache = true; - - return pci_enable_device(iommu->dev); -} - -/* - * Iterates over all IOMMU entries in the ACPI table, allocates the - * IOMMU structure and initializes it with init_iommu_one() - */ -static int __init init_iommu_all(struct acpi_table_header *table) -{ - u8 *p = (u8 *)table, *end = (u8 *)table; - struct ivhd_header *h; - struct amd_iommu *iommu; - int ret; - - end += table->length; - p += IVRS_HEADER_LENGTH; - - while (p < end) { - h = (struct ivhd_header *)p; - switch (*p) { - case ACPI_IVHD_TYPE: - - DUMP_printk("device: %02x:%02x.%01x cap: %04x " - "seg: %d flags: %01x info %04x\n", - PCI_BUS(h->devid), PCI_SLOT(h->devid), - PCI_FUNC(h->devid), h->cap_ptr, - h->pci_seg, h->flags, h->info); - DUMP_printk(" mmio-addr: %016llx\n", - h->mmio_phys); - - iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); - if (iommu == NULL) { - amd_iommu_init_err = -ENOMEM; - return 0; - } - - ret = init_iommu_one(iommu, h); - if (ret) { - amd_iommu_init_err = ret; - return 0; - } - break; - default: - break; - } - p += h->length; - - } - WARN_ON(p != end); - - return 0; -} - -/**************************************************************************** - * - * The following functions initialize the MSI interrupts for all IOMMUs - * in the system. Its a bit challenging because there could be multiple - * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per - * pci_dev. - * - ****************************************************************************/ - -static int iommu_setup_msi(struct amd_iommu *iommu) -{ - int r; - - if (pci_enable_msi(iommu->dev)) - return 1; - - r = request_threaded_irq(iommu->dev->irq, - amd_iommu_int_handler, - amd_iommu_int_thread, - 0, "AMD-Vi", - iommu->dev); - - if (r) { - pci_disable_msi(iommu->dev); - return 1; - } - - iommu->int_enabled = true; - iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); - - return 0; -} - -static int iommu_init_msi(struct amd_iommu *iommu) -{ - if (iommu->int_enabled) - return 0; - - if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) - return iommu_setup_msi(iommu); - - return 1; -} - -/**************************************************************************** - * - * The next functions belong to the third pass of parsing the ACPI - * table. In this last pass the memory mapping requirements are - * gathered (like exclusion and unity mapping reanges). - * - ****************************************************************************/ - -static void __init free_unity_maps(void) -{ - struct unity_map_entry *entry, *next; - - list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) { - list_del(&entry->list); - kfree(entry); - } -} - -/* called when we find an exclusion range definition in ACPI */ -static int __init init_exclusion_range(struct ivmd_header *m) -{ - int i; - - switch (m->type) { - case ACPI_IVMD_TYPE: - set_device_exclusion_range(m->devid, m); - break; - case ACPI_IVMD_TYPE_ALL: - for (i = 0; i <= amd_iommu_last_bdf; ++i) - set_device_exclusion_range(i, m); - break; - case ACPI_IVMD_TYPE_RANGE: - for (i = m->devid; i <= m->aux; ++i) - set_device_exclusion_range(i, m); - break; - default: - break; - } - - return 0; -} - -/* called for unity map ACPI definition */ -static int __init init_unity_map_range(struct ivmd_header *m) -{ - struct unity_map_entry *e = 0; - char *s; - - e = kzalloc(sizeof(*e), GFP_KERNEL); - if (e == NULL) - return -ENOMEM; - - switch (m->type) { - default: - kfree(e); - return 0; - case ACPI_IVMD_TYPE: - s = "IVMD_TYPEi\t\t\t"; - e->devid_start = e->devid_end = m->devid; - break; - case ACPI_IVMD_TYPE_ALL: - s = "IVMD_TYPE_ALL\t\t"; - e->devid_start = 0; - e->devid_end = amd_iommu_last_bdf; - break; - case ACPI_IVMD_TYPE_RANGE: - s = "IVMD_TYPE_RANGE\t\t"; - e->devid_start = m->devid; - e->devid_end = m->aux; - break; - } - e->address_start = PAGE_ALIGN(m->range_start); - e->address_end = e->address_start + PAGE_ALIGN(m->range_length); - e->prot = m->flags >> 1; - - DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x" - " range_start: %016llx range_end: %016llx flags: %x\n", s, - PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start), - PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end), - PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end), - e->address_start, e->address_end, m->flags); - - list_add_tail(&e->list, &amd_iommu_unity_map); - - return 0; -} - -/* iterates over all memory definitions we find in the ACPI table */ -static int __init init_memory_definitions(struct acpi_table_header *table) -{ - u8 *p = (u8 *)table, *end = (u8 *)table; - struct ivmd_header *m; - - end += table->length; - p += IVRS_HEADER_LENGTH; - - while (p < end) { - m = (struct ivmd_header *)p; - if (m->flags & IVMD_FLAG_EXCL_RANGE) - init_exclusion_range(m); - else if (m->flags & IVMD_FLAG_UNITY_MAP) - init_unity_map_range(m); - - p += m->length; - } - - return 0; -} - -/* - * Init the device table to not allow DMA access for devices and - * suppress all page faults - */ -static void init_device_table(void) -{ - u32 devid; - - for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { - set_dev_entry_bit(devid, DEV_ENTRY_VALID); - set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); - } -} - -static void iommu_init_flags(struct amd_iommu *iommu) -{ - iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : - iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); - - iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : - iommu_feature_disable(iommu, CONTROL_PASSPW_EN); - - iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : - iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); - - iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_ISOC_EN) : - iommu_feature_disable(iommu, CONTROL_ISOC_EN); - - /* - * make IOMMU memory accesses cache coherent - */ - iommu_feature_enable(iommu, CONTROL_COHERENT_EN); -} - -static void iommu_apply_resume_quirks(struct amd_iommu *iommu) -{ - int i, j; - u32 ioc_feature_control; - struct pci_dev *pdev = NULL; - - /* RD890 BIOSes may not have completely reconfigured the iommu */ - if (!is_rd890_iommu(iommu->dev)) - return; - - /* - * First, we need to ensure that the iommu is enabled. This is - * controlled by a register in the northbridge - */ - pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0)); - - if (!pdev) - return; - - /* Select Northbridge indirect register 0x75 and enable writing */ - pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); - pci_read_config_dword(pdev, 0x64, &ioc_feature_control); - - /* Enable the iommu */ - if (!(ioc_feature_control & 0x1)) - pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); - - pci_dev_put(pdev); - - /* Restore the iommu BAR */ - pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, - iommu->stored_addr_lo); - pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8, - iommu->stored_addr_hi); - - /* Restore the l1 indirect regs for each of the 6 l1s */ - for (i = 0; i < 6; i++) - for (j = 0; j < 0x12; j++) - iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]); - - /* Restore the l2 indirect regs */ - for (i = 0; i < 0x83; i++) - iommu_write_l2(iommu, i, iommu->stored_l2[i]); - - /* Lock PCI setup registers */ - pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, - iommu->stored_addr_lo | 1); -} - -/* - * This function finally enables all IOMMUs found in the system after - * they have been initialized - */ -static void enable_iommus(void) -{ - struct amd_iommu *iommu; - - for_each_iommu(iommu) { - iommu_disable(iommu); - iommu_init_flags(iommu); - iommu_set_device_table(iommu); - iommu_enable_command_buffer(iommu); - iommu_enable_event_buffer(iommu); - iommu_set_exclusion_range(iommu); - iommu_init_msi(iommu); - iommu_enable(iommu); - iommu_flush_all_caches(iommu); - } -} - -static void disable_iommus(void) -{ - struct amd_iommu *iommu; - - for_each_iommu(iommu) - iommu_disable(iommu); -} - -/* - * Suspend/Resume support - * disable suspend until real resume implemented - */ - -static void amd_iommu_resume(void) -{ - struct amd_iommu *iommu; - - for_each_iommu(iommu) - iommu_apply_resume_quirks(iommu); - - /* re-load the hardware */ - enable_iommus(); - - /* - * we have to flush after the IOMMUs are enabled because a - * disabled IOMMU will never execute the commands we send - */ - for_each_iommu(iommu) - iommu_flush_all_caches(iommu); -} - -static int amd_iommu_suspend(void) -{ - /* disable IOMMUs to go out of the way for BIOS */ - disable_iommus(); - - return 0; -} - -static struct syscore_ops amd_iommu_syscore_ops = { - .suspend = amd_iommu_suspend, - .resume = amd_iommu_resume, -}; - -/* - * This is the core init function for AMD IOMMU hardware in the system. - * This function is called from the generic x86 DMA layer initialization - * code. - * - * This function basically parses the ACPI table for AMD IOMMU (IVRS) - * three times: - * - * 1 pass) Find the highest PCI device id the driver has to handle. - * Upon this information the size of the data structures is - * determined that needs to be allocated. - * - * 2 pass) Initialize the data structures just allocated with the - * information in the ACPI table about available AMD IOMMUs - * in the system. It also maps the PCI devices in the - * system to specific IOMMUs - * - * 3 pass) After the basic data structures are allocated and - * initialized we update them with information about memory - * remapping requirements parsed out of the ACPI table in - * this last pass. - * - * After that the hardware is initialized and ready to go. In the last - * step we do some Linux specific things like registering the driver in - * the dma_ops interface and initializing the suspend/resume support - * functions. Finally it prints some information about AMD IOMMUs and - * the driver state and enables the hardware. - */ -static int __init amd_iommu_init(void) -{ - int i, ret = 0; - - /* - * First parse ACPI tables to find the largest Bus/Dev/Func - * we need to handle. Upon this information the shared data - * structures for the IOMMUs in the system will be allocated - */ - if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) - return -ENODEV; - - ret = amd_iommu_init_err; - if (ret) - goto out; - - dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); - alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); - rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); - - ret = -ENOMEM; - - /* Device table - directly used by all IOMMUs */ - amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(dev_table_size)); - if (amd_iommu_dev_table == NULL) - goto out; - - /* - * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the - * IOMMU see for that device - */ - amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL, - get_order(alias_table_size)); - if (amd_iommu_alias_table == NULL) - goto free; - - /* IOMMU rlookup table - find the IOMMU for a specific device */ - amd_iommu_rlookup_table = (void *)__get_free_pages( - GFP_KERNEL | __GFP_ZERO, - get_order(rlookup_table_size)); - if (amd_iommu_rlookup_table == NULL) - goto free; - - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( - GFP_KERNEL | __GFP_ZERO, - get_order(MAX_DOMAIN_ID/8)); - if (amd_iommu_pd_alloc_bitmap == NULL) - goto free; - - /* init the device table */ - init_device_table(); - - /* - * let all alias entries point to itself - */ - for (i = 0; i <= amd_iommu_last_bdf; ++i) - amd_iommu_alias_table[i] = i; - - /* - * never allocate domain 0 because its used as the non-allocated and - * error value placeholder - */ - amd_iommu_pd_alloc_bitmap[0] = 1; - - spin_lock_init(&amd_iommu_pd_lock); - - /* - * now the data structures are allocated and basically initialized - * start the real acpi table scan - */ - ret = -ENODEV; - if (acpi_table_parse("IVRS", init_iommu_all) != 0) - goto free; - - if (amd_iommu_init_err) { - ret = amd_iommu_init_err; - goto free; - } - - if (acpi_table_parse("IVRS", init_memory_definitions) != 0) - goto free; - - if (amd_iommu_init_err) { - ret = amd_iommu_init_err; - goto free; - } - - ret = amd_iommu_init_devices(); - if (ret) - goto free; - - enable_iommus(); - - if (iommu_pass_through) - ret = amd_iommu_init_passthrough(); - else - ret = amd_iommu_init_dma_ops(); - - if (ret) - goto free_disable; - - amd_iommu_init_api(); - - amd_iommu_init_notifier(); - - register_syscore_ops(&amd_iommu_syscore_ops); - - if (iommu_pass_through) - goto out; - - if (amd_iommu_unmap_flush) - printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); - else - printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); - - x86_platform.iommu_shutdown = disable_iommus; -out: - return ret; - -free_disable: - disable_iommus(); - -free: - amd_iommu_uninit_devices(); - - free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, - get_order(MAX_DOMAIN_ID/8)); - - free_pages((unsigned long)amd_iommu_rlookup_table, - get_order(rlookup_table_size)); - - free_pages((unsigned long)amd_iommu_alias_table, - get_order(alias_table_size)); - - free_pages((unsigned long)amd_iommu_dev_table, - get_order(dev_table_size)); - - free_iommu_all(); - - free_unity_maps(); - -#ifdef CONFIG_GART_IOMMU - /* - * We failed to initialize the AMD IOMMU - try fallback to GART - * if possible. - */ - gart_iommu_init(); - -#endif - - goto out; -} - -/**************************************************************************** - * - * Early detect code. This code runs at IOMMU detection time in the DMA - * layer. It just looks if there is an IVRS ACPI table to detect AMD - * IOMMUs - * - ****************************************************************************/ -static int __init early_amd_iommu_detect(struct acpi_table_header *table) -{ - return 0; -} - -int __init amd_iommu_detect(void) -{ - if (no_iommu || (iommu_detected && !gart_iommu_aperture)) - return -ENODEV; - - if (amd_iommu_disabled) - return -ENODEV; - - if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { - iommu_detected = 1; - amd_iommu_detected = 1; - x86_init.iommu.iommu_init = amd_iommu_init; - - /* Make sure ACS will be enabled */ - pci_request_acs(); - return 1; - } - return -ENODEV; -} - -/**************************************************************************** - * - * Parsing functions for the AMD IOMMU specific kernel command line - * options. - * - ****************************************************************************/ - -static int __init parse_amd_iommu_dump(char *str) -{ - amd_iommu_dump = true; - - return 1; -} - -static int __init parse_amd_iommu_options(char *str) -{ - for (; *str; ++str) { - if (strncmp(str, "fullflush", 9) == 0) - amd_iommu_unmap_flush = true; - if (strncmp(str, "off", 3) == 0) - amd_iommu_disabled = true; - } - - return 1; -} - -__setup("amd_iommu_dump", parse_amd_iommu_dump); -__setup("amd_iommu=", parse_amd_iommu_options); - -IOMMU_INIT_FINISH(amd_iommu_detect, - gart_iommu_hole_init, - 0, - 0); -- cgit v1.1 From 00b30cf04a775b5292ab704f782394e36e25617d Mon Sep 17 00:00:00 2001 From: "cpw@sgi.com" Date: Tue, 21 Jun 2011 07:21:26 -0500 Subject: x86, UV: Fix smp_processor_id() use in a preemptable region Fix a call by tunables_write() to smp_processor_id() within a preemptable region. Call get_cpu()/put_cpu() around the region where the returned cpu number is actually used, which makes it non-preemptable. A DEBUG_PREEMPT warning is prevented. UV does not support cpu hotplug yet, but this is a step toward that ability as well. Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20110621122242.086384966@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 68e467f..34be650 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1334,9 +1334,10 @@ static ssize_t tunables_write(struct file *file, const char __user *user, instr[count] = '\0'; - bcp = &per_cpu(bau_control, smp_processor_id()); - + cpu = get_cpu(); + bcp = &per_cpu(bau_control, cpu); ret = parse_tunables_write(bcp, instr, count); + put_cpu(); if (ret) return ret; -- cgit v1.1 From b18fb2c04ac46885f5b0226cd945e763eae51567 Mon Sep 17 00:00:00 2001 From: "cpw@sgi.com" Date: Tue, 21 Jun 2011 07:21:27 -0500 Subject: x86, UV: Inline header file functions Make all the functions in uv_bau.h inline so that it can be included in the fake prom (used in simulations). If not inlined the unused functions will generate compiler warnings. Signed-off-by: Cliff Wickman Reviewed-by: Pekka Enberg Link: http://lkml.kernel.org/r/20110621122242.230529678@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index a291c40..605e613b 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -526,72 +526,72 @@ struct bau_control { struct hub_and_pnode *thp; }; -static unsigned long read_mmr_uv2_status(void) +static inline unsigned long read_mmr_uv2_status(void) { return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2); } -static void write_mmr_data_broadcast(int pnode, unsigned long mmr_image) +static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image); } -static void write_mmr_descriptor_base(int pnode, unsigned long mmr_image) +static inline void write_mmr_descriptor_base(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image); } -static void write_mmr_activation(unsigned long index) +static inline void write_mmr_activation(unsigned long index) { write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); } -static void write_gmmr_activation(int pnode, unsigned long mmr_image) +static inline void write_gmmr_activation(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image); } -static void write_mmr_payload_first(int pnode, unsigned long mmr_image) +static inline void write_mmr_payload_first(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image); } -static void write_mmr_payload_tail(int pnode, unsigned long mmr_image) +static inline void write_mmr_payload_tail(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image); } -static void write_mmr_payload_last(int pnode, unsigned long mmr_image) +static inline void write_mmr_payload_last(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image); } -static void write_mmr_misc_control(int pnode, unsigned long mmr_image) +static inline void write_mmr_misc_control(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); } -static unsigned long read_mmr_misc_control(int pnode) +static inline unsigned long read_mmr_misc_control(int pnode) { return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL); } -static void write_mmr_sw_ack(unsigned long mr) +static inline void write_mmr_sw_ack(unsigned long mr) { uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); } -static unsigned long read_mmr_sw_ack(void) +static inline unsigned long read_mmr_sw_ack(void) { return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); } -static unsigned long read_gmmr_sw_ack(int pnode) +static inline unsigned long read_gmmr_sw_ack(int pnode) { return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); } -static void write_mmr_data_config(int pnode, unsigned long mr) +static inline void write_mmr_data_config(int pnode, unsigned long mr) { uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr); } -- cgit v1.1 From 9c9153db22870c3f37add83bea30500fcc268a73 Mon Sep 17 00:00:00 2001 From: "cpw@sgi.com" Date: Tue, 21 Jun 2011 07:21:28 -0500 Subject: x86, UV: Allow for non-consecutive sockets Fix for the topology in which there is a socket 1 on a blade with no socket 0. Only call make_per_cpu_thp() for present sockets. We have only seen this fail for internal configurations. Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20110621122242.363757364@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 34be650..7623b08 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1752,10 +1752,10 @@ static int __init summarize_uvhub_sockets(int nuvhubs, sdp = &bdp->socket[socket]; if (scan_sock(sdp, bdp, &smaster, &hmaster)) return 1; + make_per_cpu_thp(smaster); } socket++; socket_mask = (socket_mask >> 1); - make_per_cpu_thp(smaster); } } return 0; -- cgit v1.1 From 485f07d349a3e5059413b886a3bc1996c4b3d14d Mon Sep 17 00:00:00 2001 From: "cpw@sgi.com" Date: Tue, 21 Jun 2011 07:21:29 -0500 Subject: x86, UV: Correct reset_with_ipi() Fix reset_with_ipi() to look up a cpu for a blade based on the distribution map being indexed by the potentially sparsely numbered pnode. This patch is critical to tlb shootdown on a partitioned UV system, or one with nonconsecutive blade numbers. The distribution map bits represent pnodes relative to the partition base pnode. Previous to this patch it had been assuming bits based on 0-based, consecutive blade ids. Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20110621122242.497700003@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 7623b08..4b28e09 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -296,14 +296,18 @@ static void bau_process_message(struct msg_desc *mdp, } /* - * Determine the first cpu on a uvhub. + * Determine the first cpu on a pnode. */ -static int uvhub_to_first_cpu(int uvhub) +static int pnode_to_first_cpu(int pnode, struct bau_control *smaster) { int cpu; - for_each_present_cpu(cpu) - if (uvhub == uv_cpu_to_blade_id(cpu)) + struct hub_and_pnode *hpp; + + for_each_present_cpu(cpu) { + hpp = &smaster->thp[cpu]; + if (pnode == hpp->pnode) return cpu; + } return -1; } @@ -366,23 +370,28 @@ static void do_reset(void *ptr) * Use IPI to get all target uvhubs to release resources held by * a given sending cpu number. */ -static void reset_with_ipi(struct bau_targ_hubmask *distribution, int sender) +static void reset_with_ipi(struct bau_targ_hubmask *distribution, + struct bau_control *bcp) { - int uvhub; + int pnode; + int apnode; int maskbits; cpumask_t mask; + int sender = bcp->cpu; + struct bau_control *smaster = bcp->socket_master; struct reset_args reset_args; reset_args.sender = sender; cpus_clear(mask); /* find a single cpu for each uvhub in this distribution mask */ maskbits = sizeof(struct bau_targ_hubmask) * BITSPERBYTE; - for (uvhub = 0; uvhub < maskbits; uvhub++) { + /* each bit is a pnode relative to the partition base pnode */ + for (pnode = 0; pnode < maskbits; pnode++) { int cpu; - if (!bau_uvhub_isset(uvhub, distribution)) + if (!bau_uvhub_isset(pnode, distribution)) continue; - /* find a cpu for this uvhub */ - cpu = uvhub_to_first_cpu(uvhub); + apnode = pnode + bcp->partition_base_pnode; + cpu = pnode_to_first_cpu(apnode, smaster); cpu_set(cpu, mask); } @@ -604,7 +613,7 @@ static void destination_plugged(struct bau_desc *bau_desc, quiesce_local_uvhub(hmaster); spin_lock(&hmaster->queue_lock); - reset_with_ipi(&bau_desc->distribution, bcp->cpu); + reset_with_ipi(&bau_desc->distribution, bcp); spin_unlock(&hmaster->queue_lock); end_uvhub_quiesce(hmaster); @@ -626,7 +635,7 @@ static void destination_timeout(struct bau_desc *bau_desc, quiesce_local_uvhub(hmaster); spin_lock(&hmaster->queue_lock); - reset_with_ipi(&bau_desc->distribution, bcp->cpu); + reset_with_ipi(&bau_desc->distribution, bcp); spin_unlock(&hmaster->queue_lock); end_uvhub_quiesce(hmaster); -- cgit v1.1 From a456eaab87461e33d94e748565eabd474283a475 Mon Sep 17 00:00:00 2001 From: "cpw@sgi.com" Date: Tue, 21 Jun 2011 07:21:30 -0500 Subject: x86, UV: Rename hubmask to pnmask Rename 'bau_targ_hubmask' to 'pnmask' for clarity. The BAU distribution bit mask is indexed by pnode number, not hub or blade number. This important fact is not clear while the mask is called a 'hubmask'. Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20110621122242.630995969@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 12 ++++++------ arch/x86/platform/uv/tlb_uv.c | 5 ++--- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 605e613b..fa6a48e 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -183,7 +183,7 @@ * 'base_dest_nasid' field of the header corresponds to the * destination nodeID associated with that specified bit. */ -struct bau_targ_hubmask { +struct pnmask { unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)]; }; @@ -314,7 +314,7 @@ struct bau_msg_header { * Should be 64 bytes */ struct bau_desc { - struct bau_targ_hubmask distribution; + struct pnmask distribution; /* * message template, consisting of header and payload: */ @@ -596,20 +596,20 @@ static inline void write_mmr_data_config(int pnode, unsigned long mr) uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr); } -static inline int bau_uvhub_isset(int uvhub, struct bau_targ_hubmask *dstp) +static inline int bau_uvhub_isset(int uvhub, struct pnmask *dstp) { return constant_test_bit(uvhub, &dstp->bits[0]); } -static inline void bau_uvhub_set(int pnode, struct bau_targ_hubmask *dstp) +static inline void bau_uvhub_set(int pnode, struct pnmask *dstp) { __set_bit(pnode, &dstp->bits[0]); } -static inline void bau_uvhubs_clear(struct bau_targ_hubmask *dstp, +static inline void bau_uvhubs_clear(struct pnmask *dstp, int nbits) { bitmap_zero(&dstp->bits[0], nbits); } -static inline int bau_uvhub_weight(struct bau_targ_hubmask *dstp) +static inline int bau_uvhub_weight(struct pnmask *dstp) { return bitmap_weight((unsigned long *)&dstp->bits[0], UV_DISTRIBUTION_SIZE); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 4b28e09..191ef28 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -370,8 +370,7 @@ static void do_reset(void *ptr) * Use IPI to get all target uvhubs to release resources held by * a given sending cpu number. */ -static void reset_with_ipi(struct bau_targ_hubmask *distribution, - struct bau_control *bcp) +static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) { int pnode; int apnode; @@ -384,7 +383,7 @@ static void reset_with_ipi(struct bau_targ_hubmask *distribution, reset_args.sender = sender; cpus_clear(mask); /* find a single cpu for each uvhub in this distribution mask */ - maskbits = sizeof(struct bau_targ_hubmask) * BITSPERBYTE; + maskbits = sizeof(struct pnmask) * BITSPERBYTE; /* each bit is a pnode relative to the partition base pnode */ for (pnode = 0; pnode < maskbits; pnode++) { int cpu; -- cgit v1.1 From 442d3924926c62741912d8a930220af253922007 Mon Sep 17 00:00:00 2001 From: "cpw@sgi.com" Date: Tue, 21 Jun 2011 07:21:31 -0500 Subject: x86, UV: Remove cpumask_t from the stack Remove the large stack-resident cpumask_t from reset_with_ipi()'s stack by allocating one per uvhub. Due to the limited size of the stack the potentially huge cpumask_t may cause stack overrun. We haven't seen it happen yet, but we need to make it a practice not to push such structures onto the stack. Signed-off-by: Cliff Wickman Reviewed-by: Pekka Enberg Link: http://lkml.kernel.org/r/20110621122242.832589130@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 + arch/x86/platform/uv/tlb_uv.c | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index fa6a48e..16ce58c 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -488,6 +488,7 @@ struct bau_control { struct bau_control *uvhub_master; struct bau_control *socket_master; struct ptc_stats *statp; + cpumask_t *cpumask; unsigned long timeout_interval; unsigned long set_bau_on_time; atomic_t active_descriptor_count; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 191ef28..5265842 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -375,13 +375,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) int pnode; int apnode; int maskbits; - cpumask_t mask; int sender = bcp->cpu; + cpumask_t *mask = bcp->uvhub_master->cpumask; struct bau_control *smaster = bcp->socket_master; struct reset_args reset_args; reset_args.sender = sender; - cpus_clear(mask); + cpus_clear(*mask); /* find a single cpu for each uvhub in this distribution mask */ maskbits = sizeof(struct pnmask) * BITSPERBYTE; /* each bit is a pnode relative to the partition base pnode */ @@ -391,11 +391,11 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) continue; apnode = pnode + bcp->partition_base_pnode; cpu = pnode_to_first_cpu(apnode, smaster); - cpu_set(cpu, mask); + cpu_set(cpu, *mask); } /* IPI all cpus; preemption is already disabled */ - smp_call_function_many(&mask, do_reset, (void *)&reset_args, 1); + smp_call_function_many(mask, do_reset, (void *)&reset_args, 1); return; } @@ -1696,6 +1696,16 @@ static void make_per_cpu_thp(struct bau_control *smaster) } /* + * Each uvhub is to get a local cpumask. + */ +static void make_per_hub_cpumask(struct bau_control *hmaster) +{ + int sz = sizeof(cpumask_t); + + hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode); +} + +/* * Initialize all the per_cpu information for the cpu's on a given socket, * given what has been gathered into the socket_desc struct. * And reports the chosen hub and socket masters back to the caller. @@ -1765,6 +1775,7 @@ static int __init summarize_uvhub_sockets(int nuvhubs, socket++; socket_mask = (socket_mask >> 1); } + make_per_hub_cpumask(hmaster); } return 0; } -- cgit v1.1 From bbd270e6f45a5ca30e1d3b6133c516a9dc9dd6c0 Mon Sep 17 00:00:00 2001 From: "cpw@sgi.com" Date: Tue, 21 Jun 2011 07:21:32 -0500 Subject: x86, UV: Correct failed topology memory leak Fix a memory leak in init_per_cpu() when the topology check fails. The leak should never occur on deployed systems. It would only occur in an unexpected topology that would make the BAU unuseable as a result. Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20110621122242.981533045@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 5265842..db8b915 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1797,15 +1797,20 @@ static int __init init_per_cpu(int nuvhubs, int base_part_pnode) uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask)) - return 1; + goto fail; if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask)) - return 1; + goto fail; kfree(uvhub_descs); kfree(uvhub_mask); init_per_cpu_tunables(); return 0; + +fail: + kfree(uvhub_descs); + kfree(uvhub_mask); + return 1; } /* -- cgit v1.1 From ae90c232be376bd8a283f3b6fb37cb5bd2635d67 Mon Sep 17 00:00:00 2001 From: "cpw@sgi.com" Date: Tue, 21 Jun 2011 07:21:33 -0500 Subject: x86, UV: Correct UV2 BAU destination timeout Correct the UV2 broacast assist unit's destination timeout period. And the activation status register in UV2 should be tested for a destination timeout with a 4, not a 2. The values for Active versus Timeout were reversed. This patch is critical for TLB shootdown on an Altix UV2 system (i.e. the follow-on to the current Altix UV). Destination timeout period: The period is set in 4 bits of memory-mapped register MISC_CONTROL. The left bit toggles base period between 10us and 80us. The other 3 bits are the multiplier. Decimal 15, hex f, gives the maximum: 7 * 80us Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20110621122243.117324443@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 16ce58c..37d3698 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -67,7 +67,7 @@ * we're using 655us, similar to UV1: 65 units of 10us */ #define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL) -#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (65*10UL) +#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL) #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \ UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \ @@ -106,12 +106,20 @@ #define DS_SOURCE_TIMEOUT 3 /* * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2 - * values 1 and 5 will not occur + * values 1 and 3 will not occur + * Decoded meaning ERROR BUSY AUX ERR + * ------------------------------- ---- ----- ------- + * IDLE 0 0 0 + * BUSY (active) 0 1 0 + * SW Ack Timeout (destination) 1 0 0 + * SW Ack INTD rejected (strong NACK) 1 0 1 + * Source Side Time Out Detected 1 1 0 + * Destination Side PUT Failed 1 1 1 */ #define UV2H_DESC_IDLE 0 -#define UV2H_DESC_DEST_TIMEOUT 2 -#define UV2H_DESC_DEST_STRONG_NACK 3 -#define UV2H_DESC_BUSY 4 +#define UV2H_DESC_BUSY 2 +#define UV2H_DESC_DEST_TIMEOUT 4 +#define UV2H_DESC_DEST_STRONG_NACK 5 #define UV2H_DESC_SOURCE_TIMEOUT 6 #define UV2H_DESC_DEST_PUT_ERR 7 -- cgit v1.1 From b7f080cfe223b3b7424872639d153695615a9255 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 16 Jun 2011 11:01:34 +0000 Subject: net: remove mm.h inclusion from netdevice.h Remove linux/mm.h inclusion from netdevice.h -- it's unused (I've checked manually). To prevent mm.h inclusion via other channels also extract "enum dma_data_direction" definition into separate header. This tiny piece is what gluing netdevice.h with mm.h via "netdevice.h => dmaengine.h => dma-mapping.h => scatterlist.h => mm.h". Removal of mm.h from scatterlist.h was tried and was found not feasible on most archs, so the link was cutoff earlier. Hope people are OK with tiny include file. Note, that mm_types.h is still dragged in, but it is a separate story. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- arch/x86/kernel/tboot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 30ac65d..e07a2fc 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include -- cgit v1.1 From 3824abd1279ef75f791c43a6b1e3162ae0692b42 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:47 -0500 Subject: x86: Add support for cmpxchg_double A simple implementation that only supports the word size and does not have a fallback mode (would require a spinlock). Add 32 and 64 bit support for cmpxchg_double. cmpxchg double uses the cmpxchg8b or cmpxchg16b instruction on x86 processors to compare and swap 2 machine words. This allows lockless algorithms to move more context information through critical sections. Set a flag CONFIG_CMPXCHG_DOUBLE to signal that support for double word cmpxchg detection has been build into the kernel. Note that each subsystem using cmpxchg_double has to implement a fall back mechanism as long as we offer support for processors that do not implement cmpxchg_double. Reviewed-by: H. Peter Anvin Cc: Tejun Heo Cc: Pekka Enberg Signed-off-by: Christoph Lameter Link: http://lkml.kernel.org/r/20110601172614.173427964@linux.com Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig.cpu | 3 +++ arch/x86/include/asm/cmpxchg_32.h | 48 +++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/cmpxchg_64.h | 45 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/cpufeature.h | 2 ++ 4 files changed, 98 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6a7cfdf..e3ca7e0 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -312,6 +312,9 @@ config X86_CMPXCHG config CMPXCHG_LOCAL def_bool X86_64 || (X86_32 && !M386) +config CMPXCHG_DOUBLE + def_bool y + config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 284a6e8..3deb725 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -280,4 +280,52 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, #endif +#define cmpxchg8b(ptr, o1, o2, n1, n2) \ +({ \ + char __ret; \ + __typeof__(o2) __dummy; \ + __typeof__(*(ptr)) __old1 = (o1); \ + __typeof__(o2) __old2 = (o2); \ + __typeof__(*(ptr)) __new1 = (n1); \ + __typeof__(o2) __new2 = (n2); \ + asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1" \ + : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\ + : "a" (__old1), "d"(__old2), \ + "b" (__new1), "c" (__new2) \ + : "memory"); \ + __ret; }) + + +#define cmpxchg8b_local(ptr, o1, o2, n1, n2) \ +({ \ + char __ret; \ + __typeof__(o2) __dummy; \ + __typeof__(*(ptr)) __old1 = (o1); \ + __typeof__(o2) __old2 = (o2); \ + __typeof__(*(ptr)) __new1 = (n1); \ + __typeof__(o2) __new2 = (n2); \ + asm volatile("cmpxchg8b %2; setz %1" \ + : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\ + : "a" (__old), "d"(__old2), \ + "b" (__new1), "c" (__new2), \ + : "memory"); \ + __ret; }) + + +#define cmpxchg_double(ptr, o1, o2, n1, n2) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ + VM_BUG_ON((unsigned long)(ptr) % 8); \ + cmpxchg8b((ptr), (o1), (o2), (n1), (n2)); \ +}) + +#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ + VM_BUG_ON((unsigned long)(ptr) % 8); \ + cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \ +}) + +#define system_has_cmpxchg_double() cpu_has_cx8 + #endif /* _ASM_X86_CMPXCHG_32_H */ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 423ae58..7cf5c0a 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -151,4 +151,49 @@ extern void __cmpxchg_wrong_size(void); cmpxchg_local((ptr), (o), (n)); \ }) +#define cmpxchg16b(ptr, o1, o2, n1, n2) \ +({ \ + char __ret; \ + __typeof__(o2) __junk; \ + __typeof__(*(ptr)) __old1 = (o1); \ + __typeof__(o2) __old2 = (o2); \ + __typeof__(*(ptr)) __new1 = (n1); \ + __typeof__(o2) __new2 = (n2); \ + asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1" \ + : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \ + : "b"(__new1), "c"(__new2), \ + "a"(__old1), "d"(__old2)); \ + __ret; }) + + +#define cmpxchg16b_local(ptr, o1, o2, n1, n2) \ +({ \ + char __ret; \ + __typeof__(o2) __junk; \ + __typeof__(*(ptr)) __old1 = (o1); \ + __typeof__(o2) __old2 = (o2); \ + __typeof__(*(ptr)) __new1 = (n1); \ + __typeof__(o2) __new2 = (n2); \ + asm volatile("cmpxchg16b %2;setz %1" \ + : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \ + : "b"(__new1), "c"(__new2), \ + "a"(__old1), "d"(__old2)); \ + __ret; }) + +#define cmpxchg_double(ptr, o1, o2, n1, n2) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + VM_BUG_ON((unsigned long)(ptr) % 16); \ + cmpxchg16b((ptr), (o1), (o2), (n1), (n2)); \ +}) + +#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + VM_BUG_ON((unsigned long)(ptr) % 16); \ + cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \ +}) + +#define system_has_cmpxchg_double() cpu_has_cx16 + #endif /* _ASM_X86_CMPXCHG_64_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 71cc380..d1053cd 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -288,6 +288,8 @@ extern const char * const x86_power_flags[32]; #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) #define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) +#define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8) +#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 -- cgit v1.1 From 6d3321e8e2b3bf6a5892e2ef673c7bf536e3f904 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 23 Jun 2011 11:19:26 -0700 Subject: x86, mtrr: lock stop machine during MTRR rendezvous sequence MTRR rendezvous sequence using stop_one_cpu_nowait() can potentially happen in parallel with another system wide rendezvous using stop_machine(). This can lead to deadlock (The order in which works are queued can be different on different cpu's. Some cpu's will be running the first rendezvous handler and others will be running the second rendezvous handler. Each set waiting for the other set to join for the system wide rendezvous, leading to a deadlock). MTRR rendezvous sequence is not implemented using stop_machine() as this gets called both from the process context aswell as the cpu online paths (where the cpu has not come online and the interrupts are disabled etc). stop_machine() works with only online cpus. For now, take the stop_machine mutex in the MTRR rendezvous sequence that gets called from an online cpu (here we are in the process context and can potentially sleep while taking the mutex). And the MTRR rendezvous that gets triggered during cpu online doesn't need to take this stop_machine lock (as the stop_machine() already ensures that there is no cpu hotplug going on in parallel by doing get_online_cpus()) TBD: Pursue a cleaner solution of extending the stop_machine() infrastructure to handle the case where the calling cpu is still not online and use this for MTRR rendezvous sequence. fixes: https://bugzilla.novell.com/show_bug.cgi?id=672008 Reported-by: Vadim Kotelnikov Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/20110623182056.807230326@sbsiddha-MOBL3.sc.intel.com Cc: stable@kernel.org # 2.6.35+, backport a week or two after this gets more testing in mainline Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 929739a..3d17bc7 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -248,6 +248,25 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ unsigned long flags; int cpu; +#ifdef CONFIG_SMP + /* + * If this cpu is not yet active, we are in the cpu online path. There + * can be no stop_machine() in parallel, as stop machine ensures this + * by using get_online_cpus(). We can skip taking the stop_cpus_mutex, + * as we don't need it and also we can't afford to block while waiting + * for the mutex. + * + * If this cpu is active, we need to prevent stop_machine() happening + * in parallel by taking the stop cpus mutex. + * + * Also, this is called in the context of cpu online path or in the + * context where cpu hotplug is prevented. So checking the active status + * of the raw_smp_processor_id() is safe. + */ + if (cpu_active(raw_smp_processor_id())) + mutex_lock(&stop_cpus_mutex); +#endif + preempt_disable(); data.smp_reg = reg; @@ -330,6 +349,10 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ local_irq_restore(flags); preempt_enable(); +#ifdef CONFIG_SMP + if (cpu_active(raw_smp_processor_id())) + mutex_unlock(&stop_cpus_mutex); +#endif } /** -- cgit v1.1 From 06c3df49521c1b112b777cc4946e5de057c814ba Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Mon, 6 Jun 2011 12:43:07 +0100 Subject: clocksource: apb: Share APB timer code with other platforms The APB timers are an IP block from Synopsys (DesignWare APB timers) and are also found in other systems including ARM SoC's. This patch adds functions for creating clock_event_devices and clocksources from APB timers but does not do the resource allocation. This is handled in a higher layer to allow the timers to be created from multiple methods such as platform_devices. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Jacob Pan Signed-off-by: Jamie Iles Signed-off-by: John Stultz --- arch/x86/Kconfig | 1 + arch/x86/include/asm/apb_timer.h | 22 +-- arch/x86/kernel/apb_timer.c | 409 +++++++-------------------------------- 3 files changed, 72 insertions(+), 360 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da34972..10ce62f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -617,6 +617,7 @@ config HPET_EMULATE_RTC config APB_TIMER def_bool y if MRST prompt "Langwell APB Timer Support" if X86_MRST + select DW_APB_TIMER help APB timer is the replacement for 8254, HPET on X86 MID platforms. The APBT provides a stable time base on SMP diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index 2fefa50..230000f 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -18,24 +18,6 @@ #ifdef CONFIG_APB_TIMER -/* Langwell DW APB timer registers */ -#define APBTMR_N_LOAD_COUNT 0x00 -#define APBTMR_N_CURRENT_VALUE 0x04 -#define APBTMR_N_CONTROL 0x08 -#define APBTMR_N_EOI 0x0c -#define APBTMR_N_INT_STATUS 0x10 - -#define APBTMRS_INT_STATUS 0xa0 -#define APBTMRS_EOI 0xa4 -#define APBTMRS_RAW_INT_STATUS 0xa8 -#define APBTMRS_COMP_VERSION 0xac -#define APBTMRS_REG_SIZE 0x14 - -/* register bits */ -#define APBTMR_CONTROL_ENABLE (1<<0) -#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /*1: periodic 0:free running */ -#define APBTMR_CONTROL_INT (1<<2) - /* default memory mapped register base */ #define LNW_SCU_ADDR 0xFF100000 #define LNW_EXT_TIMER_OFFSET 0x1B800 @@ -43,8 +25,8 @@ #define LNW_EXT_TIMER_PGOFFSET 0x800 /* APBT clock speed range from PCLK to fabric base, 25-100MHz */ -#define APBT_MAX_FREQ 50 -#define APBT_MIN_FREQ 1 +#define APBT_MAX_FREQ 50000000 +#define APBT_MIN_FREQ 1000000 #define APBT_MMAP_SIZE 1024 #define APBT_DEV_USED 1 diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 289e928..033d3ec 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -27,15 +27,12 @@ * timer, but by default APB timer has higher rating than local APIC timers. */ -#include -#include #include +#include #include #include -#include #include #include -#include #include #include #include @@ -45,75 +42,46 @@ #include #include -#define APBT_MASK CLOCKSOURCE_MASK(32) -#define APBT_SHIFT 22 #define APBT_CLOCKEVENT_RATING 110 #define APBT_CLOCKSOURCE_RATING 250 -#define APBT_MIN_DELTA_USEC 200 -#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt) #define APBT_CLOCKEVENT0_NUM (0) -#define APBT_CLOCKEVENT1_NUM (1) #define APBT_CLOCKSOURCE_NUM (2) -static unsigned long apbt_address; +static phys_addr_t apbt_address; static int apb_timer_block_enabled; static void __iomem *apbt_virt_address; -static int phy_cs_timer_id; /* * Common DW APB timer info */ -static uint64_t apbt_freq; - -static void apbt_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt); -static int apbt_next_event(unsigned long delta, - struct clock_event_device *evt); -static cycle_t apbt_read_clocksource(struct clocksource *cs); -static void apbt_restart_clocksource(struct clocksource *cs); +static unsigned long apbt_freq; struct apbt_dev { - struct clock_event_device evt; - unsigned int num; - int cpu; - unsigned int irq; - unsigned int tick; - unsigned int count; - unsigned int flags; - char name[10]; + struct dw_apb_clock_event_device *timer; + unsigned int num; + int cpu; + unsigned int irq; + char name[10]; }; -static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); +static struct dw_apb_clocksource *clocksource_apbt; -#ifdef CONFIG_SMP -static unsigned int apbt_num_timers_used; -static struct apbt_dev *apbt_devs; -#endif - -static inline unsigned long apbt_readl_reg(unsigned long a) +static inline void __iomem *adev_virt_addr(struct apbt_dev *adev) { - return readl(apbt_virt_address + a); + return apbt_virt_address + adev->num * APBTMRS_REG_SIZE; } -static inline void apbt_writel_reg(unsigned long d, unsigned long a) -{ - writel(d, apbt_virt_address + a); -} - -static inline unsigned long apbt_readl(int n, unsigned long a) -{ - return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); -} +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); -static inline void apbt_writel(int n, unsigned long d, unsigned long a) -{ - writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); -} +#ifdef CONFIG_SMP +static unsigned int apbt_num_timers_used; +#endif static inline void apbt_set_mapping(void) { struct sfi_timer_table_entry *mtmr; + int phy_cs_timer_id = 0; if (apbt_virt_address) { pr_debug("APBT base already mapped\n"); @@ -125,21 +93,18 @@ static inline void apbt_set_mapping(void) APBT_CLOCKEVENT0_NUM); return; } - apbt_address = (unsigned long)mtmr->phys_addr; + apbt_address = (phys_addr_t)mtmr->phys_addr; if (!apbt_address) { printk(KERN_WARNING "No timer base from SFI, use default\n"); apbt_address = APBT_DEFAULT_BASE; } apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); - if (apbt_virt_address) { - pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ - (void *)apbt_address, (void *)apbt_virt_address); - } else { - pr_debug("Failed mapping APBT phy address at %p\n",\ - (void *)apbt_address); + if (!apbt_virt_address) { + pr_debug("Failed mapping APBT phy address at %lu\n",\ + (unsigned long)apbt_address); goto panic_noapbt; } - apbt_freq = mtmr->freq_hz / USEC_PER_SEC; + apbt_freq = mtmr->freq_hz; sfi_free_mtmr(mtmr); /* Now figure out the physical timer id for clocksource device */ @@ -148,9 +113,14 @@ static inline void apbt_set_mapping(void) goto panic_noapbt; /* Now figure out the physical timer id */ - phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) - / APBTMRS_REG_SIZE; - pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); + pr_debug("Use timer %d for clocksource\n", + (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE); + phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) / + APBTMRS_REG_SIZE; + + clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING, + "apbt0", apbt_virt_address + phy_cs_timer_id * + APBTMRS_REG_SIZE, apbt_freq); return; panic_noapbt: @@ -172,82 +142,6 @@ static inline int is_apbt_capable(void) return apbt_virt_address ? 1 : 0; } -static struct clocksource clocksource_apbt = { - .name = "apbt", - .rating = APBT_CLOCKSOURCE_RATING, - .read = apbt_read_clocksource, - .mask = APBT_MASK, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, - .resume = apbt_restart_clocksource, -}; - -/* boot APB clock event device */ -static struct clock_event_device apbt_clockevent = { - .name = "apbt0", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = apbt_set_mode, - .set_next_event = apbt_next_event, - .shift = APBT_SHIFT, - .irq = 0, - .rating = APBT_CLOCKEVENT_RATING, -}; - -/* - * start count down from 0xffff_ffff. this is done by toggling the enable bit - * then load initial load count to ~0. - */ -static void apbt_start_counter(int n) -{ - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); - /* enable, mask interrupt */ - ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; - ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - /* read it once to get cached counter value initialized */ - apbt_read_clocksource(&clocksource_apbt); -} - -static irqreturn_t apbt_interrupt_handler(int irq, void *data) -{ - struct apbt_dev *dev = (struct apbt_dev *)data; - struct clock_event_device *aevt = &dev->evt; - - if (!aevt->event_handler) { - printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", - dev->num); - return IRQ_NONE; - } - aevt->event_handler(aevt); - return IRQ_HANDLED; -} - -static void apbt_restart_clocksource(struct clocksource *cs) -{ - apbt_start_counter(phy_cs_timer_id); -} - -static void apbt_enable_int(int n) -{ - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - /* clear pending intr */ - apbt_readl(n, APBTMR_N_EOI); - ctrl &= ~APBTMR_CONTROL_INT; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); -} - -static void apbt_disable_int(int n) -{ - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - - ctrl |= APBTMR_CONTROL_INT; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); -} - - static int __init apbt_clockevent_register(void) { struct sfi_timer_table_entry *mtmr; @@ -260,45 +154,21 @@ static int __init apbt_clockevent_register(void) return -ENODEV; } - /* - * We need to calculate the scaled math multiplication factor for - * nanosecond to apbt tick conversion. - * mult = (nsec/cycle)*2^APBT_SHIFT - */ - apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz - , NSEC_PER_SEC, APBT_SHIFT); - - /* Calculate the min / max delta */ - apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, - &apbt_clockevent); - apbt_clockevent.min_delta_ns = clockevent_delta2ns( - APBT_MIN_DELTA_USEC*apbt_freq, - &apbt_clockevent); - /* - * Start apbt with the boot cpu mask and make it - * global if not used for per cpu timer. - */ - apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); adev->num = smp_processor_id(); - memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); + adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0", + mrst_timer_options == MRST_TIMER_LAPIC_APBT ? + APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING, + adev_virt_addr(adev), 0, apbt_freq); + /* Firmware does EOI handling for us. */ + adev->timer->eoi = NULL; if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { - adev->evt.rating = APBT_CLOCKEVENT_RATING - 100; - global_clock_event = &adev->evt; + global_clock_event = &adev->timer->ced; printk(KERN_DEBUG "%s clockevent registered as global\n", global_clock_event->name); } - if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - apbt_clockevent.name, adev)) { - printk(KERN_ERR "Failed request IRQ for APBT%d\n", - apbt_clockevent.irq); - } - - clockevents_register_device(&adev->evt); - /* Start APBT 0 interrupts */ - apbt_enable_int(APBT_CLOCKEVENT0_NUM); + dw_apb_clockevent_register(adev->timer); sfi_free_mtmr(mtmr); return 0; @@ -316,52 +186,34 @@ static void apbt_setup_irq(struct apbt_dev *adev) irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); /* APB timer irqs are set up as mp_irqs, timer is edge type */ __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge"); - - if (system_state == SYSTEM_BOOTING) { - if (request_irq(adev->irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | - IRQF_NOBALANCING, - adev->name, adev)) { - printk(KERN_ERR "Failed request IRQ for APBT%d\n", - adev->num); - } - } else - enable_irq(adev->irq); } /* Should be called with per cpu */ void apbt_setup_secondary_clock(void) { struct apbt_dev *adev; - struct clock_event_device *aevt; int cpu; /* Don't register boot CPU clockevent */ cpu = smp_processor_id(); if (!cpu) return; - /* - * We need to calculate the scaled math multiplication factor for - * nanosecond to apbt tick conversion. - * mult = (nsec/cycle)*2^APBT_SHIFT - */ - printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); - adev = &per_cpu(cpu_apbt_dev, cpu); - aevt = &adev->evt; - memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); - aevt->cpumask = cpumask_of(cpu); - aevt->name = adev->name; - aevt->mode = CLOCK_EVT_MODE_UNUSED; + adev = &__get_cpu_var(cpu_apbt_dev); + if (!adev->timer) { + adev->timer = dw_apb_clockevent_init(cpu, adev->name, + APBT_CLOCKEVENT_RATING, adev_virt_addr(adev), + adev->irq, apbt_freq); + adev->timer->eoi = NULL; + } else { + dw_apb_clockevent_resume(adev->timer); + } - printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", - cpu, aevt->name, *(u32 *)aevt->cpumask); + printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n", + cpu, adev->name, adev->cpu); apbt_setup_irq(adev); - - clockevents_register_device(aevt); - - apbt_enable_int(cpu); + dw_apb_clockevent_register(adev->timer); return; } @@ -384,13 +236,12 @@ static int apbt_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_DEAD: - disable_irq(adev->irq); - apbt_disable_int(cpu); + dw_apb_clockevent_pause(adev->timer); if (system_state == SYSTEM_RUNNING) { pr_debug("skipping APBT CPU %lu offline\n", cpu); } else if (adev) { pr_debug("APBT clockevent for cpu %lu offline\n", cpu); - free_irq(adev->irq, adev); + dw_apb_clockevent_stop(adev->timer); } break; default: @@ -415,116 +266,16 @@ void apbt_setup_secondary_clock(void) {} #endif /* CONFIG_SMP */ -static void apbt_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - unsigned long ctrl; - uint64_t delta; - int timer_num; - struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); - - BUG_ON(!apbt_virt_address); - - timer_num = adev->num; - pr_debug("%s CPU %d timer %d mode=%d\n", - __func__, first_cpu(*evt->cpumask), timer_num, mode); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; - delta >>= apbt_clockevent.shift; - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl |= APBTMR_CONTROL_MODE_PERIODIC; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* - * DW APB p. 46, have to disable timer before load counter, - * may cause sync problem. - */ - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - udelay(1); - pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); - apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - /* APB timer does not have one-shot mode, use free running mode */ - case CLOCK_EVT_MODE_ONESHOT: - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - /* - * set free running mode, this mode will let timer reload max - * timeout which will give time (3min on 25MHz clock) to rearm - * the next event, therefore emulate the one-shot mode. - */ - ctrl &= ~APBTMR_CONTROL_ENABLE; - ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; - - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* write again to set free running mode */ - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - - /* - * DW APB p. 46, load counter with all 1s before starting free - * running mode. - */ - apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); - ctrl &= ~APBTMR_CONTROL_INT; - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - apbt_disable_int(timer_num); - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - - case CLOCK_EVT_MODE_RESUME: - apbt_enable_int(timer_num); - break; - } -} - -static int apbt_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - unsigned long ctrl; - int timer_num; - - struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); - - timer_num = adev->num; - /* Disable timer */ - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* write new count */ - apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - return 0; -} - -static cycle_t apbt_read_clocksource(struct clocksource *cs) -{ - unsigned long current_count; - - current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE); - return (cycle_t)~current_count; -} - static int apbt_clocksource_register(void) { u64 start, now; cycle_t t1; /* Start the counter, use timer 2 as source, timer 0/1 for event */ - apbt_start_counter(phy_cs_timer_id); + dw_apb_clocksource_start(clocksource_apbt); /* Verify whether apbt counter works */ - t1 = apbt_read_clocksource(&clocksource_apbt); + t1 = dw_apb_clocksource_read(clocksource_apbt); rdtscll(start); /* @@ -539,10 +290,10 @@ static int apbt_clocksource_register(void) } while ((now - start) < 200000UL); /* APBT is the only always on clocksource, it has to work! */ - if (t1 == apbt_read_clocksource(&clocksource_apbt)) + if (t1 == dw_apb_clocksource_read(clocksource_apbt)) panic("APBT counter not counting. APBT disabled\n"); - clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000); + dw_apb_clocksource_register(clocksource_apbt); return 0; } @@ -566,10 +317,7 @@ void __init apbt_time_init(void) if (apb_timer_block_enabled) return; apbt_set_mapping(); - if (apbt_virt_address) { - pr_debug("Found APBT version 0x%lx\n",\ - apbt_readl_reg(APBTMRS_COMP_VERSION)); - } else + if (!apbt_virt_address) goto out_noapbt; /* * Read the frequency and check for a sane value, for ESL model @@ -577,7 +325,7 @@ void __init apbt_time_init(void) */ if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { - pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); + pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq); goto out_noapbt; } if (apbt_clocksource_register()) { @@ -603,30 +351,20 @@ void __init apbt_time_init(void) } else { percpu_timer = 0; apbt_num_timers_used = 1; - adev = &per_cpu(cpu_apbt_dev, 0); - adev->flags &= ~APBT_DEV_USED; } pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); /* here we set up per CPU timer data structure */ - apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, - GFP_KERNEL); - if (!apbt_devs) { - printk(KERN_ERR "Failed to allocate APB timer devices\n"); - return; - } for (i = 0; i < apbt_num_timers_used; i++) { adev = &per_cpu(cpu_apbt_dev, i); adev->num = i; adev->cpu = i; p_mtmr = sfi_get_mtmr(i); - if (p_mtmr) { - adev->tick = p_mtmr->freq_hz; + if (p_mtmr) adev->irq = p_mtmr->irq; - } else + else printk(KERN_ERR "Failed to get timer for cpu %d\n", i); - adev->count = 0; - sprintf(adev->name, "apbt%d", i); + snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i); } #endif @@ -638,17 +376,8 @@ out_noapbt: panic("failed to enable APB timer\n"); } -static inline void apbt_disable(int n) -{ - if (is_apbt_capable()) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - } -} - /* called before apb_timer_enable, use early map */ -unsigned long apbt_quick_calibrate() +unsigned long apbt_quick_calibrate(void) { int i, scale; u64 old, new; @@ -657,31 +386,31 @@ unsigned long apbt_quick_calibrate() u32 loop, shift; apbt_set_mapping(); - apbt_start_counter(phy_cs_timer_id); + dw_apb_clocksource_start(clocksource_apbt); /* check if the timer can count down, otherwise return */ - old = apbt_read_clocksource(&clocksource_apbt); + old = dw_apb_clocksource_read(clocksource_apbt); i = 10000; while (--i) { - if (old != apbt_read_clocksource(&clocksource_apbt)) + if (old != dw_apb_clocksource_read(clocksource_apbt)) break; } if (!i) goto failed; /* count 16 ms */ - loop = (apbt_freq * 1000) << 4; + loop = (apbt_freq / 1000) << 4; /* restart the timer to ensure it won't get to 0 in the calibration */ - apbt_start_counter(phy_cs_timer_id); + dw_apb_clocksource_start(clocksource_apbt); - old = apbt_read_clocksource(&clocksource_apbt); + old = dw_apb_clocksource_read(clocksource_apbt); old += loop; t1 = __native_read_tsc(); do { - new = apbt_read_clocksource(&clocksource_apbt); + new = dw_apb_clocksource_read(clocksource_apbt); } while (new < old); t2 = __native_read_tsc(); @@ -693,7 +422,7 @@ unsigned long apbt_quick_calibrate() return 0; } scale = (int)div_u64((t2 - t1), loop >> shift); - khz = (scale * apbt_freq * 1000) >> shift; + khz = (scale * (apbt_freq / 1000)) >> shift; printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); return khz; failed: -- cgit v1.1 From 192d8857427dd23707d5f0b86ca990c3af6f2d74 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 23 Jun 2011 11:19:29 -0700 Subject: x86, mtrr: use stop_machine APIs for doing MTRR rendezvous MTRR rendezvous sequence is not implemened using stop_machine() before, as this gets called both from the process context aswell as the cpu online paths (where the cpu has not come online and the interrupts are disabled etc). Now that we have a new stop_machine_from_inactive_cpu() API, use it for rendezvous during mtrr init of a logical processor that is coming online. For the rest (runtime MTRR modification, system boot, resume paths), use stop_machine() to implement the rendezvous sequence. This will consolidate and cleanup the code. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/20110623182057.076997177@sbsiddha-MOBL3.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 192 +++++++++------------------------------- 1 file changed, 41 insertions(+), 151 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 3d17bc7..707b637 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -137,55 +137,43 @@ static void __init init_table(void) } struct set_mtrr_data { - atomic_t count; - atomic_t gate; unsigned long smp_base; unsigned long smp_size; unsigned int smp_reg; mtrr_type smp_type; }; -static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); - /** - * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. + * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed + * by all the CPUs. * @info: pointer to mtrr configuration data * * Returns nothing. */ -static int mtrr_work_handler(void *info) +static int mtrr_rendezvous_handler(void *info) { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; - unsigned long flags; - - atomic_dec(&data->count); - while (!atomic_read(&data->gate)) - cpu_relax(); - - local_irq_save(flags); - - atomic_dec(&data->count); - while (atomic_read(&data->gate)) - cpu_relax(); - /* The master has cleared me to execute */ + /* + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online). If we're doing that, @reg is set to something special and on + * all the cpu's we do mtrr_if->set_all() (On the logical cpu that + * started the boot/resume sequence, this might be a duplicate + * set_all()). + */ if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else if (mtrr_aps_delayed_init) { - /* - * Initialize the MTRRs inaddition to the synchronisation. - */ + } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { mtrr_if->set_all(); } - - atomic_dec(&data->count); - while (!atomic_read(&data->gate)) - cpu_relax(); - - atomic_dec(&data->count); - local_irq_restore(flags); #endif return 0; } @@ -223,20 +211,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * 14. Wait for buddies to catch up * 15. Enable interrupts. * - * What does that mean for us? Well, first we set data.count to the number - * of CPUs. As each CPU announces that it started the rendezvous handler by - * decrementing the count, We reset data.count and set the data.gate flag - * allowing all the cpu's to proceed with the work. As each cpu disables - * interrupts, it'll decrement data.count once. We wait until it hits 0 and - * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they - * are waiting for that flag to be cleared. Once it's cleared, each - * CPU goes through the transition of updating MTRRs. - * The CPU vendors may each do it differently, - * so we call mtrr_if->set() callback and let them take care of it. - * When they're done, they again decrement data->count and wait for data.gate - * to be set. - * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag - * Everyone then enables interrupts and we all continue on. + * What does that mean for us? Well, stop_machine() will ensure that + * the rendezvous handler is started on each CPU. And in lockstep they + * do the state transition of disabling interrupts, updating MTRR's + * (the CPU vendors may each do it differently, so we call mtrr_if->set() + * callback and let them take care of it.) and enabling interrupts. * * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. @@ -244,115 +223,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { - struct set_mtrr_data data; - unsigned long flags; - int cpu; - -#ifdef CONFIG_SMP - /* - * If this cpu is not yet active, we are in the cpu online path. There - * can be no stop_machine() in parallel, as stop machine ensures this - * by using get_online_cpus(). We can skip taking the stop_cpus_mutex, - * as we don't need it and also we can't afford to block while waiting - * for the mutex. - * - * If this cpu is active, we need to prevent stop_machine() happening - * in parallel by taking the stop cpus mutex. - * - * Also, this is called in the context of cpu online path or in the - * context where cpu hotplug is prevented. So checking the active status - * of the raw_smp_processor_id() is safe. - */ - if (cpu_active(raw_smp_processor_id())) - mutex_lock(&stop_cpus_mutex); -#endif - - preempt_disable(); - - data.smp_reg = reg; - data.smp_base = base; - data.smp_size = size; - data.smp_type = type; - atomic_set(&data.count, num_booting_cpus() - 1); - - /* Make sure data.count is visible before unleashing other CPUs */ - smp_wmb(); - atomic_set(&data.gate, 0); - - /* Start the ball rolling on other CPUs */ - for_each_online_cpu(cpu) { - struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); - - if (cpu == smp_processor_id()) - continue; + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; - stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); - } - - - while (atomic_read(&data.count)) - cpu_relax(); - - /* Ok, reset count and toggle gate */ - atomic_set(&data.count, num_booting_cpus() - 1); - smp_wmb(); - atomic_set(&data.gate, 1); - - local_irq_save(flags); - - while (atomic_read(&data.count)) - cpu_relax(); - - /* Ok, reset count and toggle gate */ - atomic_set(&data.count, num_booting_cpus() - 1); - smp_wmb(); - atomic_set(&data.gate, 0); - - /* Do our MTRR business */ - - /* - * HACK! - * - * We use this same function to initialize the mtrrs during boot, - * resume, runtime cpu online and on an explicit request to set a - * specific MTRR. - * - * During boot or suspend, the state of the boot cpu's mtrrs has been - * saved, and we want to replicate that across all the cpus that come - * online (either at the end of boot or resume or during a runtime cpu - * online). If we're doing that, @reg is set to something special and on - * this cpu we still do mtrr_if->set_all(). During boot/resume, this - * is unnecessary if at this point we are still on the cpu that started - * the boot/resume sequence. But there is no guarantee that we are still - * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be - * sure that we are in sync with everyone else. - */ - if (reg != ~0U) - mtrr_if->set(reg, base, size, type); - else - mtrr_if->set_all(); - - /* Wait for the others */ - while (atomic_read(&data.count)) - cpu_relax(); - - atomic_set(&data.count, num_booting_cpus() - 1); - smp_wmb(); - atomic_set(&data.gate, 1); - - /* - * Wait here for everyone to have seen the gate change - * So we're the last ones to touch 'data' - */ - while (atomic_read(&data.count)) - cpu_relax(); + stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); +} - local_irq_restore(flags); - preempt_enable(); -#ifdef CONFIG_SMP - if (cpu_active(raw_smp_processor_id())) - mutex_unlock(&stop_cpus_mutex); -#endif +static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data, + cpu_callout_mask); } /** @@ -806,7 +696,7 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - set_mtrr(~0U, 0, 0, 0); + set_mtrr_from_inactive_cpu(~0U, 0, 0, 0); } /** -- cgit v1.1 From c3e73e76a90b1e790e0bb7bb36135be9232f58de Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Thu, 26 May 2011 13:29:33 +1000 Subject: crypto: ghash-intel - Fix set but not used in ghash_async_setkey() Signed-off-by: Gustavo F. Padovan Signed-off-by: Herbert Xu --- arch/x86/crypto/ghash-clmulni-intel_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 7a6e68e..976aa64 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -245,7 +245,7 @@ static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child) & CRYPTO_TFM_RES_MASK); - return 0; + return err; } static int ghash_async_init_tfm(struct crypto_tfm *tfm) -- cgit v1.1 From 3e7cf5b00dd5b577b4ee9b2a66e40fb670ef210b Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 30 Jun 2011 17:19:32 -0700 Subject: x86-32, fpu: Fix DNA exception during check_fpu() Before check_fpu() is called, we have cr0.TS bit set and hence the floating point code to check the FDIV bug was generating a DNA exception. Use kernel_fpu_begin()/kernel_fpu_end() around the floating point code to avoid this unnecessary device not available exception during boot. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1309479572.2665.1372.camel@sbsiddha-MOBL3.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/bugs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 525514c..46674fb 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -62,6 +62,8 @@ static void __init check_fpu(void) return; } + kernel_fpu_begin(); + /* * trap_init() enabled FXSR and company _before_ testing for FP * problems here. @@ -80,6 +82,8 @@ static void __init check_fpu(void) : "=m" (*&fdiv_bug) : "m" (*&x), "m" (*&y)); + kernel_fpu_end(); + boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); -- cgit v1.1 From 0a779c571314b7951ff12edac80aa0cbe7c12b6e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Jun 2011 13:08:26 +0000 Subject: x86: Use common i8253 clockevent Signed-off-by: Thomas Gleixner Cc: Russell King Cc: Ralf Baechle Cc: John Stultz Link: http://lkml.kernel.org/r/20110609130622.026152527@linutronix.de --- arch/x86/Kconfig | 2 +- arch/x86/kernel/i8253.c | 93 ++----------------------------------------------- 2 files changed, 4 insertions(+), 91 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2dd95b55..26a14b4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -71,7 +71,7 @@ config X86 select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if (X86_64 && NET) - select I8253_LOCK + select CLKEVT_I8253 config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 5783e6d..f2b96de 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -3,15 +3,9 @@ * */ #include -#include -#include -#include #include #include -#include #include -#include -#include #include #include @@ -23,91 +17,10 @@ */ struct clock_event_device *global_clock_event; -/* - * Initialize the PIT timer. - * - * This is also called after resume to bring the PIT into operation again. - */ -static void init_pit_timer(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - raw_spin_lock(&i8253_lock); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - /* binary, mode 2, LSB/MSB, ch 0 */ - outb_pit(0x34, PIT_MODE); - outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */ - outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */ - break; - - case CLOCK_EVT_MODE_SHUTDOWN: - case CLOCK_EVT_MODE_UNUSED: - if (evt->mode == CLOCK_EVT_MODE_PERIODIC || - evt->mode == CLOCK_EVT_MODE_ONESHOT) { - outb_pit(0x30, PIT_MODE); - outb_pit(0, PIT_CH0); - outb_pit(0, PIT_CH0); - } - break; - - case CLOCK_EVT_MODE_ONESHOT: - /* One shot setup */ - outb_pit(0x38, PIT_MODE); - break; - - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here */ - break; - } - raw_spin_unlock(&i8253_lock); -} - -/* - * Program the next event in oneshot mode - * - * Delta is given in PIT ticks - */ -static int pit_next_event(unsigned long delta, struct clock_event_device *evt) -{ - raw_spin_lock(&i8253_lock); - outb_pit(delta & 0xff , PIT_CH0); /* LSB */ - outb_pit(delta >> 8 , PIT_CH0); /* MSB */ - raw_spin_unlock(&i8253_lock); - - return 0; -} - -/* - * On UP the PIT can serve all of the possible timer functions. On SMP systems - * it can be solely used for the global tick. - * - * The profiling and update capabilities are switched off once the local apic is - * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - - * !using_apic_timer decisions in do_timer_interrupt_hook() - */ -static struct clock_event_device pit_ce = { - .name = "pit", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = init_pit_timer, - .set_next_event = pit_next_event, - .irq = 0, -}; - -/* - * Initialize the conversion factor and the min/max deltas of the clock event - * structure and register the clock event source with the framework. - */ void __init setup_pit_timer(void) { - /* - * Start pit with the boot cpu mask and make it global after the - * IO_APIC has been initialized. - */ - pit_ce.cpumask = cpumask_of(smp_processor_id()); - - clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF); - global_clock_event = &pit_ce; + clockevent_i8253_init(true); + global_clock_event = &i8253_clockevent; } #ifndef CONFIG_X86_64 @@ -121,7 +34,7 @@ static int __init init_pit_clocksource(void) * - when local APIC timer is active (PIT is switched off) */ if (num_possible_cpus() > 1 || is_hpet_enabled() || - pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) + i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) return 0; return clocksource_i8253_init(); -- cgit v1.1 From 01898e3e29ea8242d81923da11ce88ba71290a48 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Jun 2011 13:08:30 +0000 Subject: i8253: Cleanup outb/inb magic Remove the hysterical outb/inb_pit defines and use outb_p/inb_p in the code. Signed-off-by: Thomas Gleixner Cc: Russell King Cc: Ralf Baechle Cc: John Stultz Link: http://lkml.kernel.org/r/20110609130622.348437125@linutronix.de --- arch/x86/kernel/apm_32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index a30740e..0371c48 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1220,11 +1220,11 @@ static void reinit_timer(void) raw_spin_lock_irqsave(&i8253_lock, flags); /* set the clock to HZ */ - outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ udelay(10); - outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */ + outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ udelay(10); - outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ + outb_p(LATCH >> 8, PIT_CH0); /* MSB */ udelay(10); raw_spin_unlock_irqrestore(&i8253_lock, flags); #endif -- cgit v1.1 From 1880c4ae182afb5650c5678949ecfe7ff66a724e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 23 Jun 2011 16:49:18 +0400 Subject: perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4 Due to restriction and specifics of Netburst PMU we need a separated event for NMI watchdog. In particular every Netburst event consumes not just a counter and a config register, but also an additional ESCR register. Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied for some event there is no room for another event to enter until its released) we need to pick up the "least" used ESCR (or the most available one) for nmi-watchdog purposes -- so MSR_P4_CRU_ESCR2/3 was chosen. With this patch nmi-watchdog and perf top should be able to run simultaneously. Signed-off-by: Cyrill Gorcunov CC: Lin Ming CC: Arnaldo Carvalho de Melo CC: Frederic Weisbecker Tested-and-reviewed-by: Don Zickus Tested-and-reviewed-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110623124918.GC13050@sun Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 +++++++ arch/x86/kernel/cpu/perf_event_p4.c | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3a0338b..8a57f9a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -233,6 +233,7 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); + void (*hw_watchdog_set_attr)(struct perf_event_attr *attr); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -315,6 +316,12 @@ static u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; +void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) +{ + if (x86_pmu.hw_watchdog_set_attr) + x86_pmu.hw_watchdog_set_attr(wd_attr); +} + /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ead584f..f76fddf 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -705,6 +705,31 @@ static int p4_validate_raw_event(struct perf_event *event) return 0; } +static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr) +{ + /* + * Watchdog ticks are special on Netburst, we use + * that named "non-sleeping" ticks as recommended + * by Intel SDM Vol3b. + */ + WARN_ON_ONCE(wd_attr->type != PERF_TYPE_HARDWARE || + wd_attr->config != PERF_COUNT_HW_CPU_CYCLES); + + wd_attr->type = PERF_TYPE_RAW; + wd_attr->config = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) | + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE); +} + static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); @@ -1179,6 +1204,7 @@ static __initconst const struct x86_pmu p4_pmu = { .cntval_bits = ARCH_P4_CNTRVAL_BITS, .cntval_mask = ARCH_P4_CNTRVAL_MASK, .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, + .hw_watchdog_set_attr = p4_hw_watchdog_set_attr, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* -- cgit v1.1 From a8b0ca17b80e92faab46ee7179ba9e99ccb61233 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2011 14:41:57 +0200 Subject: perf: Remove the nmi parameter from the swevent and overflow interface The nmi parameter indicated if we could do wakeups from the current context, if not, we would set some state and self-IPI and let the resulting interrupt do the wakeup. For the various event classes: - hardware: nmi=0; PMI is in fact an NMI or we run irq_work_run from the PMI-tail (ARM etc.) - tracepoint: nmi=0; since tracepoint could be from NMI context. - software: nmi=[0,1]; some, like the schedule thing cannot perform wakeups, and hence need 0. As one can see, there is very little nmi=1 usage, and the down-side of not using it is that on some platforms some software events can have a jiffy delay in wakeup (when arch_irq_work_raise isn't implemented). The up-side however is that we can remove the nmi parameter and save a bunch of conditionals in fast paths. Signed-off-by: Peter Zijlstra Cc: Michael Cree Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: Anton Blanchard Cc: Eric B Munson Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. Miller Cc: Frederic Weisbecker Cc: Jason Wessel Cc: Don Zickus Link: http://lkml.kernel.org/n/tip-agjev8eu666tvknpb3iaj0fg@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 ++-- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- arch/x86/kernel/kgdb.c | 2 +- arch/x86/kernel/ptrace.c | 2 +- arch/x86/mm/fault.c | 6 +++--- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8a57f9a..5b86ec5 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1339,7 +1339,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 41178c8..d38b002 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1003,7 +1003,7 @@ again: data.period = event->hw.last_period; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index bab491b..0941f93 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void) */ perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) + if (perf_output_begin(&handle, event, header.size * (top - at), 1)) return 1; for (; at < top; at++) { @@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, else regs.flags &= ~PERF_EFLAGS_EXACT; - if (perf_event_overflow(event, 1, &data, ®s)) + if (perf_event_overflow(event, &data, ®s)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index f76fddf..d6e6a67 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -970,7 +970,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 5f9ecff..98da6a7 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -608,7 +608,7 @@ int kgdb_arch_init(void) return register_die_notifier(&kgdb_notifier); } -static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, +static void kgdb_hw_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { struct task_struct *tsk = current; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 807c2a2..11db2e9 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target, return ret; } -static void ptrace_triggered(struct perf_event *bp, int nmi, +static void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2dbf6bf..4d09df0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1059,7 +1059,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1161,11 +1161,11 @@ good_area: if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } if (fault & VM_FAULT_RETRY) { -- cgit v1.1 From a7ac67ea021b4603095d2aa458bc41641238f22c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2011 16:47:16 +0200 Subject: perf: Remove the perf_output_begin(.sample) argument Since only samples call perf_output_sample() its much saner (and more correct) to put the sample logic in there than in the perf_output_begin()/perf_output_end() pair. Saves a useless argument, reduces conditionals and shrinks struct perf_output_handle, win! Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-2crpvsx3cqu67q3zqjbnlpsc@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 0941f93..1b1ef3a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void) */ perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * (top - at), 1)) + if (perf_output_begin(&handle, event, header.size * (top - at))) return 1; for (; at < top; at++) { -- cgit v1.1 From efc9f05df2dd171280dcb736a4d973ffefd5508e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:03 +0200 Subject: perf_events: Update Intel extra regs shared constraints management This patch improves the code managing the extra shared registers used for offcore_response events on Intel Nehalem/Westmere. The idea is to use static allocation instead of dynamic allocation. This simplifies greatly the get and put constraint routines for those events. The patch also renames per_core to shared_regs because the same data structure gets used whether or not HT is on. When HT is off, those events still need to coordination because they use a extra MSR that has to be shared within an event group. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145703.GA7258@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 78 +++++++--- arch/x86/kernel/cpu/perf_event_intel.c | 260 ++++++++++++++++----------------- 2 files changed, 189 insertions(+), 149 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5b86ec5..019fda7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -45,6 +45,29 @@ do { \ #endif /* + * | NHM/WSM | SNB | + * register ------------------------------- + * | HT | no HT | HT | no HT | + *----------------------------------------- + * offcore | core | core | cpu | core | + * lbr_sel | core | core | cpu | core | + * ld_lat | cpu | core | cpu | core | + *----------------------------------------- + * + * Given that there is a small number of shared regs, + * we can pre-allocate their slot in the per-cpu + * per-core reg tables. + */ +enum extra_reg_type { + EXTRA_REG_NONE = -1, /* not used */ + + EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ + EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + + EXTRA_REG_MAX /* number of entries needed */ +}; + +/* * best effort, GUP based copy_from_user() that assumes IRQ or NMI context */ static unsigned long @@ -132,11 +155,10 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; /* - * Intel percore register state. - * Coordinate shared resources between HT threads. + * manage shared (per-core, per-cpu) registers + * used on Intel NHM/WSM/SNB */ - int percore_used; /* Used by this CPU? */ - struct intel_percore *per_core; + struct intel_shared_regs *shared_regs; /* * AMD specific bits @@ -187,26 +209,45 @@ struct cpu_hw_events { for ((e) = (c); (e)->weight; (e)++) /* + * Per register state. + */ +struct er_account { + raw_spinlock_t lock; /* per-core: protect structure */ + u64 config; /* extra MSR config */ + u64 reg; /* extra MSR number */ + atomic_t ref; /* reference count */ +}; + +/* * Extra registers for specific events. + * * Some events need large masks and require external MSRs. - * Define a mapping to these extra registers. + * Those extra MSRs end up being shared for all events on + * a PMU and sometimes between PMU of sibling HT threads. + * In either case, the kernel needs to handle conflicting + * accesses to those extra, shared, regs. The data structure + * to manage those registers is stored in cpu_hw_event. */ struct extra_reg { unsigned int event; unsigned int msr; u64 config_mask; u64 valid_mask; + int idx; /* per_xxx->regs[] reg index */ }; -#define EVENT_EXTRA_REG(e, ms, m, vm) { \ +#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ .event = (e), \ .msr = (ms), \ .config_mask = (m), \ .valid_mask = (vm), \ + .idx = EXTRA_REG_##i \ } -#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \ - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm) -#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0) + +#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) + +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) union perf_capabilities { struct { @@ -253,7 +294,6 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; - struct event_constraint *percore_constraints; void (*quirks)(void); int perfctr_second_write; @@ -400,10 +440,10 @@ static inline unsigned int x86_pmu_event_addr(int index) */ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) { + struct hw_perf_event_extra *reg; struct extra_reg *er; - event->hw.extra_reg = 0; - event->hw.extra_config = 0; + reg = &event->hw.extra_reg; if (!x86_pmu.extra_regs) return 0; @@ -413,8 +453,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; - event->hw.extra_reg = er->msr; - event->hw.extra_config = event->attr.config1; + + reg->idx = er->idx; + reg->config = event->attr.config1; + reg->reg = er->msr; break; } return 0; @@ -713,6 +755,9 @@ static int __x86_pmu_event_init(struct perf_event *event) event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; + /* mark unused */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + return x86_pmu.hw_config(event); } @@ -754,8 +799,8 @@ static void x86_pmu_disable(struct pmu *pmu) static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, u64 enable_mask) { - if (hwc->extra_reg) - wrmsrl(hwc->extra_reg, hwc->extra_config); + if (hwc->extra_reg.reg) + wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); wrmsrl(hwc->config_base, hwc->config | enable_mask); } @@ -1692,7 +1737,6 @@ static int validate_group(struct perf_event *event) fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); if (!fake_cpuc) goto out; - /* * the event is not yet connected with its * siblings therefore we must first collect diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index d38b002..6ad95ba 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,25 +1,15 @@ #ifdef CONFIG_CPU_SUP_INTEL -#define MAX_EXTRA_REGS 2 - -/* - * Per register state. - */ -struct er_account { - int ref; /* reference count */ - unsigned int extra_reg; /* extra MSR number */ - u64 extra_config; /* extra MSR config */ -}; - /* - * Per core state - * This used to coordinate shared registers for HT threads. + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. */ -struct intel_percore { - raw_spinlock_t lock; /* protect structure */ - struct er_account regs[MAX_EXTRA_REGS]; - int refcnt; /* number of threads */ - unsigned core_id; +struct intel_shared_regs { + struct er_account regs[EXTRA_REG_MAX]; + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ }; /* @@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), EVENT_EXTRA_END }; -static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly = -{ - INTEL_EVENT_CONSTRAINT(0xb7, 0), - EVENT_CONSTRAINT_END -}; - static struct event_constraint intel_westmere_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -125,18 +109,11 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), EVENT_EXTRA_END }; -static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = -{ - INTEL_EVENT_CONSTRAINT(0xb7, 0), - INTEL_EVENT_CONSTRAINT(0xbb, 0), - EVENT_CONSTRAINT_END -}; - static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -1037,65 +1014,89 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +/* + * manage allocation of shared extra msr for certain events + * + * sharing can be: + * per-cpu: to be shared between the various events on a single PMU + * per-core: per-cpu + shared by HT threads + */ static struct event_constraint * -intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) { - struct hw_perf_event *hwc = &event->hw; - unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; - struct event_constraint *c; - struct intel_percore *pc; + struct event_constraint *c = &emptyconstraint; struct er_account *era; - int i; - int free_slot; - int found; - if (!x86_pmu.percore_constraints || hwc->extra_alloc) - return NULL; + /* already allocated shared msr */ + if (reg->alloc || !cpuc->shared_regs) + return &unconstrained; - for (c = x86_pmu.percore_constraints; c->cmask; c++) { - if (e != c->code) - continue; + era = &cpuc->shared_regs->regs[reg->idx]; + + raw_spin_lock(&era->lock); + + if (!atomic_read(&era->ref) || era->config == reg->config) { + + /* lock in msr value */ + era->config = reg->config; + era->reg = reg->reg; + + /* one more user */ + atomic_inc(&era->ref); + + /* no need to reallocate during incremental event scheduling */ + reg->alloc = 1; /* - * Allocate resource per core. + * All events using extra_reg are unconstrained. + * Avoids calling x86_get_event_constraints() + * + * Must revisit if extra_reg controlling events + * ever have constraints. Worst case we go through + * the regular event constraint table. */ - pc = cpuc->per_core; - if (!pc) - break; - c = &emptyconstraint; - raw_spin_lock(&pc->lock); - free_slot = -1; - found = 0; - for (i = 0; i < MAX_EXTRA_REGS; i++) { - era = &pc->regs[i]; - if (era->ref > 0 && hwc->extra_reg == era->extra_reg) { - /* Allow sharing same config */ - if (hwc->extra_config == era->extra_config) { - era->ref++; - cpuc->percore_used = 1; - hwc->extra_alloc = 1; - c = NULL; - } - /* else conflict */ - found = 1; - break; - } else if (era->ref == 0 && free_slot == -1) - free_slot = i; - } - if (!found && free_slot != -1) { - era = &pc->regs[free_slot]; - era->ref = 1; - era->extra_reg = hwc->extra_reg; - era->extra_config = hwc->extra_config; - cpuc->percore_used = 1; - hwc->extra_alloc = 1; - c = NULL; - } - raw_spin_unlock(&pc->lock); - return c; + c = &unconstrained; } + raw_spin_unlock(&era->lock); - return NULL; + return c; +} + +static void +__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) +{ + struct er_account *era; + + /* + * only put constraint if extra reg was actually + * allocated. Also takes care of event which do + * not use an extra shared reg + */ + if (!reg->alloc) + return; + + era = &cpuc->shared_regs->regs[reg->idx]; + + /* one fewer user */ + atomic_dec(&era->ref); + + /* allocate again next time */ + reg->alloc = 0; +} + +static struct event_constraint * +intel_shared_regs_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct event_constraint *c = NULL; + struct hw_perf_event_extra *xreg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) + c = __intel_shared_reg_get_constraints(cpuc, xreg); + return c; } static struct event_constraint * @@ -1111,49 +1112,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event if (c) return c; - c = intel_percore_constraints(cpuc, event); + c = intel_shared_regs_constraints(cpuc, event); if (c) return c; return x86_get_event_constraints(cpuc, event); } -static void intel_put_event_constraints(struct cpu_hw_events *cpuc, +static void +intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - struct extra_reg *er; - struct intel_percore *pc; - struct er_account *era; - struct hw_perf_event *hwc = &event->hw; - int i, allref; - - if (!cpuc->percore_used) - return; + struct hw_perf_event_extra *reg; - for (er = x86_pmu.extra_regs; er->msr; er++) { - if (er->event != (hwc->config & er->config_mask)) - continue; + reg = &event->hw.extra_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); +} - pc = cpuc->per_core; - raw_spin_lock(&pc->lock); - for (i = 0; i < MAX_EXTRA_REGS; i++) { - era = &pc->regs[i]; - if (era->ref > 0 && - era->extra_config == hwc->extra_config && - era->extra_reg == er->msr) { - era->ref--; - hwc->extra_alloc = 0; - break; - } - } - allref = 0; - for (i = 0; i < MAX_EXTRA_REGS; i++) - allref += pc->regs[i].ref; - if (allref == 0) - cpuc->percore_used = 0; - raw_spin_unlock(&pc->lock); - break; - } +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + intel_put_shared_regs_event_constraints(cpuc, event); } static int intel_pmu_hw_config(struct perf_event *event) @@ -1231,20 +1211,36 @@ static __initconst const struct x86_pmu core_pmu = { .event_constraints = intel_core_event_constraints, }; +static struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + struct intel_shared_regs *regs; + int i; + + regs = kzalloc_node(sizeof(struct intel_shared_regs), + GFP_KERNEL, cpu_to_node(cpu)); + if (regs) { + /* + * initialize the locks to keep lockdep happy + */ + for (i = 0; i < EXTRA_REG_MAX; i++) + raw_spin_lock_init(®s->regs[i].lock); + + regs->core_id = -1; + } + return regs; +} + static int intel_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - if (!cpu_has_ht_siblings()) + if (!x86_pmu.extra_regs) return NOTIFY_OK; - cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), - GFP_KERNEL, cpu_to_node(cpu)); - if (!cpuc->per_core) + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) return NOTIFY_BAD; - raw_spin_lock_init(&cpuc->per_core->lock); - cpuc->per_core->core_id = -1; return NOTIFY_OK; } @@ -1260,32 +1256,34 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpu_has_ht_siblings()) + if (!cpuc->shared_regs) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { - struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; + struct intel_shared_regs *pc; + pc = per_cpu(cpu_hw_events, i).shared_regs; if (pc && pc->core_id == core_id) { - kfree(cpuc->per_core); - cpuc->per_core = pc; + kfree(cpuc->shared_regs); + cpuc->shared_regs = pc; break; } } - cpuc->per_core->core_id = core_id; - cpuc->per_core->refcnt++; + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; } static void intel_pmu_cpu_dying(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - struct intel_percore *pc = cpuc->per_core; + struct intel_shared_regs *pc; + pc = cpuc->shared_regs; if (pc) { if (pc->core_id == -1 || --pc->refcnt == 0) kfree(pc); - cpuc->per_core = NULL; + cpuc->shared_regs = NULL; } fini_debug_store_on_cpu(cpu); @@ -1436,7 +1434,6 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_nehalem_event_constraints; x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; - x86_pmu.percore_constraints = intel_nehalem_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; @@ -1481,7 +1478,6 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_westmere_event_constraints; - x86_pmu.percore_constraints = intel_westmere_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; -- cgit v1.1 From cd8a38d33e2528998998bae70a45ad27e442f114 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:08 +0200 Subject: perf_events: Fix validation of events using an extra reg The validate_group() function needs to validate events with extra shared regs. Within an event group, only events with the same value for the extra reg can co-exist. This was not checked by validate_group() because it was missing the shared_regs logic. This patch changes the allocation of the fake cpuc used for validation to also point to a fake shared_regs structure such that group events be properly testing. It modifies __intel_shared_reg_get_constraints() to use spin_lock_irqsave() to avoid lockdep issues. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145708.GA7279@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 59 ++++++++++++++++++++++++++-------- arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++++--- 2 files changed, 57 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 019fda7..9a0f55c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1689,6 +1689,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu) perf_pmu_enable(pmu); return 0; } +/* + * a fake_cpuc is used to validate event groups. Due to + * the extra reg logic, we need to also allocate a fake + * per_core and per_cpu structure. Otherwise, group events + * using extra reg may conflict without the kernel being + * able to catch this when the last event gets added to + * the group. + */ +static void free_fake_cpuc(struct cpu_hw_events *cpuc) +{ + kfree(cpuc->shared_regs); + kfree(cpuc); +} + +static struct cpu_hw_events *allocate_fake_cpuc(void) +{ + struct cpu_hw_events *cpuc; + int cpu = raw_smp_processor_id(); + + cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); + if (!cpuc) + return ERR_PTR(-ENOMEM); + + /* only needed, if we have extra_regs */ + if (x86_pmu.extra_regs) { + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + goto error; + } + return cpuc; +error: + free_fake_cpuc(cpuc); + return ERR_PTR(-ENOMEM); +} /* * validate that we can schedule this event @@ -1699,9 +1733,9 @@ static int validate_event(struct perf_event *event) struct event_constraint *c; int ret = 0; - fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); - if (!fake_cpuc) - return -ENOMEM; + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); c = x86_pmu.get_event_constraints(fake_cpuc, event); @@ -1711,7 +1745,7 @@ static int validate_event(struct perf_event *event) if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); - kfree(fake_cpuc); + free_fake_cpuc(fake_cpuc); return ret; } @@ -1731,35 +1765,32 @@ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; - int ret, n; + int ret = -ENOSPC, n; - ret = -ENOMEM; - fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); - if (!fake_cpuc) - goto out; + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); /* * the event is not yet connected with its * siblings therefore we must first collect * existing siblings, then add the new event * before we can simulate the scheduling */ - ret = -ENOSPC; n = collect_events(fake_cpuc, leader, true); if (n < 0) - goto out_free; + goto out; fake_cpuc->n_events = n; n = collect_events(fake_cpuc, event, false); if (n < 0) - goto out_free; + goto out; fake_cpuc->n_events = n; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); -out_free: - kfree(fake_cpuc); out: + free_fake_cpuc(fake_cpuc); return ret; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6ad95ba..ac02b83 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1027,14 +1027,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, { struct event_constraint *c = &emptyconstraint; struct er_account *era; + unsigned long flags; /* already allocated shared msr */ - if (reg->alloc || !cpuc->shared_regs) + if (reg->alloc) return &unconstrained; era = &cpuc->shared_regs->regs[reg->idx]; - - raw_spin_lock(&era->lock); + /* + * we use spin_lock_irqsave() to avoid lockdep issues when + * passing a fake cpuc + */ + raw_spin_lock_irqsave(&era->lock, flags); if (!atomic_read(&era->ref) || era->config == reg->config) { @@ -1058,7 +1062,7 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, */ c = &unconstrained; } - raw_spin_unlock(&era->lock); + raw_spin_unlock_irqrestore(&era->lock, flags); return c; } @@ -1524,4 +1528,8 @@ static int intel_pmu_init(void) return 0; } +static struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + return NULL; +} #endif /* CONFIG_CPU_SUP_INTEL */ -- cgit v1.1 From ee89cbc2d48150c7c0e9f2aaac00afde99af098c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:12 +0200 Subject: perf_events: Add Intel Sandy Bridge offcore_response low-level support This patch adds Intel Sandy Bridge offcore_response support by providing the low-level constraint table for those events. On Sandy Bridge, there are two offcore_response events. Each uses its own dedictated extra register. But those registers are NOT shared between sibling CPUs when HT is on unlike Nehalem/Westmere. They are always private to each CPU. But they still need to be controlled within an event group. All events within an event group must use the same value for the extra MSR. That's not controlled by the second patch in this series. Furthermore on Sandy Bridge, the offcore_response events have NO counter constraints contrary to what the official documentation indicates, so drop the events from the contraint table. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145712.GA7304@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 13 ++++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9a0f55c..583f311 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -327,6 +327,7 @@ struct x86_pmu { * Extra registers for events */ struct extra_reg *extra_regs; + bool regs_no_ht_sharing; }; static struct x86_pmu x86_pmu __read_mostly; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ac02b83..a674ae4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -100,8 +100,6 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */ - INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ EVENT_CONSTRAINT_END @@ -122,6 +120,12 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct extra_reg intel_snb_extra_regs[] __read_mostly = { + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + EVENT_EXTRA_END +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -1260,7 +1264,7 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpuc->shared_regs) + if (!cpuc->shared_regs || x86_pmu.regs_no_ht_sharing) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { @@ -1502,6 +1506,9 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_events; + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.regs_no_ht_sharing = true; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; -- cgit v1.1 From b79e8941fb9af07d810da91b4e29da2bba331b6e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 May 2011 11:08:15 +0200 Subject: perf, intel: Try alternative OFFCORE encodings Since the OFFCORE registers are fully symmetric, try the other one when the specified one is already in use. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1306141897.18455.8.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 5 +++- arch/x86/kernel/cpu/perf_event_intel.c | 44 ++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 583f311..c53d433 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -327,9 +327,12 @@ struct x86_pmu { * Extra registers for events */ struct extra_reg *extra_regs; - bool regs_no_ht_sharing; + unsigned int er_flags; }; +#define ERF_NO_HT_SHARING 1 +#define ERF_HAS_RSP_1 2 + static struct x86_pmu x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a674ae4..5c44862 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1018,6 +1018,29 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +static bool intel_try_alt_er(struct perf_event *event, int orig_idx) +{ + if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) + return false; + + if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01bb; + event->hw.extra_reg.idx = EXTRA_REG_RSP_1; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; + } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01b7; + event->hw.extra_reg.idx = EXTRA_REG_RSP_0; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } + + if (event->hw.extra_reg.idx == orig_idx) + return false; + + return true; +} + /* * manage allocation of shared extra msr for certain events * @@ -1027,16 +1050,19 @@ intel_bts_constraints(struct perf_event *event) */ static struct event_constraint * __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, - struct hw_perf_event_extra *reg) + struct perf_event *event) { struct event_constraint *c = &emptyconstraint; + struct hw_perf_event_extra *reg = &event->hw.extra_reg; struct er_account *era; unsigned long flags; + int orig_idx = reg->idx; /* already allocated shared msr */ if (reg->alloc) return &unconstrained; +again: era = &cpuc->shared_regs->regs[reg->idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when @@ -1065,6 +1091,9 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, * the regular event constraint table. */ c = &unconstrained; + } else if (intel_try_alt_er(event, orig_idx)) { + raw_spin_unlock(&era->lock); + goto again; } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1099,11 +1128,10 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { struct event_constraint *c = NULL; - struct hw_perf_event_extra *xreg; - xreg = &event->hw.extra_reg; - if (xreg->idx != EXTRA_REG_NONE) - c = __intel_shared_reg_get_constraints(cpuc, xreg); + if (event->hw.extra_reg.idx != EXTRA_REG_NONE) + c = __intel_shared_reg_get_constraints(cpuc, event); + return c; } @@ -1264,7 +1292,7 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpuc->shared_regs || x86_pmu.regs_no_ht_sharing) + if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING)) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { @@ -1489,6 +1517,7 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; /* UOPS_ISSUED.STALLED_CYCLES */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; @@ -1508,7 +1537,8 @@ static __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_snb_pebs_events; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ - x86_pmu.regs_no_ht_sharing = true; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; -- cgit v1.1 From 89d6c0b5bdbb1927775584dcf532d98b3efe1477 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Apr 2011 23:37:06 +0200 Subject: perf, arch: Add generic NODE cache events Add a NODE level to the generic cache events which is used to measure local vs remote memory accesses. Like all other cache events, an ACCESS is HIT+MISS, if there is no way to distinguish between reads and writes do reads only etc.. The below needs filling out for !x86 (which I filled out with unsupported events). I'm fairly sure ARM can leave it like that since it doesn't strike me as an architecture that even has NUMA support. SH might have something since it does appear to have some NUMA bits. Sparc64, PowerPC and MIPS certainly want a good look there since they clearly are NUMA capable. Signed-off-by: Peter Zijlstra Cc: David Miller Cc: Anton Blanchard Cc: David Daney Cc: Deng-Cheng Zhu Cc: Paul Mundt Cc: Will Deacon Cc: Robert Richter Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1303508226.4865.8.camel@laptop Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 14 ++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 59 +++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event_p4.c | 14 ++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index fe29c1d..941caa2 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */ + [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; /* diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 5c44862..bf6f92f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -226,6 +226,21 @@ static __initconst const u64 snb_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + }; static __initconst const u64 westmere_hw_cache_event_ids @@ -327,6 +342,20 @@ static __initconst const u64 westmere_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, }; /* @@ -379,7 +408,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, }, - } + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM, + }, + }, }; static __initconst const u64 nehalem_hw_cache_event_ids @@ -481,6 +524,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, }; static __initconst const u64 core2_hw_cache_event_ids diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index d6e6a67..fb901c5 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -554,6 +554,20 @@ static __initconst const u64 p4_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static u64 p4_general_events[PERF_COUNT_HW_MAX] = { -- cgit v1.1 From 4dc0da86967d5463708631d02a70cfed5b104884 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 29 Jun 2011 18:42:35 +0300 Subject: perf: Add context field to perf_event The perf_event overflow handler does not receive any caller-derived argument, so many callers need to resort to looking up the perf_event in their local data structure. This is ugly and doesn't scale if a single callback services many perf_events. Fix by adding a context parameter to perf_event_create_kernel_counter() (and derived hardware breakpoints APIs) and storing it in the perf_event. The field can be accessed from the callback as event->overflow_handler_context. All callers are updated. Signed-off-by: Avi Kivity Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309362157-6596-2-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 2 +- arch/x86/kernel/ptrace.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 98da6a7..00354d4 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -638,7 +638,7 @@ void kgdb_arch_late(void) for (i = 0; i < HBP_NUM; i++) { if (breakinfo[i].pev) continue; - breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL); if (IS_ERR((void * __force)breakinfo[i].pev)) { printk(KERN_ERR "kgdb: Could not allocate hw" "breakpoints\nDisabling the kernel debugger\n"); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 11db2e9..8252879 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -715,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); /* * CHECKME: the previous code returned -EIO if the addr wasn't -- cgit v1.1 From 0af3ac1fdb9d5c297b4b07c9e0172531d42b6716 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 29 Jun 2011 18:42:36 +0300 Subject: x86, perf: Add constraints for architectural PMU The v1 PMU does not have any fixed counters. Using the v2 constraints, which do have fixed counters, causes an additional choice to be present in the weight calculation, but not when actually scheduling the event, leading to an event being not scheduled at all. Signed-off-by: Avi Kivity Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309362157-6596-3-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index bf6f92f..45fbb8f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -112,6 +112,11 @@ static struct extra_reg intel_westmere_extra_regs[] __read_mostly = EVENT_EXTRA_END }; +static struct event_constraint intel_v1_event_constraints[] __read_mostly = +{ + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -1606,11 +1611,19 @@ static __init int intel_pmu_init(void) break; default: - /* - * default constraints for v2 and up - */ - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("generic architected perfmon, "); + switch (x86_pmu.version) { + case 1: + x86_pmu.event_constraints = intel_v1_event_constraints; + pr_cont("generic architected perfmon v1, "); + break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + break; + } } return 0; } -- cgit v1.1 From 50c31e4a2497ea17747b587e8f96b278f07f5483 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Fri, 1 Jul 2011 22:42:08 +0400 Subject: x86, mtrr: Use pci_dev->revision This code uses PCI_CLASS_REVISION instead of PCI_REVISION_ID, so it wasn't converted by commit 44c10138fd4 ("PCI: Change all drivers to use pci_device->revision") before being moved to arch/x86/... Do it now at last -- and save one level of indentation... Signed-off-by: Sergei Shtylyov Cc: Suresh Siddha Link: http://lkml.kernel.org/r/201107012242.08347.sshtylyov@ru.mvista.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 707b637..08119a3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -79,7 +79,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops) static int have_wrcomb(void) { struct pci_dev *dev; - u8 rev; dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); if (dev != NULL) { @@ -89,13 +88,11 @@ static int have_wrcomb(void) * chipsets to be tagged */ if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && - dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { - pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); - if (rev <= 5) { - pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); - pci_dev_put(dev); - return 0; - } + dev->device == PCI_DEVICE_ID_SERVERWORKS_LE && + dev->revision <= 5) { + pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; } /* * Intel 450NX errata # 23. Non ascending cacheline evictions to -- cgit v1.1 From 9e46294dadedc0c04adcb8ce760bd2cd74f7332d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 15:00:52 +0200 Subject: x86: Save stack pointer in perf live regs savings In order to prepare for fetching the stack pointer from the regs when possible in dump_trace() instead of taking the local one, save the current stack pointer in perf live regs saving. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/x86/include/asm/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index d9d4dae..094fb30 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -152,6 +152,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); (regs)->bp = caller_frame_pointer(); \ (regs)->cs = __KERNEL_CS; \ regs->flags = 0; \ + asm volatile( \ + _ASM_MOV "%%"_ASM_SP ", %0\n" \ + : "=m" ((regs)->sp) \ + :: "memory" \ + ); \ } #else -- cgit v1.1 From 47ce11a2b6519f9c7843223ea8e561eb71ea5896 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 Jun 2011 19:04:56 +0200 Subject: x86: Fetch stack from regs when possible in dump_trace() When regs are passed to dump_stack(), we fetch the frame pointer from the regs but the stack pointer is taken from the current frame. Thus the frame and stack pointers may not come from the same context. For example this can result in the unwinder to think the context is in irq, due to the current value of the stack, but the frame pointer coming from the regs points to a frame from another place. It then tries to fix up the irq link but ends up dereferencing a random frame pointer that doesn't belong to the irq stack: [ 9131.706906] ------------[ cut here ]------------ [ 9131.707003] WARNING: at arch/x86/kernel/dumpstack_64.c:129 dump_trace+0x2aa/0x330() [ 9131.707003] Hardware name: AMD690VM-FMH [ 9131.707003] Perf: bad frame pointer = 0000000000000005 in callchain [ 9131.707003] Modules linked in: [ 9131.707003] Pid: 1050, comm: perf Not tainted 3.0.0-rc3+ #181 [ 9131.707003] Call Trace: [ 9131.707003] [] warn_slowpath_common+0x7a/0xb0 [ 9131.707003] [] warn_slowpath_fmt+0x41/0x50 [ 9131.707003] [] ? bad_to_user+0x6d/0x10be [ 9131.707003] [] dump_trace+0x2aa/0x330 [ 9131.707003] [] ? native_sched_clock+0x13/0x50 [ 9131.707003] [] perf_callchain_kernel+0x54/0x70 [ 9131.707003] [] perf_prepare_sample+0x19f/0x2a0 [ 9131.707003] [] __perf_event_overflow+0x16c/0x290 [ 9131.707003] [] ? __perf_event_overflow+0x130/0x290 [ 9131.707003] [] ? native_sched_clock+0x13/0x50 [ 9131.707003] [] ? sched_clock+0x9/0x10 [ 9131.707003] [] ? T.375+0x15/0x90 [ 9131.707003] [] ? trace_hardirqs_on_caller+0x64/0x180 [ 9131.707003] [] ? trace_hardirqs_off+0xd/0x10 [ 9131.707003] [] perf_event_overflow+0x14/0x20 [ 9131.707003] [] perf_swevent_hrtimer+0x11c/0x130 [ 9131.707003] [] ? error_exit+0x51/0xb0 [ 9131.707003] [] __run_hrtimer+0x83/0x1e0 [ 9131.707003] [] ? perf_event_overflow+0x20/0x20 [ 9131.707003] [] hrtimer_interrupt+0x106/0x250 [ 9131.707003] [] ? trace_hardirqs_off_thunk+0x3a/0x3c [ 9131.707003] [] smp_apic_timer_interrupt+0x53/0x90 [ 9131.707003] [] apic_timer_interrupt+0x13/0x20 [ 9131.707003] [] ? error_exit+0x51/0xb0 [ 9131.707003] [] ? error_exit+0x4c/0xb0 [ 9131.707003] ---[ end trace b2560d4876709347 ]--- Fix this by simply taking the stack pointer from regs->sp when regs are provided. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/x86/kernel/dumpstack_64.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index e71c98d..788295c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -155,9 +155,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, task = current; if (!stack) { - stack = &dummy; - if (task && task != current) + if (regs) + stack = (unsigned long *)regs->sp; + else if (task && task != current) stack = (unsigned long *)task->thread.sp; + else + stack = &dummy; } if (!bp) -- cgit v1.1 From 1871853f7abc3c727c4346539c5062cbeaf016a4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 1 Jul 2011 01:51:22 +0200 Subject: x86,64: Simplify save_regs() The save_regs function that saves the regs on low level irq entry is complicated because of the fact it changes its stack in the middle and also because it manipulates data allocated in the caller frame and accesses there are directly calculated from callee rsp value with the return address in the middle of the way. This complicates the static stack offsets calculation and require more dynamic ones. It also needs a save/restore of the function's return address. To simplify and optimize this, turn save_regs() into a macro. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/entry_64.S | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8a445a0..b6b2e85 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -297,27 +297,22 @@ ENDPROC(native_usergs_sysret64) .endm /* save partial stack frame */ - .pushsection .kprobes.text, "ax" -ENTRY(save_args) - XCPT_FRAME + .macro SAVE_ARGS_IRQ cld - /* - * start from rbp in pt_regs and jump over - * return address. - */ - movq_cfi rdi, RDI+8-RBP - movq_cfi rsi, RSI+8-RBP - movq_cfi rdx, RDX+8-RBP - movq_cfi rcx, RCX+8-RBP - movq_cfi rax, RAX+8-RBP - movq_cfi r8, R8+8-RBP - movq_cfi r9, R9+8-RBP - movq_cfi r10, R10+8-RBP - movq_cfi r11, R11+8-RBP - - leaq -RBP+8(%rsp),%rdi /* arg1 for handler */ - movq_cfi rbp, 8 /* push %rbp */ - leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ + /* start from rbp in pt_regs and jump over */ + movq_cfi rdi, RDI-RBP + movq_cfi rsi, RSI-RBP + movq_cfi rdx, RDX-RBP + movq_cfi rcx, RCX-RBP + movq_cfi rax, RAX-RBP + movq_cfi r8, R8-RBP + movq_cfi r9, R9-RBP + movq_cfi r10, R10-RBP + movq_cfi r11, R11-RBP + + leaq -RBP(%rsp),%rdi /* arg1 for handler */ + movq_cfi rbp, 0 /* push %rbp */ + movq %rsp, %rbp testl $3, CS(%rdi) je 1f SWAPGS @@ -329,19 +324,14 @@ ENTRY(save_args) */ 1: incl PER_CPU_VAR(irq_count) jne 2f - popq_cfi %rax /* move return address... */ mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 pushq_cfi %rbp /* backlink for unwinder */ - pushq_cfi %rax /* ... to the new stack */ /* * We entered an interrupt context - irqs are off: */ 2: TRACE_IRQS_OFF - ret - CFI_ENDPROC -END(save_args) - .popsection + .endm ENTRY(save_rest) PARTIAL_FRAME 1 REST_SKIP+8 @@ -791,7 +781,7 @@ END(interrupt) /* reserve pt_regs for scratch regs and rbp */ subq $ORIG_RAX-RBP, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP - call save_args + SAVE_ARGS_IRQ PARTIAL_FRAME 0 call \func .endm -- cgit v1.1 From 3b99a3ef55b292180473a221f3d6bc24455f0632 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 1 Jul 2011 02:25:17 +0200 Subject: x86,64: Separate arg1 from rbp handling in SAVE_REGS_IRQ Just for clarity in the code. Have a first block that handles the frame pointer and a separate one that handles pt_regs pointer and its use. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/entry_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b6b2e85..20dc8e6 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -310,9 +310,10 @@ ENDPROC(native_usergs_sysret64) movq_cfi r10, R10-RBP movq_cfi r11, R11-RBP - leaq -RBP(%rsp),%rdi /* arg1 for handler */ movq_cfi rbp, 0 /* push %rbp */ movq %rsp, %rbp + + leaq -RBP(%rsp),%rdi /* arg1 for handler */ testl $3, CS(%rdi) je 1f SWAPGS -- cgit v1.1 From 48ffee7d9e6df51b4957bed64115b7beed671374 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 15:03:44 +0200 Subject: x86: Remove useless unwinder backlink from irq regs saving The unwinder backlink in interrupt entry is very useless. It's actually not part of the stack frame chain and thus is never used. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/entry_64.S | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 20dc8e6..6131432 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -327,7 +327,6 @@ ENDPROC(native_usergs_sysret64) jne 2f mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 - pushq_cfi %rbp /* backlink for unwinder */ /* * We entered an interrupt context - irqs are off: */ -- cgit v1.1 From a2bbe75089d5eb9a3a46d50dd5c215e213790288 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 16:52:45 +0200 Subject: x86: Don't use frame pointer to save old stack on irq entry rbp is used in SAVE_ARGS_IRQ to save the old stack pointer in order to restore it later in ret_from_intr. It is convenient because we save its value in the irq regs and it's easily restored using the leave instruction. However this is a kind of abuse of the frame pointer which role is to help unwinding the kernel by chaining frames together, each node following the return address to the previous frame. But although we are breaking the frame by changing the stack pointer, there is no preceding return address before the new frame. Hence using the frame pointer to link the two stacks breaks the stack unwinders that find a random value instead of a return address here. There is no workaround that can work in every case. We are using the fixup_bp_irq_link() function to dereference that abused frame pointer in the case of non nesting interrupt (which means stack changed). But that doesn't fix the case of interrupts that don't change the stack (but we still have the unconditional frame link), which is the case of hardirq interrupting softirq. We have no way to detect this transition so the frame irq link is considered as a real frame pointer and the return address is dereferenced but it is still a spurious one. There are two possible results of this: either the spurious return address, a random stack value, luckily belongs to the kernel text and then the unwinding can continue and we just have a weird entry in the stack trace. Or it doesn't belong to the kernel text and unwinding stops there. This is the reason why stacktraces (including perf callchains) on irqs that interrupted softirqs don't work very well. To solve this, we don't save the old stack pointer on rbp anymore but we save it to a scratch register that we push on the new stack and that we pop back later on irq return. This preserves the whole frame chain without spurious return addresses in the middle and drops the need for the horrid fixup_bp_irq_link() workaround. And finally irqs that interrupt softirq are sanely unwinded. Before: 99.81% perf [kernel.kallsyms] [k] perf_pending_event | --- perf_pending_event irq_work_run smp_irq_work_interrupt irq_work_interrupt | |--41.60%-- __read | | | |--99.90%-- create_worker | | bench_sched_messaging | | cmd_bench | | run_builtin | | main | | __libc_start_main | --0.10%-- [...] After: 1.64% swapper [kernel.kallsyms] [k] perf_pending_event | --- perf_pending_event irq_work_run smp_irq_work_interrupt irq_work_interrupt | |--95.00%-- arch_irq_work_raise | irq_work_queue | __perf_event_overflow | perf_swevent_overflow | perf_swevent_event | perf_tp_event | perf_trace_softirq | __do_softirq | call_softirq | do_softirq | irq_exit | | | |--73.68%-- smp_apic_timer_interrupt | | apic_timer_interrupt | | | | | |--96.43%-- amd_e400_idle | | | cpu_idle | | | start_secondary Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/dumpstack_64.c | 30 ------------------------------ arch/x86/kernel/entry_64.S | 27 +++++++++++++++------------ 2 files changed, 15 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 788295c..19853ad 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -105,34 +105,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack, } /* - * We are returning from the irq stack and go to the previous one. - * If the previous stack is also in the irq stack, then bp in the first - * frame of the irq stack points to the previous, interrupted one. - * Otherwise we have another level of indirection: We first save - * the bp of the previous stack, then we switch the stack to the irq one - * and save a new bp that links to the previous one. - * (See save_args()) - */ -static inline unsigned long -fixup_bp_irq_link(unsigned long bp, unsigned long *stack, - unsigned long *irq_stack, unsigned long *irq_stack_end) -{ -#ifdef CONFIG_FRAME_POINTER - struct stack_frame *frame = (struct stack_frame *)bp; - unsigned long next; - - if (!in_irq_stack(stack, irq_stack, irq_stack_end)) { - if (!probe_kernel_address(&frame->next_frame, next)) - return next; - else - WARN_ONCE(1, "Perf: bad frame pointer = %p in " - "callchain\n", &frame->next_frame); - } -#endif - return bp; -} - -/* * x86-64 can have up to three kernel stacks: * process stack * interrupt stack @@ -208,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * pointer (index -1 to end) in the IRQ stack: */ stack = (unsigned long *) (irq_stack_end[-1]); - bp = fixup_bp_irq_link(bp, stack, irq_stack, - irq_stack_end); irq_stack_end = NULL; ops->stack(data, "EOI"); continue; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6131432..d656f68 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -310,8 +310,11 @@ ENDPROC(native_usergs_sysret64) movq_cfi r10, R10-RBP movq_cfi r11, R11-RBP - movq_cfi rbp, 0 /* push %rbp */ - movq %rsp, %rbp + /* Save rbp so that we can unwind from get_irq_regs() */ + movq_cfi rbp, 0 + + /* Save previous stack value */ + movq %rsp, %rsi leaq -RBP(%rsp),%rdi /* arg1 for handler */ testl $3, CS(%rdi) @@ -327,10 +330,11 @@ ENDPROC(native_usergs_sysret64) jne 2f mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 - /* - * We entered an interrupt context - irqs are off: - */ -2: TRACE_IRQS_OFF + +2: /* Store previous stack value */ + pushq %rsi + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF .endm ENTRY(save_rest) @@ -804,15 +808,14 @@ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) - leaveq - CFI_RESTORE rbp + /* Restore saved previous stack */ + popq %rsi + leaq 16(%rsi), %rsp + CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 + CFI_ADJUST_CFA_OFFSET -16 - /* we did not save rbx, restore only from ARGOFFSET */ - addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) -- cgit v1.1 From d80603c9d876efafd8b07469c891076de470e323 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 5 Jul 2011 12:22:18 +0100 Subject: x86, efi: Properly pre-initialize table pointers Consumers of the table pointers in struct efi check for EFI_INVALID_TABLE_ADDR to determine validity, hence these pointers should all be pre-initialized to this value (rather than zero). Noticed by the discrepancy between efivars' systab sysfs entry showing all tables (and their pointers) despite the code intending to only display the valid ones. No other bad effects known, but having the various table parsing routines bogusly access physical address zero is certainly not very desirable (even though they're unlikely to find anything useful there). Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4E13100A020000780004C256@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a0e4244..11e766d 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -51,7 +51,17 @@ int efi_enabled; EXPORT_SYMBOL(efi_enabled); -struct efi efi; +struct efi __read_mostly efi = { + .mps = EFI_INVALID_TABLE_ADDR, + .acpi = EFI_INVALID_TABLE_ADDR, + .acpi20 = EFI_INVALID_TABLE_ADDR, + .smbios = EFI_INVALID_TABLE_ADDR, + .sal_systab = EFI_INVALID_TABLE_ADDR, + .boot_info = EFI_INVALID_TABLE_ADDR, + .hcdp = EFI_INVALID_TABLE_ADDR, + .uga = EFI_INVALID_TABLE_ADDR, + .uv_systab = EFI_INVALID_TABLE_ADDR, +}; EXPORT_SYMBOL(efi); struct efi_memory_map memmap; -- cgit v1.1 From f70d8ef4745349000dc599b6873a8b866289c694 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sat, 25 Jun 2011 17:34:08 +0100 Subject: x86, olpc: Add missing elements to device tree In response to new device tree code in the kernel, OLPC will start using it for probing of certain devices. However, some firmware fixes are needed to put the devicetree into a usable state. Retain compatibility with old firmware by fixing up the device tree at boot-time if it does not contain the new nodes/properties that we need for probing. This is the same approach taken on PPC platforms. Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/1309019658-1712-2-git-send-email-dsd@laptop.org Acked-by: Grant Likely Acked-by: Andres Salomon Cc: devicetree-discuss@lists.ozlabs.org Signed-off-by: H. Peter Anvin --- arch/x86/platform/olpc/olpc_dt.c | 103 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index d39f63d..d6ee929 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -165,6 +165,107 @@ static struct of_pdt_ops prom_olpc_ops __initdata = { .pkg2path = olpc_dt_pkg2path, }; +static phandle __init olpc_dt_finddevice(const char *path) +{ + phandle node; + const void *args[] = { path }; + void *res[] = { &node }; + + if (olpc_ofw("finddevice", args, res)) { + pr_err("olpc_dt: finddevice failed!\n"); + return 0; + } + + if ((s32) node == -1) + return 0; + + return node; +} + +static int __init olpc_dt_interpret(const char *words) +{ + int result; + const void *args[] = { words }; + void *res[] = { &result }; + + if (olpc_ofw("interpret", args, res)) { + pr_err("olpc_dt: interpret failed!\n"); + return -1; + } + + return result; +} + +/* + * Extract board revision directly from OFW device tree. + * We can't use olpc_platform_info because that hasn't been set up yet. + */ +static u32 __init olpc_dt_get_board_revision(void) +{ + phandle node; + __be32 rev; + int r; + + node = olpc_dt_finddevice("/"); + if (!node) + return 0; + + r = olpc_dt_getproperty(node, "board-revision-int", + (char *) &rev, sizeof(rev)); + if (r < 0) + return 0; + + return be32_to_cpu(rev); +} + +void __init olpc_dt_fixup(void) +{ + int r; + char buf[64]; + phandle node; + u32 board_rev; + + node = olpc_dt_finddevice("/battery@0"); + if (!node) + return; + + /* + * If the battery node has a compatible property, we are running a new + * enough firmware and don't have fixups to make. + */ + r = olpc_dt_getproperty(node, "compatible", buf, sizeof(buf)); + if (r > 0) + return; + + pr_info("PROM DT: Old firmware detected, applying fixes\n"); + + /* Add olpc,xo1-battery compatible marker to battery node */ + olpc_dt_interpret("\" /battery@0\" find-device" + " \" olpc,xo1-battery\" +compatible" + " device-end"); + + board_rev = olpc_dt_get_board_revision(); + if (!board_rev) + return; + + if (board_rev >= olpc_board_pre(0xd0)) { + /* XO-1.5: add dcon device */ + olpc_dt_interpret("\" /pci/display@1\" find-device" + " new-device" + " \" dcon\" device-name \" olpc,xo1-dcon\" +compatible" + " finish-device device-end"); + } else { + /* XO-1: add dcon device, mark RTC as olpc,xo1-rtc */ + olpc_dt_interpret("\" /pci/display@1,1\" find-device" + " new-device" + " \" dcon\" device-name \" olpc,xo1-dcon\" +compatible" + " finish-device device-end" + " \" /rtc\" find-device" + " \" olpc,xo1-rtc\" +compatible" + " device-end"); + } +} + void __init olpc_dt_build_devicetree(void) { phandle root; @@ -172,6 +273,8 @@ void __init olpc_dt_build_devicetree(void) if (!olpc_ofw_is_installed()) return; + olpc_dt_fixup(); + root = olpc_dt_getsibling(0); if (!root) { pr_err("PROM: unable to get root node from OFW!\n"); -- cgit v1.1 From 7a0d4fcf6d4b80b30503fd2701eeef1883e11404 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sat, 25 Jun 2011 17:34:09 +0100 Subject: x86, olpc: Move CS5536-related constants to cs5535.h Move these definitions into the relevant header file. This was requested in the review of the upcoming XO-1 suspend/resume code. Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/1309019658-1712-3-git-send-email-dsd@laptop.org Acked-by: Andres Salomon Signed-off-by: H. Peter Anvin --- arch/x86/platform/olpc/olpc-xo1.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c index ab81fb2..a63e948 100644 --- a/arch/x86/platform/olpc/olpc-xo1.c +++ b/arch/x86/platform/olpc/olpc-xo1.c @@ -12,6 +12,7 @@ * (at your option) any later version. */ +#include #include #include #include @@ -22,17 +23,6 @@ #define DRV_NAME "olpc-xo1" -/* PMC registers (PMS block) */ -#define PM_SCLK 0x10 -#define PM_IN_SLPCTL 0x20 -#define PM_WKXD 0x34 -#define PM_WKD 0x30 -#define PM_SSC 0x54 - -/* PM registers (ACPI block) */ -#define PM1_CNT 0x08 -#define PM_GPE0_STS 0x18 - static unsigned long acpi_base; static unsigned long pms_base; @@ -41,17 +31,17 @@ static void xo1_power_off(void) printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); /* Enable all of these controls with 0 delay */ - outl(0x40000000, pms_base + PM_SCLK); - outl(0x40000000, pms_base + PM_IN_SLPCTL); - outl(0x40000000, pms_base + PM_WKXD); - outl(0x40000000, pms_base + PM_WKD); + outl(0x40000000, pms_base + CS5536_PM_SCLK); + outl(0x40000000, pms_base + CS5536_PM_IN_SLPCTL); + outl(0x40000000, pms_base + CS5536_PM_WKXD); + outl(0x40000000, pms_base + CS5536_PM_WKD); /* Clear status bits (possibly unnecessary) */ - outl(0x0002ffff, pms_base + PM_SSC); - outl(0xffffffff, acpi_base + PM_GPE0_STS); + outl(0x0002ffff, pms_base + CS5536_PM_SSC); + outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS); /* Write SLP_EN bit to start the machinery */ - outl(0x00002000, acpi_base + PM1_CNT); + outl(0x00002000, acpi_base + CS5536_PM1_CNT); } static int __devinit olpc_xo1_probe(struct platform_device *pdev) -- cgit v1.1 From a3128588b3c6be634a9013a375903e0b55668f0a Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sat, 25 Jun 2011 17:34:10 +0100 Subject: x86, olpc: Rename olpc-xo1 to olpc-xo1-pm Based on earlier review comments, we'll no longer try to stick all of our XO-1 goodies in a single driver. We'll split it into a power management driver, and an EC/SCI driver. As a first step, rename olpc-xo1 to olpc-xo1-pm, and make it builtin instead of modular. Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/1309019658-1712-4-git-send-email-dsd@laptop.org Acked-by: Andres Salomon Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 7 +- arch/x86/platform/olpc/Makefile | 2 +- arch/x86/platform/olpc/olpc-xo1-pm.c | 123 +++++++++++++++++++++++++++++++ arch/x86/platform/olpc/olpc-xo1.c | 136 ----------------------------------- 4 files changed, 128 insertions(+), 140 deletions(-) create mode 100644 arch/x86/platform/olpc/olpc-xo1-pm.c delete mode 100644 arch/x86/platform/olpc/olpc-xo1.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da34972..29615ee 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2073,11 +2073,12 @@ config OLPC Add support for detecting the unique features of the OLPC XO hardware. -config OLPC_XO1 - tristate "OLPC XO-1 support" +config OLPC_XO1_PM + bool "OLPC XO-1 Power Management" depends on OLPC && MFD_CS5535 + select MFD_CORE ---help--- - Add support for non-essential features of the OLPC XO-1 laptop. + Add support for poweroff of the OLPC XO-1 laptop. endif # X86_32 diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index 81c5e21..cd25038 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o -obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o +obj-$(CONFIG_OLPC_XO1_PM) += olpc-xo1-pm.o diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c new file mode 100644 index 0000000..a2a59d3 --- /dev/null +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -0,0 +1,123 @@ +/* + * Support for power management features of the OLPC XO-1 laptop + * + * Copyright (C) 2010 Andres Salomon + * Copyright (C) 2010 One Laptop per Child + * Copyright (C) 2006 Red Hat, Inc. + * Copyright (C) 2006 Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include + +#define DRV_NAME "olpc-xo1-pm" + +static unsigned long acpi_base; +static unsigned long pms_base; + +static void xo1_power_off(void) +{ + printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); + + /* Enable all of these controls with 0 delay */ + outl(0x40000000, pms_base + CS5536_PM_SCLK); + outl(0x40000000, pms_base + CS5536_PM_IN_SLPCTL); + outl(0x40000000, pms_base + CS5536_PM_WKXD); + outl(0x40000000, pms_base + CS5536_PM_WKD); + + /* Clear status bits (possibly unnecessary) */ + outl(0x0002ffff, pms_base + CS5536_PM_SSC); + outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS); + + /* Write SLP_EN bit to start the machinery */ + outl(0x00002000, acpi_base + CS5536_PM1_CNT); +} + +static int __devinit xo1_pm_probe(struct platform_device *pdev) +{ + struct resource *res; + int err; + + /* don't run on non-XOs */ + if (!machine_is_olpc()) + return -ENODEV; + + err = mfd_cell_enable(pdev); + if (err) + return err; + + res = platform_get_resource(pdev, IORESOURCE_IO, 0); + if (!res) { + dev_err(&pdev->dev, "can't fetch device resource info\n"); + return -EIO; + } + if (strcmp(pdev->name, "cs5535-pms") == 0) + pms_base = res->start; + else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0) + acpi_base = res->start; + + /* If we have both addresses, we can override the poweroff hook */ + if (pms_base && acpi_base) { + pm_power_off = xo1_power_off; + printk(KERN_INFO "OLPC XO-1 support registered\n"); + } + + return 0; +} + +static int __devexit xo1_pm_remove(struct platform_device *pdev) +{ + mfd_cell_disable(pdev); + + if (strcmp(pdev->name, "cs5535-pms") == 0) + pms_base = 0; + else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0) + acpi_base = 0; + + pm_power_off = NULL; + return 0; +} + +static struct platform_driver cs5535_pms_driver = { + .driver = { + .name = "cs5535-pms", + .owner = THIS_MODULE, + }, + .probe = xo1_pm_probe, + .remove = __devexit_p(xo1_pm_remove), +}; + +static struct platform_driver cs5535_acpi_driver = { + .driver = { + .name = "olpc-xo1-pm-acpi", + .owner = THIS_MODULE, + }, + .probe = xo1_pm_probe, + .remove = __devexit_p(xo1_pm_remove), +}; + +static int __init xo1_pm_init(void) +{ + int r; + + r = platform_driver_register(&cs5535_pms_driver); + if (r) + return r; + + r = platform_driver_register(&cs5535_acpi_driver); + if (r) + platform_driver_unregister(&cs5535_pms_driver); + + return r; +} +arch_initcall(xo1_pm_init); diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c deleted file mode 100644 index a63e948..0000000 --- a/arch/x86/platform/olpc/olpc-xo1.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Support for features of the OLPC XO-1 laptop - * - * Copyright (C) 2010 Andres Salomon - * Copyright (C) 2010 One Laptop per Child - * Copyright (C) 2006 Red Hat, Inc. - * Copyright (C) 2006 Advanced Micro Devices, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include -#include - -#define DRV_NAME "olpc-xo1" - -static unsigned long acpi_base; -static unsigned long pms_base; - -static void xo1_power_off(void) -{ - printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); - - /* Enable all of these controls with 0 delay */ - outl(0x40000000, pms_base + CS5536_PM_SCLK); - outl(0x40000000, pms_base + CS5536_PM_IN_SLPCTL); - outl(0x40000000, pms_base + CS5536_PM_WKXD); - outl(0x40000000, pms_base + CS5536_PM_WKD); - - /* Clear status bits (possibly unnecessary) */ - outl(0x0002ffff, pms_base + CS5536_PM_SSC); - outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS); - - /* Write SLP_EN bit to start the machinery */ - outl(0x00002000, acpi_base + CS5536_PM1_CNT); -} - -static int __devinit olpc_xo1_probe(struct platform_device *pdev) -{ - struct resource *res; - int err; - - /* don't run on non-XOs */ - if (!machine_is_olpc()) - return -ENODEV; - - err = mfd_cell_enable(pdev); - if (err) - return err; - - res = platform_get_resource(pdev, IORESOURCE_IO, 0); - if (!res) { - dev_err(&pdev->dev, "can't fetch device resource info\n"); - return -EIO; - } - if (strcmp(pdev->name, "cs5535-pms") == 0) - pms_base = res->start; - else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0) - acpi_base = res->start; - - /* If we have both addresses, we can override the poweroff hook */ - if (pms_base && acpi_base) { - pm_power_off = xo1_power_off; - printk(KERN_INFO "OLPC XO-1 support registered\n"); - } - - return 0; -} - -static int __devexit olpc_xo1_remove(struct platform_device *pdev) -{ - mfd_cell_disable(pdev); - - if (strcmp(pdev->name, "cs5535-pms") == 0) - pms_base = 0; - else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0) - acpi_base = 0; - - pm_power_off = NULL; - return 0; -} - -static struct platform_driver cs5535_pms_drv = { - .driver = { - .name = "cs5535-pms", - .owner = THIS_MODULE, - }, - .probe = olpc_xo1_probe, - .remove = __devexit_p(olpc_xo1_remove), -}; - -static struct platform_driver cs5535_acpi_drv = { - .driver = { - .name = "olpc-xo1-pm-acpi", - .owner = THIS_MODULE, - }, - .probe = olpc_xo1_probe, - .remove = __devexit_p(olpc_xo1_remove), -}; - -static int __init olpc_xo1_init(void) -{ - int r; - - r = platform_driver_register(&cs5535_pms_drv); - if (r) - return r; - - r = platform_driver_register(&cs5535_acpi_drv); - if (r) - platform_driver_unregister(&cs5535_pms_drv); - - return r; -} - -static void __exit olpc_xo1_exit(void) -{ - platform_driver_unregister(&cs5535_acpi_drv); - platform_driver_unregister(&cs5535_pms_drv); -} - -MODULE_AUTHOR("Daniel Drake "); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:cs5535-pms"); - -module_init(olpc_xo1_init); -module_exit(olpc_xo1_exit); -- cgit v1.1 From 97c4cb71c18fe045a763ff6681a8ebbbbbec0b2b Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sat, 25 Jun 2011 17:34:11 +0100 Subject: x86, olpc: Add XO-1 suspend/resume support Add code needed for basic suspend/resume of the XO-1 laptop. Based on earlier work by Jordan Crouse, Andres Salomon, and others. This patch incorporates all earlier feedback from Thomas Gleixner. To clarify a certain point (now more obvious in the code itself): On resume, OpenFirmware returns execution to Linux in protected mode with a kernel-compatible GDT already set up. The changes and simplifications suggested have all been included. Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/1309019658-1712-5-git-send-email-dsd@laptop.org Acked-by: Andres Salomon Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 4 +- arch/x86/include/asm/olpc.h | 15 ++++- arch/x86/platform/olpc/Makefile | 2 +- arch/x86/platform/olpc/olpc-xo1-pm.c | 92 ++++++++++++++++++++++++++ arch/x86/platform/olpc/xo1-wakeup.S | 124 +++++++++++++++++++++++++++++++++++ 5 files changed, 231 insertions(+), 6 deletions(-) create mode 100644 arch/x86/platform/olpc/xo1-wakeup.S (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 29615ee..f473151 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2075,10 +2075,10 @@ config OLPC config OLPC_XO1_PM bool "OLPC XO-1 Power Management" - depends on OLPC && MFD_CS5535 + depends on OLPC && MFD_CS5535 && PM_SLEEP select MFD_CORE ---help--- - Add support for poweroff of the OLPC XO-1 laptop. + Add support for poweroff and suspend of the OLPC XO-1 laptop. endif # X86_32 diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 5ca6801..10ea595 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -76,6 +76,12 @@ static inline int olpc_has_dcon(void) #endif +#ifdef CONFIG_OLPC_XO1_PM +extern void do_olpc_suspend_lowlevel(void); +extern void olpc_xo1_pm_wakeup_set(u16 value); +extern void olpc_xo1_pm_wakeup_clear(u16 value); +#endif + extern int pci_olpc_init(void); /* EC related functions */ @@ -88,9 +94,12 @@ extern int olpc_ec_mask_unset(uint8_t bits); /* EC commands */ -#define EC_FIRMWARE_REV 0x08 -#define EC_WLAN_ENTER_RESET 0x35 -#define EC_WLAN_LEAVE_RESET 0x25 +#define EC_FIRMWARE_REV 0x08 +#define EC_WAKE_UP_WLAN 0x24 +#define EC_WLAN_LEAVE_RESET 0x25 +#define EC_SET_SCI_INHIBIT 0x32 +#define EC_SET_SCI_INHIBIT_RELEASE 0x34 +#define EC_WLAN_ENTER_RESET 0x35 /* SCI source values */ diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index cd25038..1ae7bed 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o -obj-$(CONFIG_OLPC_XO1_PM) += olpc-xo1-pm.o +obj-$(CONFIG_OLPC_XO1_PM) += olpc-xo1-pm.o xo1-wakeup.o diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index a2a59d3..6f3855a 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,85 @@ static unsigned long acpi_base; static unsigned long pms_base; +static u16 wakeup_mask = CS5536_PM_PWRBTN; + +static struct { + unsigned long address; + unsigned short segment; +} ofw_bios_entry = { 0xF0000 + PAGE_OFFSET, __KERNEL_CS }; + +/* Set bits in the wakeup mask */ +void olpc_xo1_pm_wakeup_set(u16 value) +{ + wakeup_mask |= value; +} +EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_set); + +/* Clear bits in the wakeup mask */ +void olpc_xo1_pm_wakeup_clear(u16 value) +{ + wakeup_mask &= ~value; +} +EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_clear); + +static int xo1_power_state_enter(suspend_state_t pm_state) +{ + unsigned long saved_sci_mask; + int r; + + /* Only STR is supported */ + if (pm_state != PM_SUSPEND_MEM) + return -EINVAL; + + r = olpc_ec_cmd(EC_SET_SCI_INHIBIT, NULL, 0, NULL, 0); + if (r) + return r; + + /* + * Save SCI mask (this gets lost since PM1_EN is used as a mask for + * wakeup events, which is not necessarily the same event set) + */ + saved_sci_mask = inl(acpi_base + CS5536_PM1_STS); + saved_sci_mask &= 0xffff0000; + + /* Save CPU state */ + do_olpc_suspend_lowlevel(); + + /* Resume path starts here */ + + /* Restore SCI mask (using dword access to CS5536_PM1_EN) */ + outl(saved_sci_mask, acpi_base + CS5536_PM1_STS); + + /* Tell the EC to stop inhibiting SCIs */ + olpc_ec_cmd(EC_SET_SCI_INHIBIT_RELEASE, NULL, 0, NULL, 0); + + /* + * Tell the wireless module to restart USB communication. + * Must be done twice. + */ + olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0); + olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0); + + return 0; +} + +asmlinkage int xo1_do_sleep(u8 sleep_state) +{ + void *pgd_addr = __va(read_cr3()); + + /* Program wakeup mask (using dword access to CS5536_PM1_EN) */ + outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS); + + __asm__("movl %0,%%eax" : : "r" (pgd_addr)); + __asm__("call *(%%edi); cld" + : : "D" (&ofw_bios_entry)); + __asm__("movb $0x34, %al\n\t" + "outb %al, $0x70\n\t" + "movb $0x30, %al\n\t" + "outb %al, $0x71\n\t"); + return 0; +} + static void xo1_power_off(void) { printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); @@ -43,6 +123,17 @@ static void xo1_power_off(void) outl(0x00002000, acpi_base + CS5536_PM1_CNT); } +static int xo1_power_state_valid(suspend_state_t pm_state) +{ + /* suspend-to-RAM only */ + return pm_state == PM_SUSPEND_MEM; +} + +static const struct platform_suspend_ops xo1_suspend_ops = { + .valid = xo1_power_state_valid, + .enter = xo1_power_state_enter, +}; + static int __devinit xo1_pm_probe(struct platform_device *pdev) { struct resource *res; @@ -68,6 +159,7 @@ static int __devinit xo1_pm_probe(struct platform_device *pdev) /* If we have both addresses, we can override the poweroff hook */ if (pms_base && acpi_base) { + suspend_set_ops(&xo1_suspend_ops); pm_power_off = xo1_power_off; printk(KERN_INFO "OLPC XO-1 support registered\n"); } diff --git a/arch/x86/platform/olpc/xo1-wakeup.S b/arch/x86/platform/olpc/xo1-wakeup.S new file mode 100644 index 0000000..948deb2 --- /dev/null +++ b/arch/x86/platform/olpc/xo1-wakeup.S @@ -0,0 +1,124 @@ +.text +#include +#include +#include +#include + + .macro writepost,value + movb $0x34, %al + outb %al, $0x70 + movb $\value, %al + outb %al, $0x71 + .endm + +wakeup_start: + # OFW lands us here, running in protected mode, with a + # kernel-compatible GDT already setup. + + # Clear any dangerous flags + pushl $0 + popfl + + writepost 0x31 + + # Set up %cr3 + movl $initial_page_table - __PAGE_OFFSET, %eax + movl %eax, %cr3 + + movl saved_cr4, %eax + movl %eax, %cr4 + + movl saved_cr0, %eax + movl %eax, %cr0 + + # Control registers were modified, pipeline resync is needed + jmp 1f +1: + + movw $__KERNEL_DS, %ax + movw %ax, %ss + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + + lgdt saved_gdt + lidt saved_idt + lldt saved_ldt + ljmp $(__KERNEL_CS),$1f +1: + movl %cr3, %eax + movl %eax, %cr3 + wbinvd + + # Go back to the return point + jmp ret_point + +save_registers: + sgdt saved_gdt + sidt saved_idt + sldt saved_ldt + + pushl %edx + movl %cr4, %edx + movl %edx, saved_cr4 + + movl %cr0, %edx + movl %edx, saved_cr0 + + popl %edx + + movl %ebx, saved_context_ebx + movl %ebp, saved_context_ebp + movl %esi, saved_context_esi + movl %edi, saved_context_edi + + pushfl + popl saved_context_eflags + + ret + +restore_registers: + movl saved_context_ebp, %ebp + movl saved_context_ebx, %ebx + movl saved_context_esi, %esi + movl saved_context_edi, %edi + + pushl saved_context_eflags + popfl + + ret + +ENTRY(do_olpc_suspend_lowlevel) + call save_processor_state + call save_registers + + # This is the stack context we want to remember + movl %esp, saved_context_esp + + pushl $3 + call xo1_do_sleep + + jmp wakeup_start + .p2align 4,,7 +ret_point: + movl saved_context_esp, %esp + + writepost 0x32 + + call restore_registers + call restore_processor_state + ret + +.data +saved_gdt: .long 0,0 +saved_idt: .long 0,0 +saved_ldt: .long 0 +saved_cr4: .long 0 +saved_cr0: .long 0 +saved_context_esp: .long 0 +saved_context_edi: .long 0 +saved_context_esi: .long 0 +saved_context_ebx: .long 0 +saved_context_ebp: .long 0 +saved_context_eflags: .long 0 -- cgit v1.1 From 7feda8e9f35ebb0e9f90e03acb02280bc137f784 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sat, 25 Jun 2011 17:34:12 +0100 Subject: x86, olpc: Add XO-1 SCI driver and power button control The System Control Interrupt is used in the OLPC XO-1 to control various features of the laptop. Add the driver base and the power button functionality. This driver can't be built as a module, because functionality added in future patches means that some drivers need to know at boot-time whether SCI-based functionality is available. Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/1309019658-1712-6-git-send-email-dsd@laptop.org Acked-by: Andres Salomon Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 9 ++ arch/x86/platform/olpc/Makefile | 1 + arch/x86/platform/olpc/olpc-xo1-sci.c | 191 ++++++++++++++++++++++++++++++++++ 3 files changed, 201 insertions(+) create mode 100644 arch/x86/platform/olpc/olpc-xo1-sci.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f473151..66b7b9d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2080,6 +2080,15 @@ config OLPC_XO1_PM ---help--- Add support for poweroff and suspend of the OLPC XO-1 laptop. +config OLPC_XO1_SCI + bool "OLPC XO-1 SCI extras" + depends on OLPC && OLPC_XO1_PM + select GPIO_CS5535 + select MFD_CORE + ---help--- + Add support for SCI-based features of the OLPC XO-1 laptop: + - Power button + endif # X86_32 config AMD_NB diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index 1ae7bed..1ec5ade 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o obj-$(CONFIG_OLPC_XO1_PM) += olpc-xo1-pm.o xo1-wakeup.o +obj-$(CONFIG_OLPC_XO1_SCI) += olpc-xo1-sci.o diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c new file mode 100644 index 0000000..8fbf961 --- /dev/null +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -0,0 +1,191 @@ +/* + * Support for OLPC XO-1 System Control Interrupts (SCI) + * + * Copyright (C) 2010 One Laptop per Child + * Copyright (C) 2006 Red Hat, Inc. + * Copyright (C) 2006 Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define DRV_NAME "olpc-xo1-sci" +#define PFX DRV_NAME ": " + +static unsigned long acpi_base; +static struct input_dev *power_button_idev; +static int sci_irq; + +static irqreturn_t xo1_sci_intr(int irq, void *dev_id) +{ + struct platform_device *pdev = dev_id; + u32 sts; + u32 gpe; + + sts = inl(acpi_base + CS5536_PM1_STS); + outl(sts | 0xffff, acpi_base + CS5536_PM1_STS); + + gpe = inl(acpi_base + CS5536_PM_GPE0_STS); + outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS); + + dev_dbg(&pdev->dev, "sts %x gpe %x\n", sts, gpe); + + if (sts & CS5536_PWRBTN_FLAG && !(sts & CS5536_WAK_FLAG)) { + input_report_key(power_button_idev, KEY_POWER, 1); + input_sync(power_button_idev); + input_report_key(power_button_idev, KEY_POWER, 0); + input_sync(power_button_idev); + } + + return IRQ_HANDLED; +} + +static int xo1_sci_suspend(struct platform_device *pdev, pm_message_t state) +{ + if (device_may_wakeup(&power_button_idev->dev)) + olpc_xo1_pm_wakeup_set(CS5536_PM_PWRBTN); + else + olpc_xo1_pm_wakeup_clear(CS5536_PM_PWRBTN); + return 0; +} + +static int __devinit setup_sci_interrupt(struct platform_device *pdev) +{ + u32 lo, hi; + u32 sts; + int r; + + rdmsr(0x51400020, lo, hi); + sci_irq = (lo >> 20) & 15; + + if (sci_irq) { + dev_info(&pdev->dev, "SCI is mapped to IRQ %d\n", sci_irq); + } else { + /* Zero means masked */ + dev_info(&pdev->dev, "SCI unmapped. Mapping to IRQ 3\n"); + sci_irq = 3; + lo |= 0x00300000; + wrmsrl(0x51400020, lo); + } + + /* Select level triggered in PIC */ + if (sci_irq < 8) { + lo = inb(CS5536_PIC_INT_SEL1); + lo |= 1 << sci_irq; + outb(lo, CS5536_PIC_INT_SEL1); + } else { + lo = inb(CS5536_PIC_INT_SEL2); + lo |= 1 << (sci_irq - 8); + outb(lo, CS5536_PIC_INT_SEL2); + } + + /* Enable SCI from power button, and clear pending interrupts */ + sts = inl(acpi_base + CS5536_PM1_STS); + outl((CS5536_PM_PWRBTN << 16) | 0xffff, acpi_base + CS5536_PM1_STS); + + r = request_irq(sci_irq, xo1_sci_intr, 0, DRV_NAME, pdev); + if (r) + dev_err(&pdev->dev, "can't request interrupt\n"); + + return r; +} + +static int __devinit setup_power_button(struct platform_device *pdev) +{ + int r; + + power_button_idev = input_allocate_device(); + if (!power_button_idev) + return -ENOMEM; + + power_button_idev->name = "Power Button"; + power_button_idev->phys = DRV_NAME "/input0"; + set_bit(EV_KEY, power_button_idev->evbit); + set_bit(KEY_POWER, power_button_idev->keybit); + + power_button_idev->dev.parent = &pdev->dev; + device_init_wakeup(&power_button_idev->dev, 1); + + r = input_register_device(power_button_idev); + if (r) { + dev_err(&pdev->dev, "failed to register power button: %d\n", r); + input_free_device(power_button_idev); + } + + return r; +} + +static void free_power_button(void) +{ + input_unregister_device(power_button_idev); + input_free_device(power_button_idev); +} + +static int __devinit xo1_sci_probe(struct platform_device *pdev) +{ + struct resource *res; + int r; + + /* don't run on non-XOs */ + if (!machine_is_olpc()) + return -ENODEV; + + r = mfd_cell_enable(pdev); + if (r) + return r; + + res = platform_get_resource(pdev, IORESOURCE_IO, 0); + if (!res) { + dev_err(&pdev->dev, "can't fetch device resource info\n"); + return -EIO; + } + acpi_base = res->start; + + r = setup_power_button(pdev); + if (r) + return r; + + r = setup_sci_interrupt(pdev); + if (r) + free_power_button(); + + return r; +} + +static int __devexit xo1_sci_remove(struct platform_device *pdev) +{ + mfd_cell_disable(pdev); + free_irq(sci_irq, pdev); + free_power_button(); + acpi_base = 0; + return 0; +} + +static struct platform_driver xo1_sci_driver = { + .driver = { + .name = "olpc-xo1-sci-acpi", + }, + .probe = xo1_sci_probe, + .remove = __devexit_p(xo1_sci_remove), + .suspend = xo1_sci_suspend, +}; + +static int __init xo1_sci_init(void) +{ + return platform_driver_register(&xo1_sci_driver); +} +arch_initcall(xo1_sci_init); -- cgit v1.1 From bc4ecd5a5efc2435e6debfb7b279a15ae96697fd Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sat, 25 Jun 2011 17:34:13 +0100 Subject: x86, olpc: EC SCI wakeup mask functionality Update the EC SCI masks with recent additions. Add functions to query SCI events and set the wakeup mask, to be used by followup patches. Add functions to tweak an event mask used to select certain EC events as a system wakeup source. Also add a function to determine if EC wakeup functionality is available, as this depends on child drivers (different for each laptop model) to configure the SCI interrupt. Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/1309019658-1712-7-git-send-email-dsd@laptop.org Acked-by: Andres Salomon Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/olpc.h | 31 +++++++++++++--- arch/x86/platform/olpc/olpc.c | 86 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 10ea595..0e56d01 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -13,6 +13,7 @@ struct olpc_platform_t { #define OLPC_F_PRESENT 0x01 #define OLPC_F_DCON 0x02 +#define OLPC_F_EC_WIDE_SCI 0x04 #ifdef CONFIG_OLPC @@ -62,6 +63,13 @@ static inline int olpc_board_at_least(uint32_t rev) return olpc_platform_info.boardrev >= rev; } +extern void olpc_ec_wakeup_set(u16 value); +extern void olpc_ec_wakeup_clear(u16 value); +extern bool olpc_ec_wakeup_available(void); + +extern int olpc_ec_mask_write(u16 bits); +extern int olpc_ec_sci_query(u16 *sci_value); + #else static inline int machine_is_olpc(void) @@ -74,6 +82,14 @@ static inline int olpc_has_dcon(void) return 0; } +static inline void olpc_ec_wakeup_set(u16 value) { } +static inline void olpc_ec_wakeup_clear(u16 value) { } + +static inline bool olpc_ec_wakeup_available(void) +{ + return false; +} + #endif #ifdef CONFIG_OLPC_XO1_PM @@ -89,17 +105,18 @@ extern int pci_olpc_init(void); extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, unsigned char *outbuf, size_t outlen); -extern int olpc_ec_mask_set(uint8_t bits); -extern int olpc_ec_mask_unset(uint8_t bits); - /* EC commands */ #define EC_FIRMWARE_REV 0x08 +#define EC_WRITE_SCI_MASK 0x1b #define EC_WAKE_UP_WLAN 0x24 #define EC_WLAN_LEAVE_RESET 0x25 #define EC_SET_SCI_INHIBIT 0x32 #define EC_SET_SCI_INHIBIT_RELEASE 0x34 #define EC_WLAN_ENTER_RESET 0x35 +#define EC_WRITE_EXT_SCI_MASK 0x38 +#define EC_SCI_QUERY 0x84 +#define EC_EXT_SCI_QUERY 0x85 /* SCI source values */ @@ -108,10 +125,12 @@ extern int olpc_ec_mask_unset(uint8_t bits); #define EC_SCI_SRC_BATTERY 0x02 #define EC_SCI_SRC_BATSOC 0x04 #define EC_SCI_SRC_BATERR 0x08 -#define EC_SCI_SRC_EBOOK 0x10 -#define EC_SCI_SRC_WLAN 0x20 +#define EC_SCI_SRC_EBOOK 0x10 /* XO-1 only */ +#define EC_SCI_SRC_WLAN 0x20 /* XO-1 only */ #define EC_SCI_SRC_ACPWR 0x40 -#define EC_SCI_SRC_ALL 0x7F +#define EC_SCI_SRC_BATCRIT 0x80 +#define EC_SCI_SRC_GPWAKE 0x100 /* XO-1.5 only */ +#define EC_SCI_SRC_ALL 0x1FF /* GPIO assignments */ diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index 0060fd5..72fd041 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,9 @@ EXPORT_SYMBOL_GPL(olpc_platform_info); static DEFINE_SPINLOCK(ec_lock); +/* EC event mask to be applied during suspend (defining wakeup sources). */ +static u16 ec_wakeup_mask; + /* what the timeout *should* be (in ms) */ #define EC_BASE_TIMEOUT 20 @@ -188,6 +192,79 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); +void olpc_ec_wakeup_set(u16 value) +{ + ec_wakeup_mask |= value; +} +EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set); + +void olpc_ec_wakeup_clear(u16 value) +{ + ec_wakeup_mask &= ~value; +} +EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear); + +/* + * Returns true if the compile and runtime configurations allow for EC events + * to wake the system. + */ +bool olpc_ec_wakeup_available(void) +{ + if (!machine_is_olpc()) + return false; + + /* + * XO-1 EC wakeups are available when olpc-xo1-sci driver is + * compiled in + */ +#ifdef CONFIG_OLPC_XO1_SCI + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */ + return true; +#endif + + return false; +} +EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available); + +int olpc_ec_mask_write(u16 bits) +{ + if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) { + __be16 ec_word = cpu_to_be16(bits); + return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *) &ec_word, 2, + NULL, 0); + } else { + unsigned char ec_byte = bits & 0xff; + return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0); + } +} +EXPORT_SYMBOL_GPL(olpc_ec_mask_write); + +int olpc_ec_sci_query(u16 *sci_value) +{ + int ret; + + if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) { + __be16 ec_word; + ret = olpc_ec_cmd(EC_EXT_SCI_QUERY, + NULL, 0, (void *) &ec_word, 2); + if (ret == 0) + *sci_value = be16_to_cpu(ec_word); + } else { + unsigned char ec_byte; + ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1); + if (ret == 0) + *sci_value = ec_byte; + } + + return ret; +} +EXPORT_SYMBOL_GPL(olpc_ec_sci_query); + +static int olpc_ec_suspend(void) +{ + return olpc_ec_mask_write(ec_wakeup_mask); +} + static bool __init check_ofw_architecture(struct device_node *root) { const char *olpc_arch; @@ -242,6 +319,10 @@ static int __init add_xo1_platform_devices(void) return 0; } +static struct syscore_ops olpc_syscore_ops = { + .suspend = olpc_ec_suspend, +}; + static int __init olpc_init(void) { int r = 0; @@ -266,6 +347,9 @@ static int __init olpc_init(void) !cs5535_has_vsa2()) x86_init.pci.arch_init = pci_olpc_init; #endif + /* EC version 0x5f adds support for wide SCI mask */ + if (olpc_platform_info.ecver >= 0x5f) + olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI; printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", @@ -278,6 +362,8 @@ static int __init olpc_init(void) return r; } + register_syscore_ops(&olpc_syscore_ops); + return 0; } -- cgit v1.1 From 7bc74b3df73776fe06f3df9fafd2d2698e6ca28a Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sat, 25 Jun 2011 17:34:14 +0100 Subject: x86, olpc-xo1-sci: Add GPE handler and ebook switch functionality The EC in the OLPC XO-1 delivers GPE events to provide various notifications. Add the basic code for GPE/EC event processing and enable the ebook switch, which can be used as a wakeup source. Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/1309019658-1712-8-git-send-email-dsd@laptop.org Acked-by: Andres Salomon Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 + arch/x86/include/asm/olpc.h | 5 +- arch/x86/platform/olpc/olpc-xo1-sci.c | 183 +++++++++++++++++++++++++++++++++- 3 files changed, 187 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 66b7b9d..8888910 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2087,7 +2087,9 @@ config OLPC_XO1_SCI select MFD_CORE ---help--- Add support for SCI-based features of the OLPC XO-1 laptop: + - EC-driven system wakeups - Power button + - Ebook switch endif # X86_32 diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 0e56d01..87bdbca 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -111,6 +111,7 @@ extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, #define EC_WRITE_SCI_MASK 0x1b #define EC_WAKE_UP_WLAN 0x24 #define EC_WLAN_LEAVE_RESET 0x25 +#define EC_READ_EB_MODE 0x2a #define EC_SET_SCI_INHIBIT 0x32 #define EC_SET_SCI_INHIBIT_RELEASE 0x34 #define EC_WLAN_ENTER_RESET 0x35 @@ -144,7 +145,7 @@ extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, #define OLPC_GPIO_SMB_CLK 14 #define OLPC_GPIO_SMB_DATA 15 #define OLPC_GPIO_WORKAUX geode_gpio(24) -#define OLPC_GPIO_LID geode_gpio(26) -#define OLPC_GPIO_ECSCI geode_gpio(27) +#define OLPC_GPIO_LID 26 +#define OLPC_GPIO_ECSCI 27 #endif /* _ASM_X86_OLPC_H */ diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 8fbf961..63f5050 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -12,12 +12,15 @@ */ #include +#include +#include #include #include #include #include #include #include +#include #include #include @@ -28,8 +31,60 @@ static unsigned long acpi_base; static struct input_dev *power_button_idev; +static struct input_dev *ebook_switch_idev; + static int sci_irq; +/* Report current ebook switch state through input layer */ +static void send_ebook_state(void) +{ + unsigned char state; + + if (olpc_ec_cmd(EC_READ_EB_MODE, NULL, 0, &state, 1)) { + pr_err(PFX "failed to get ebook state\n"); + return; + } + + input_report_switch(ebook_switch_idev, SW_TABLET_MODE, state); + input_sync(ebook_switch_idev); +} + +/* + * Process all items in the EC's SCI queue. + * + * This is handled in a workqueue because olpc_ec_cmd can be slow (and + * can even timeout). + * + * If propagate_events is false, the queue is drained without events being + * generated for the interrupts. + */ +static void process_sci_queue(bool propagate_events) +{ + int r; + u16 data; + + do { + r = olpc_ec_sci_query(&data); + if (r || !data) + break; + + pr_debug(PFX "SCI 0x%x received\n", data); + + if (data == EC_SCI_SRC_EBOOK && propagate_events) + send_ebook_state(); + } while (data); + + if (r) + pr_err(PFX "Failed to clear SCI queue"); +} + +static void process_sci_queue_work(struct work_struct *work) +{ + process_sci_queue(true); +} + +static DECLARE_WORK(sci_work, process_sci_queue_work); + static irqreturn_t xo1_sci_intr(int irq, void *dev_id) { struct platform_device *pdev = dev_id; @@ -51,6 +106,11 @@ static irqreturn_t xo1_sci_intr(int irq, void *dev_id) input_sync(power_button_idev); } + if (gpe & CS5536_GPIOM7_PME_FLAG) { /* EC GPIO */ + cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS); + schedule_work(&sci_work); + } + return IRQ_HANDLED; } @@ -60,6 +120,19 @@ static int xo1_sci_suspend(struct platform_device *pdev, pm_message_t state) olpc_xo1_pm_wakeup_set(CS5536_PM_PWRBTN); else olpc_xo1_pm_wakeup_clear(CS5536_PM_PWRBTN); + + if (device_may_wakeup(&ebook_switch_idev->dev)) + olpc_ec_wakeup_set(EC_SCI_SRC_EBOOK); + else + olpc_ec_wakeup_clear(EC_SCI_SRC_EBOOK); + + return 0; +} + +static int xo1_sci_resume(struct platform_device *pdev) +{ + /* Enable all EC events */ + olpc_ec_mask_write(EC_SCI_SRC_ALL); return 0; } @@ -104,6 +177,50 @@ static int __devinit setup_sci_interrupt(struct platform_device *pdev) return r; } +static int __devinit setup_ec_sci(void) +{ + int r; + + r = gpio_request(OLPC_GPIO_ECSCI, "OLPC-ECSCI"); + if (r) + return r; + + gpio_direction_input(OLPC_GPIO_ECSCI); + + /* Clear pending EC SCI events */ + cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS); + cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_POSITIVE_EDGE_STS); + + /* + * Enable EC SCI events, and map them to both a PME and the SCI + * interrupt. + * + * Ordinarily, in addition to functioning as GPIOs, Geode GPIOs can + * be mapped to regular interrupts *or* Geode-specific Power + * Management Events (PMEs) - events that bring the system out of + * suspend. In this case, we want both of those things - the system + * wakeup, *and* the ability to get an interrupt when an event occurs. + * + * To achieve this, we map the GPIO to a PME, and then we use one + * of the many generic knobs on the CS5535 PIC to additionally map the + * PME to the regular SCI interrupt line. + */ + cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_EVENTS_ENABLE); + + /* Set the SCI to cause a PME event on group 7 */ + cs5535_gpio_setup_event(OLPC_GPIO_ECSCI, 7, 1); + + /* And have group 7 also fire the SCI interrupt */ + cs5535_pic_unreqz_select_high(7, sci_irq); + + return 0; +} + +static void free_ec_sci(void) +{ + gpio_free(OLPC_GPIO_ECSCI); +} + static int __devinit setup_power_button(struct platform_device *pdev) { int r; @@ -135,6 +252,37 @@ static void free_power_button(void) input_free_device(power_button_idev); } +static int __devinit setup_ebook_switch(struct platform_device *pdev) +{ + int r; + + ebook_switch_idev = input_allocate_device(); + if (!ebook_switch_idev) + return -ENOMEM; + + ebook_switch_idev->name = "EBook Switch"; + ebook_switch_idev->phys = DRV_NAME "/input1"; + set_bit(EV_SW, ebook_switch_idev->evbit); + set_bit(SW_TABLET_MODE, ebook_switch_idev->swbit); + + ebook_switch_idev->dev.parent = &pdev->dev; + device_set_wakeup_capable(&ebook_switch_idev->dev, true); + + r = input_register_device(ebook_switch_idev); + if (r) { + dev_err(&pdev->dev, "failed to register ebook switch: %d\n", r); + input_free_device(ebook_switch_idev); + } + + return r; +} + +static void free_ebook_switch(void) +{ + input_unregister_device(ebook_switch_idev); + input_free_device(ebook_switch_idev); +} + static int __devinit xo1_sci_probe(struct platform_device *pdev) { struct resource *res; @@ -159,10 +307,39 @@ static int __devinit xo1_sci_probe(struct platform_device *pdev) if (r) return r; + r = setup_ebook_switch(pdev); + if (r) + goto err_ebook; + + r = setup_ec_sci(); + if (r) + goto err_ecsci; + + /* Enable PME generation for EC-generated events */ + outl(CS5536_GPIOM7_PME_EN, acpi_base + CS5536_PM_GPE0_EN); + + /* Clear pending events */ + outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS); + process_sci_queue(false); + + /* Initial sync */ + send_ebook_state(); + r = setup_sci_interrupt(pdev); if (r) - free_power_button(); + goto err_sci; + /* Enable all EC events */ + olpc_ec_mask_write(EC_SCI_SRC_ALL); + + return r; + +err_sci: + free_ec_sci(); +err_ecsci: + free_ebook_switch(); +err_ebook: + free_power_button(); return r; } @@ -170,6 +347,9 @@ static int __devexit xo1_sci_remove(struct platform_device *pdev) { mfd_cell_disable(pdev); free_irq(sci_irq, pdev); + cancel_work_sync(&sci_work); + free_ec_sci(); + free_ebook_switch(); free_power_button(); acpi_base = 0; return 0; @@ -182,6 +362,7 @@ static struct platform_driver xo1_sci_driver = { .probe = xo1_sci_probe, .remove = __devexit_p(xo1_sci_remove), .suspend = xo1_sci_suspend, + .resume = xo1_sci_resume, }; static int __init xo1_sci_init(void) -- cgit v1.1 From 2cf2baea103f0a3d68b0f989d28df66f16dbc834 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sat, 25 Jun 2011 17:34:15 +0100 Subject: x86, olpc-xo1-sci: Add lid switch functionality Configure the XO-1's lid switch GPIO to trigger an SCI interrupt, and correctly expose this input device which can be used as a wakeup source. Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/1309019658-1712-9-git-send-email-dsd@laptop.org Acked-by: Andres Salomon Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 1 + arch/x86/platform/olpc/olpc-xo1-sci.c | 207 +++++++++++++++++++++++++++++++++- 2 files changed, 207 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8888910..350ccbb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2090,6 +2090,7 @@ config OLPC_XO1_SCI - EC-driven system wakeups - Power button - Ebook switch + - Lid switch endif # X86_32 diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 63f5050..ad0670b 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -32,9 +32,26 @@ static unsigned long acpi_base; static struct input_dev *power_button_idev; static struct input_dev *ebook_switch_idev; +static struct input_dev *lid_switch_idev; static int sci_irq; +static bool lid_open; +static bool lid_inverted; +static int lid_wake_mode; + +enum lid_wake_modes { + LID_WAKE_ALWAYS, + LID_WAKE_OPEN, + LID_WAKE_CLOSE, +}; + +static const char * const lid_wake_mode_names[] = { + [LID_WAKE_ALWAYS] = "always", + [LID_WAKE_OPEN] = "open", + [LID_WAKE_CLOSE] = "close", +}; + /* Report current ebook switch state through input layer */ static void send_ebook_state(void) { @@ -49,6 +66,70 @@ static void send_ebook_state(void) input_sync(ebook_switch_idev); } +static void flip_lid_inverter(void) +{ + /* gpio is high; invert so we'll get l->h event interrupt */ + if (lid_inverted) + cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_INPUT_INVERT); + else + cs5535_gpio_set(OLPC_GPIO_LID, GPIO_INPUT_INVERT); + lid_inverted = !lid_inverted; +} + +static void detect_lid_state(void) +{ + /* + * the edge detector hookup on the gpio inputs on the geode is + * odd, to say the least. See http://dev.laptop.org/ticket/5703 + * for details, but in a nutshell: we don't use the edge + * detectors. instead, we make use of an anomoly: with the both + * edge detectors turned off, we still get an edge event on a + * positive edge transition. to take advantage of this, we use the + * front-end inverter to ensure that that's the edge we're always + * going to see next. + */ + + int state; + + state = cs5535_gpio_isset(OLPC_GPIO_LID, GPIO_READ_BACK); + lid_open = !state ^ !lid_inverted; /* x ^^ y */ + if (!state) + return; + + flip_lid_inverter(); +} + +/* Report current lid switch state through input layer */ +static void send_lid_state(void) +{ + input_report_switch(lid_switch_idev, SW_LID, !lid_open); + input_sync(lid_switch_idev); +} + +static ssize_t lid_wake_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const char *mode = lid_wake_mode_names[lid_wake_mode]; + return sprintf(buf, "%s\n", mode); +} +static ssize_t lid_wake_mode_set(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int i; + for (i = 0; i < ARRAY_SIZE(lid_wake_mode_names); i++) { + const char *mode = lid_wake_mode_names[i]; + if (strlen(mode) != count || strncasecmp(mode, buf, count)) + continue; + + lid_wake_mode = i; + return count; + } + return -EINVAL; +} +static DEVICE_ATTR(lid_wake_mode, S_IWUSR | S_IRUGO, lid_wake_mode_show, + lid_wake_mode_set); + /* * Process all items in the EC's SCI queue. * @@ -111,6 +192,11 @@ static irqreturn_t xo1_sci_intr(int irq, void *dev_id) schedule_work(&sci_work); } + cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS); + cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS); + detect_lid_state(); + send_lid_state(); + return IRQ_HANDLED; } @@ -126,11 +212,32 @@ static int xo1_sci_suspend(struct platform_device *pdev, pm_message_t state) else olpc_ec_wakeup_clear(EC_SCI_SRC_EBOOK); + if (!device_may_wakeup(&lid_switch_idev->dev)) { + cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE); + } else if ((lid_open && lid_wake_mode == LID_WAKE_OPEN) || + (!lid_open && lid_wake_mode == LID_WAKE_CLOSE)) { + flip_lid_inverter(); + + /* we may have just caused an event */ + cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS); + cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS); + + cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE); + } + return 0; } static int xo1_sci_resume(struct platform_device *pdev) { + /* + * We don't know what may have happened while we were asleep. + * Reestablish our lid setup so we're sure to catch all transitions. + */ + detect_lid_state(); + send_lid_state(); + cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE); + /* Enable all EC events */ olpc_ec_mask_write(EC_SCI_SRC_ALL); return 0; @@ -221,6 +328,43 @@ static void free_ec_sci(void) gpio_free(OLPC_GPIO_ECSCI); } +static int __devinit setup_lid_events(void) +{ + int r; + + r = gpio_request(OLPC_GPIO_LID, "OLPC-LID"); + if (r) + return r; + + gpio_direction_input(OLPC_GPIO_LID); + + cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_INPUT_INVERT); + lid_inverted = 0; + + /* Clear edge detection and event enable for now */ + cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE); + cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_EN); + cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_EN); + cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS); + cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS); + + /* Set the LID to cause an PME event on group 6 */ + cs5535_gpio_setup_event(OLPC_GPIO_LID, 6, 1); + + /* Set PME group 6 to fire the SCI interrupt */ + cs5535_gpio_set_irq(6, sci_irq); + + /* Enable the event */ + cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE); + + return 0; +} + +static void free_lid_events(void) +{ + gpio_free(OLPC_GPIO_LID); +} + static int __devinit setup_power_button(struct platform_device *pdev) { int r; @@ -283,6 +427,50 @@ static void free_ebook_switch(void) input_free_device(ebook_switch_idev); } +static int __devinit setup_lid_switch(struct platform_device *pdev) +{ + int r; + + lid_switch_idev = input_allocate_device(); + if (!lid_switch_idev) + return -ENOMEM; + + lid_switch_idev->name = "Lid Switch"; + lid_switch_idev->phys = DRV_NAME "/input2"; + set_bit(EV_SW, lid_switch_idev->evbit); + set_bit(SW_LID, lid_switch_idev->swbit); + + lid_switch_idev->dev.parent = &pdev->dev; + device_set_wakeup_capable(&lid_switch_idev->dev, true); + + r = input_register_device(lid_switch_idev); + if (r) { + dev_err(&pdev->dev, "failed to register lid switch: %d\n", r); + goto err_register; + } + + r = device_create_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode); + if (r) { + dev_err(&pdev->dev, "failed to create wake mode attr: %d\n", r); + goto err_create_attr; + } + + return 0; + +err_create_attr: + input_unregister_device(lid_switch_idev); +err_register: + input_free_device(lid_switch_idev); + return r; +} + +static void free_lid_switch(void) +{ + device_remove_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode); + input_unregister_device(lid_switch_idev); + input_free_device(lid_switch_idev); +} + static int __devinit xo1_sci_probe(struct platform_device *pdev) { struct resource *res; @@ -311,12 +499,21 @@ static int __devinit xo1_sci_probe(struct platform_device *pdev) if (r) goto err_ebook; + r = setup_lid_switch(pdev); + if (r) + goto err_lid; + + r = setup_lid_events(); + if (r) + goto err_lidevt; + r = setup_ec_sci(); if (r) goto err_ecsci; /* Enable PME generation for EC-generated events */ - outl(CS5536_GPIOM7_PME_EN, acpi_base + CS5536_PM_GPE0_EN); + outl(CS5536_GPIOM6_PME_EN | CS5536_GPIOM7_PME_EN, + acpi_base + CS5536_PM_GPE0_EN); /* Clear pending events */ outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS); @@ -324,6 +521,8 @@ static int __devinit xo1_sci_probe(struct platform_device *pdev) /* Initial sync */ send_ebook_state(); + detect_lid_state(); + send_lid_state(); r = setup_sci_interrupt(pdev); if (r) @@ -337,6 +536,10 @@ static int __devinit xo1_sci_probe(struct platform_device *pdev) err_sci: free_ec_sci(); err_ecsci: + free_lid_events(); +err_lidevt: + free_lid_switch(); +err_lid: free_ebook_switch(); err_ebook: free_power_button(); @@ -349,6 +552,8 @@ static int __devexit xo1_sci_remove(struct platform_device *pdev) free_irq(sci_irq, pdev); cancel_work_sync(&sci_work); free_ec_sci(); + free_lid_events(); + free_lid_switch(); free_ebook_switch(); free_power_button(); acpi_base = 0; -- cgit v1.1 From e1040ac693bac19eaeafbd6c5fd24d9429b5eeb8 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sat, 25 Jun 2011 17:34:16 +0100 Subject: x86, olpc-xo1-sci: Propagate power supply/battery events EC events indicate change in AC power connectivity, battery state of charge, battery error, battery presence, etc. Send notifications to the power supply subsystem when changes are detected. Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/1309019658-1712-10-git-send-email-dsd@laptop.org Acked-by: Andres Salomon Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 4 +++- arch/x86/platform/olpc/olpc-xo1-sci.c | 37 +++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 350ccbb..a6aefbb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2082,7 +2082,7 @@ config OLPC_XO1_PM config OLPC_XO1_SCI bool "OLPC XO-1 SCI extras" - depends on OLPC && OLPC_XO1_PM + depends on OLPC && OLPC_XO1_PM && POWER_SUPPLY select GPIO_CS5535 select MFD_CORE ---help--- @@ -2091,6 +2091,8 @@ config OLPC_XO1_SCI - Power button - Ebook switch - Lid switch + - AC adapter status updates + - Battery status updates endif # X86_32 diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index ad0670b..1d4c783 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -52,6 +53,26 @@ static const char * const lid_wake_mode_names[] = { [LID_WAKE_CLOSE] = "close", }; +static void battery_status_changed(void) +{ + struct power_supply *psy = power_supply_get_by_name("olpc-battery"); + + if (psy) { + power_supply_changed(psy); + put_device(psy->dev); + } +} + +static void ac_status_changed(void) +{ + struct power_supply *psy = power_supply_get_by_name("olpc-ac"); + + if (psy) { + power_supply_changed(psy); + put_device(psy->dev); + } +} + /* Report current ebook switch state through input layer */ static void send_ebook_state(void) { @@ -151,6 +172,18 @@ static void process_sci_queue(bool propagate_events) pr_debug(PFX "SCI 0x%x received\n", data); + switch (data) { + case EC_SCI_SRC_BATERR: + case EC_SCI_SRC_BATSOC: + case EC_SCI_SRC_BATTERY: + case EC_SCI_SRC_BATCRIT: + battery_status_changed(); + break; + case EC_SCI_SRC_ACPWR: + ac_status_changed(); + break; + } + if (data == EC_SCI_SRC_EBOOK && propagate_events) send_ebook_state(); } while (data); @@ -240,6 +273,10 @@ static int xo1_sci_resume(struct platform_device *pdev) /* Enable all EC events */ olpc_ec_mask_write(EC_SCI_SRC_ALL); + + /* Power/battery status might have changed too */ + battery_status_changed(); + ac_status_changed(); return 0; } -- cgit v1.1 From cfee95977bea090ae5ec4fd442ebd381792d46c4 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sat, 25 Jun 2011 17:34:17 +0100 Subject: x86, olpc: Add XO-1 RTC driver Add a driver to configure the XO-1 RTC via CS5536 MSRs, to be used as a system wakeup source via olpc-xo1-pm. Device detection is based on finding the relevant device tree node. Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/1309019658-1712-11-git-send-email-dsd@laptop.org Acked-by: Andres Salomon Acked-by: Grant Likely Reviewed-by: Sebastian Andrzej Siewior Cc: devicetree-discuss@lists.ozlabs.org Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 7 +++ arch/x86/platform/olpc/Makefile | 1 + arch/x86/platform/olpc/olpc-xo1-rtc.c | 81 +++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+) create mode 100644 arch/x86/platform/olpc/olpc-xo1-rtc.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a6aefbb..0a9d573 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2080,6 +2080,13 @@ config OLPC_XO1_PM ---help--- Add support for poweroff and suspend of the OLPC XO-1 laptop. +config OLPC_XO1_RTC + bool "OLPC XO-1 Real Time Clock" + depends on OLPC_XO1_PM && RTC_DRV_CMOS + ---help--- + Add support for the XO-1 real time clock, which can be used as a + programmable wakeup source. + config OLPC_XO1_SCI bool "OLPC XO-1 SCI extras" depends on OLPC && OLPC_XO1_PM && POWER_SUPPLY diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index 1ec5ade..8922b9b 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o obj-$(CONFIG_OLPC_XO1_PM) += olpc-xo1-pm.o xo1-wakeup.o +obj-$(CONFIG_OLPC_XO1_RTC) += olpc-xo1-rtc.o obj-$(CONFIG_OLPC_XO1_SCI) += olpc-xo1-sci.o diff --git a/arch/x86/platform/olpc/olpc-xo1-rtc.c b/arch/x86/platform/olpc/olpc-xo1-rtc.c new file mode 100644 index 0000000..a2b4efd --- /dev/null +++ b/arch/x86/platform/olpc/olpc-xo1-rtc.c @@ -0,0 +1,81 @@ +/* + * Support for OLPC XO-1 Real Time Clock (RTC) + * + * Copyright (C) 2011 One Laptop per Child + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include + +static void rtc_wake_on(struct device *dev) +{ + olpc_xo1_pm_wakeup_set(CS5536_PM_RTC); +} + +static void rtc_wake_off(struct device *dev) +{ + olpc_xo1_pm_wakeup_clear(CS5536_PM_RTC); +} + +static struct resource rtc_platform_resource[] = { + [0] = { + .start = RTC_PORT(0), + .end = RTC_PORT(1), + .flags = IORESOURCE_IO, + }, + [1] = { + .start = RTC_IRQ, + .end = RTC_IRQ, + .flags = IORESOURCE_IRQ, + } +}; + +static struct cmos_rtc_board_info rtc_info = { + .rtc_day_alarm = 0, + .rtc_mon_alarm = 0, + .rtc_century = 0, + .wake_on = rtc_wake_on, + .wake_off = rtc_wake_off, +}; + +static struct platform_device xo1_rtc_device = { + .name = "rtc_cmos", + .id = -1, + .num_resources = ARRAY_SIZE(rtc_platform_resource), + .dev.platform_data = &rtc_info, + .resource = rtc_platform_resource, +}; + +static int __init xo1_rtc_init(void) +{ + int r; + struct device_node *node; + + node = of_find_compatible_node(NULL, NULL, "olpc,xo1-rtc"); + if (!node) + return 0; + of_node_put(node); + + pr_info("olpc-xo1-rtc: Initializing OLPC XO-1 RTC\n"); + rdmsrl(MSR_RTC_DOMA_OFFSET, rtc_info.rtc_day_alarm); + rdmsrl(MSR_RTC_MONA_OFFSET, rtc_info.rtc_mon_alarm); + rdmsrl(MSR_RTC_CEN_OFFSET, rtc_info.rtc_century); + + r = platform_device_register(&xo1_rtc_device); + if (r) + return r; + + device_init_wakeup(&xo1_rtc_device.dev, 1); + return 0; +} +arch_initcall(xo1_rtc_init); -- cgit v1.1 From a0f30f592d2d81e28f3ed7fea7f03246b0d55b75 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sat, 25 Jun 2011 17:34:18 +0100 Subject: x86, olpc: Add XO-1.5 SCI driver Add a driver for the ACPI-based EC event interface found on the OLPC XO-1.5 laptop. This enables notification of battery/AC power events, and enables various devices to be used as wakeup sources through regular ACPI mechanisms. This driver can't be built as a module, because some drivers need to know at boot-time if SCI-based functionality is available via olpc_ec_wakeup_available(). Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/1309019658-1712-12-git-send-email-dsd@laptop.org Acked-by: Andres Salomon Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 9 ++ arch/x86/platform/olpc/Makefile | 1 + arch/x86/platform/olpc/olpc-xo15-sci.c | 168 +++++++++++++++++++++++++++++++++ arch/x86/platform/olpc/olpc.c | 9 ++ 4 files changed, 187 insertions(+) create mode 100644 arch/x86/platform/olpc/olpc-xo15-sci.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0a9d573..8af5ba8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2101,6 +2101,15 @@ config OLPC_XO1_SCI - AC adapter status updates - Battery status updates +config OLPC_XO15_SCI + bool "OLPC XO-1.5 SCI extras" + depends on OLPC && ACPI && POWER_SUPPLY + ---help--- + Add support for SCI-based features of the OLPC XO-1.5 laptop: + - EC-driven system wakeups + - AC adapter status updates + - Battery status updates + endif # X86_32 config AMD_NB diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index 8922b9b..fd332c5 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o obj-$(CONFIG_OLPC_XO1_PM) += olpc-xo1-pm.o xo1-wakeup.o obj-$(CONFIG_OLPC_XO1_RTC) += olpc-xo1-rtc.o obj-$(CONFIG_OLPC_XO1_SCI) += olpc-xo1-sci.o +obj-$(CONFIG_OLPC_XO15_SCI) += olpc-xo15-sci.o diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c new file mode 100644 index 0000000..a5990eb --- /dev/null +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -0,0 +1,168 @@ +/* + * Support for OLPC XO-1.5 System Control Interrupts (SCI) + * + * Copyright (C) 2009-2010 One Laptop per Child + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#define DRV_NAME "olpc-xo15-sci" +#define PFX DRV_NAME ": " +#define XO15_SCI_CLASS DRV_NAME +#define XO15_SCI_DEVICE_NAME "OLPC XO-1.5 SCI" + +static unsigned long xo15_sci_gpe; + +static void battery_status_changed(void) +{ + struct power_supply *psy = power_supply_get_by_name("olpc-battery"); + + if (psy) { + power_supply_changed(psy); + put_device(psy->dev); + } +} + +static void ac_status_changed(void) +{ + struct power_supply *psy = power_supply_get_by_name("olpc-ac"); + + if (psy) { + power_supply_changed(psy); + put_device(psy->dev); + } +} + +static void process_sci_queue(void) +{ + u16 data; + int r; + + do { + r = olpc_ec_sci_query(&data); + if (r || !data) + break; + + pr_debug(PFX "SCI 0x%x received\n", data); + + switch (data) { + case EC_SCI_SRC_BATERR: + case EC_SCI_SRC_BATSOC: + case EC_SCI_SRC_BATTERY: + case EC_SCI_SRC_BATCRIT: + battery_status_changed(); + break; + case EC_SCI_SRC_ACPWR: + ac_status_changed(); + break; + } + } while (data); + + if (r) + pr_err(PFX "Failed to clear SCI queue"); +} + +static void process_sci_queue_work(struct work_struct *work) +{ + process_sci_queue(); +} + +static DECLARE_WORK(sci_work, process_sci_queue_work); + +static u32 xo15_sci_gpe_handler(acpi_handle gpe_device, u32 gpe, void *context) +{ + schedule_work(&sci_work); + return ACPI_INTERRUPT_HANDLED | ACPI_REENABLE_GPE; +} + +static int xo15_sci_add(struct acpi_device *device) +{ + unsigned long long tmp; + acpi_status status; + + if (!device) + return -EINVAL; + + strcpy(acpi_device_name(device), XO15_SCI_DEVICE_NAME); + strcpy(acpi_device_class(device), XO15_SCI_CLASS); + + /* Get GPE bit assignment (EC events). */ + status = acpi_evaluate_integer(device->handle, "_GPE", NULL, &tmp); + if (ACPI_FAILURE(status)) + return -EINVAL; + + xo15_sci_gpe = tmp; + status = acpi_install_gpe_handler(NULL, xo15_sci_gpe, + ACPI_GPE_EDGE_TRIGGERED, + xo15_sci_gpe_handler, device); + if (ACPI_FAILURE(status)) + return -ENODEV; + + dev_info(&device->dev, "Initialized, GPE = 0x%lx\n", xo15_sci_gpe); + + /* Flush queue, and enable all SCI events */ + process_sci_queue(); + olpc_ec_mask_write(EC_SCI_SRC_ALL); + + acpi_enable_gpe(NULL, xo15_sci_gpe); + + /* Enable wake-on-EC */ + if (device->wakeup.flags.valid) + device_set_wakeup_enable(&device->dev, true); + + return 0; +} + +static int xo15_sci_remove(struct acpi_device *device, int type) +{ + acpi_disable_gpe(NULL, xo15_sci_gpe); + acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler); + cancel_work_sync(&sci_work); + return 0; +} + +static int xo15_sci_resume(struct acpi_device *device) +{ + /* Enable all EC events */ + olpc_ec_mask_write(EC_SCI_SRC_ALL); + + /* Power/battery status might have changed */ + battery_status_changed(); + ac_status_changed(); + + return 0; +} + +static const struct acpi_device_id xo15_sci_device_ids[] = { + {"XO15EC", 0}, + {"", 0}, +}; + +static struct acpi_driver xo15_sci_drv = { + .name = DRV_NAME, + .class = XO15_SCI_CLASS, + .ids = xo15_sci_device_ids, + .ops = { + .add = xo15_sci_add, + .remove = xo15_sci_remove, + .resume = xo15_sci_resume, + }, +}; + +static int __init xo15_sci_init(void) +{ + return acpi_bus_register_driver(&xo15_sci_drv); +} +device_initcall(xo15_sci_init); diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index 72fd041..8b9940e 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -222,6 +222,15 @@ bool olpc_ec_wakeup_available(void) return true; #endif + /* + * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is + * compiled in + */ +#ifdef CONFIG_OLPC_XO15_SCI + if (olpc_platform_info.boardrev >= olpc_board_pre(0xd0)) /* XO-1.5 */ + return true; +#endif + return false; } EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available); -- cgit v1.1 From e08fbb78f03fe2c4f88824faf6f51ce6af185e11 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Jul 2011 23:04:36 -0400 Subject: tracing, x86/irq: Do not trace arch_local_{*,irq_*}() functions I triggered a triple fault with gcc 4.5.1 because it did not honor the inline annotation to arch_local_save_flags() function and that function was added to the pool of functions traced by the function tracer. When preempt_schedule() called arch_local_save_flags() (called by irqs_disabled()), it was traced, but the first thing the function tracer does is disable preemption. When it enables preemption, the NEED_RESCHED flag will not have been cleared and the preemption check will trigger the call to preempt_schedule() again. Although the dynamic function tracer crashed immediately, the static version of the function tracer (CONFIG_DYNAMIC_FTRACE is not set) actually was able to show where the problem was. swapper-1 3.N.. 103885us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103886us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103886us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103887us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103887us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103888us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103888us : arch_local_save_flags <-preempt_schedule It went on for a while before it triple faulted with a corrupted stack. The arch_local_save_flags and arch_local_irq_* functions should not be traced. Even though they are marked as inline, gcc may still make them a function and enable tracing of them. The simple solution is to just mark them as notrace. I had to add the for this file to include the notrace tag. Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/r/20110702033852.733414762@goodmis.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irqflags.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 5745ce8..bba3cf8 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -60,23 +60,24 @@ static inline void native_halt(void) #include #else #ifndef __ASSEMBLY__ +#include -static inline unsigned long arch_local_save_flags(void) +static inline notrace unsigned long arch_local_save_flags(void) { return native_save_fl(); } -static inline void arch_local_irq_restore(unsigned long flags) +static inline notrace void arch_local_irq_restore(unsigned long flags) { native_restore_fl(flags); } -static inline void arch_local_irq_disable(void) +static inline notrace void arch_local_irq_disable(void) { native_irq_disable(); } -static inline void arch_local_irq_enable(void) +static inline notrace void arch_local_irq_enable(void) { native_irq_enable(); } @@ -102,7 +103,7 @@ static inline void halt(void) /* * For spinlocks, etc: */ -static inline unsigned long arch_local_irq_save(void) +static inline notrace unsigned long arch_local_irq_save(void) { unsigned long flags = arch_local_save_flags(); arch_local_irq_disable(); -- cgit v1.1 From ded1f6ab43a03be74a53649cf388b5424d447584 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Fri, 8 Jul 2011 08:36:34 +0000 Subject: x86: print APIC data a little later during boot To view IOAPIC data you could boot with "apic=debug". When booting in such a way then the kernel will dump the IO-APIC's registers, for example: NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect: 00 000 1 0 0 0 0 0 0 00 01 000 0 0 0 0 0 0 0 31 02 000 0 0 0 0 0 0 0 30 03 000 0 0 0 0 0 0 0 33 04 000 0 0 0 0 0 0 0 34 05 000 0 0 0 0 0 0 0 35 06 000 0 0 0 0 0 0 0 36 07 000 0 0 0 0 0 0 0 37 08 000 0 0 0 0 0 0 0 38 09 000 0 1 0 0 0 0 0 39 0a 000 0 0 0 0 0 0 0 3A 0b 000 0 0 0 0 0 0 0 3B 0c 000 0 0 0 0 0 0 0 3C 0d 000 0 0 0 0 0 0 0 3D 0e 000 0 0 0 0 0 0 0 3E 0f 000 0 0 0 0 0 0 0 3F 10 000 1 0 0 0 0 0 0 00 11 000 1 0 0 0 0 0 0 00 12 000 1 0 0 0 0 0 0 00 13 000 1 0 0 0 0 0 0 00 14 000 1 0 0 0 0 0 0 00 15 000 1 0 0 0 0 0 0 00 16 000 1 0 0 0 0 0 0 00 17 000 1 0 0 0 0 0 0 00 Delaying the call to print_ICs() gives better results: NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect: 00 000 1 0 0 0 0 0 0 00 01 000 0 0 0 0 0 0 0 31 02 000 0 0 0 0 0 0 0 30 03 000 1 0 0 0 0 0 0 33 04 000 1 0 0 0 0 0 0 34 05 000 1 0 0 0 0 0 0 35 06 000 1 0 0 0 0 0 0 36 07 000 1 0 0 0 0 0 0 37 08 000 0 0 0 0 0 0 0 38 09 000 0 1 0 0 0 0 0 39 0a 000 1 0 0 0 0 0 0 3A 0b 000 1 0 0 0 0 0 0 3B 0c 000 0 0 0 0 0 0 0 3C 0d 000 1 0 0 0 0 0 0 3D 0e 000 1 0 0 0 0 0 0 3E 0f 000 1 0 0 0 0 0 0 3F 10 000 1 1 0 1 0 0 0 29 11 000 1 0 0 0 0 0 0 00 12 000 1 0 0 0 0 0 0 00 13 000 1 0 0 0 0 0 0 00 14 000 0 1 0 1 0 0 0 51 15 000 1 0 0 0 0 0 0 00 16 000 0 1 0 1 0 0 0 61 17 000 0 1 0 1 0 0 0 59 Notice that the entries beyond interrupt input signal 0x0f also get populated and arent just the hw-initialization default of all zeroes. Signed-off-by: Naga Chumbalkar Link: http://lkml.kernel.org/r/20110708083555.2598.42216.sendpatchset@nchumbalkar.americas.hpqcorp.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e529339..98c8d7e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1792,7 +1792,7 @@ __apicdebuginit(int) print_ICs(void) return 0; } -fs_initcall(print_ICs); +late_initcall(print_ICs); /* Where if anywhere is the i8259 connect in external int mode */ -- cgit v1.1 From 14cb6dcf0a023f5977461c94d8d5a163c937979b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Jul 2011 13:19:26 -0400 Subject: x86, boot: Wait for boot cpu to show up if nr_cpus limit is about to hit nr_cpus allows one to specify number of possible cpus in the system. Current assumption seems to be that first cpu to show up is boot cpu and this assumption will be broken in kdump scenario where we can be booting on a non boot cpu with nr_cpus=1. It might happen that first cpu we parse is not the cpu we boot on and later we ignore boot cpu. Though code later seems to recognize this anomaly and forcibly sets boot cpu in physical cpu map with following warning. if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk(KERN_WARNING "weird, boot CPU (#%d) not listed by the BIOS.\n", hard_smp_processor_id()); physid_set(hard_smp_processor_id(), phys_cpu_present_map); } This patch waits for boot cpu to show up and starts ignoring the cpus once we have hit (nr_cpus - 1) number of cpus. So effectively we are reserving one slot out of nr_cpus for boot cpu explicitly. Signed-off-by: Vivek Goyal Acked-by: Yinghai Lu Link: http://lkml.kernel.org/r/20110708171926.GF2930@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b9338b8..68219a9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1943,10 +1943,28 @@ void disconnect_bsp_APIC(int virt_wire_setup) void __cpuinit generic_processor_info(int apicid, int version) { - int cpu; + int cpu, max = nr_cpu_ids; + bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, + phys_cpu_present_map); + + /* + * If boot cpu has not been detected yet, then only allow upto + * nr_cpu_ids - 1 processors and keep one slot free for boot cpu + */ + if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 && + apicid != boot_cpu_physical_apicid) { + int thiscpu = max + disabled_cpus - 1; + + pr_warning( + "ACPI: NR_CPUS/possible_cpus limit of %i almost" + " reached. Keeping one slot for boot cpu." + " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; + return; + } if (num_processors >= nr_cpu_ids) { - int max = nr_cpu_ids; int thiscpu = max + disabled_cpus; pr_warning( -- cgit v1.1 From 24a42bae6852d27ae569757f5415c91538e6a255 Mon Sep 17 00:00:00 2001 From: Anupam Chanda Date: Fri, 8 Jul 2011 11:42:50 -0700 Subject: x86, hyper: Change hypervisor detection order Detect Xen before HyperV because in Viridian compatibility mode Xen presents itself as HyperV. Move Xen to the top since it seems more likely that Xen would emulate VMware than vice versa. Signed-off-by: Anupam Chanda Link: http://lkml.kernel.org/r/1310150570-26810-1-git-send-email-achanda@nicira.com Acked-by: Stefano Stabellini Acked-by: Yaozu (Eddie) Dong Reviewed-by: H. Peter Anvin Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/hypervisor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 8095f86..755f64fb 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -32,11 +32,11 @@ */ static const __initconst struct hypervisor_x86 * const hypervisors[] = { - &x86_hyper_vmware, - &x86_hyper_ms_hyperv, #ifdef CONFIG_XEN_PVHVM &x86_hyper_xen_hvm, #endif + &x86_hyper_vmware, + &x86_hyper_ms_hyperv, }; const struct hypervisor_x86 *x86_hyper; -- cgit v1.1 From 2dc98fd3206f8106520eced769781a21a20707ca Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Fri, 8 Jul 2011 21:11:16 +0000 Subject: doc: Konfig: Documentation/power/{pm => apm-acpi}.txt Signed-off-by: Michael Witten Signed-off-by: Jiri Kosina --- arch/x86/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da34972..ab03e88 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1737,8 +1737,8 @@ menuconfig APM machines with more than one CPU. In order to use APM, you will need supporting software. For location - and more information, read and the - Battery Powered Linux mini-HOWTO, available from + and more information, read + and the Battery Powered Linux mini-HOWTO, available from . This driver does not spin down disk drives (see the hdparm(8) -- cgit v1.1 From 5da0ef9a8554a8d03dc880a53f213289fe7b576d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 Jul 2011 10:34:32 +0200 Subject: x86: Disable AMD_NUMA for 32bit for now Commit 2706a0bf7b ("x86, NUMA: Enable CONFIG_AMD_NUMA on 32bit too") enabled AMD NUMA for 32bit too. Unfortunately, SPARSEMEM on 32bit had rather coarse (512MiB) addr->node mapping granularity due to lack of space in page->flags. This led to boot failure on certain AMD NUMA machines which had 128MiB alignment on nodes. Patches to properly detect this condition and reject NUMA configuration are posted[1] but deemed too pervasive for merge at this point (-rc6). Disable AMD NUMA for 32bit for now and re-enable once the detection logic is merged. [1] http://thread.gmane.org/gmane.linux.kernel/1161279/focus=1162583 Reported-by: Hans Rosenfeld Signed-off-by: Tejun Heo Cc: Conny Seidel Link: http://lkml.kernel.org/r/20110711083432.GC943@htj.dyndns.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da34972..37357a5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1170,7 +1170,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" - depends on NUMA && PCI + depends on X86_64 && NUMA && PCI ---help--- Enable AMD NUMA node topology detection. You should say Y here if you have a multi processor AMD system. This uses an old method to -- cgit v1.1 From bd6a46e087571897f0b2736917500b97d18dac13 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Fri, 8 Jul 2011 18:46:36 +0000 Subject: x86, ioapic: Format clean up for IOAPIC output When IOAPIC data is displayed in "dmesg" with the help of the boot parameter "apic=debug" certain values are not formatted correctly wrt their size. In the "dmesg" snippet below, note that the output for "max redirection entries", and "IO APIC version" which are each defined to be just 8-bits long are displayed as 2 bytes in length. Similarly, "Dst" under the "IRQ redirection table" should only be 8-bits long. IO APIC #0...... ... ... .... register #01: 00170020 ....... : max redirection entries: 0017 ....... : PRQ implemented: 0 ....... : IO APIC version: 0020 ... ... .... IRQ redirection table: NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect: 00 000 1 0 0 0 0 0 0 00 01 000 0 0 0 0 0 0 0 31 02 000 0 0 0 0 0 0 0 30 03 000 1 0 0 0 0 0 0 33 ... ... Do some formatting clean up, so you will see output like below: IO APIC #0...... ... ... .... register #01: 00170020 ....... : max redirection entries: 17 ....... : PRQ implemented: 0 ....... : IO APIC version: 20 ... ... .... IRQ redirection table: NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect: 00 00 1 0 0 0 0 0 0 00 01 00 0 0 0 0 0 0 0 31 02 00 0 0 0 0 0 0 0 30 03 00 1 0 0 0 0 0 0 33 ... ... Signed-off-by: Naga Chumbalkar Link: http://lkml.kernel.org/r/20110708184557.2734.61830.sendpatchset@nchumbalkar.americas.cpqcorp.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 98c8d7e..ed4abaa 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1522,10 +1522,12 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + printk(KERN_DEBUG "....... : max redirection entries: %02X\n", + reg_01.bits.entries); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + printk(KERN_DEBUG "....... : IO APIC version: %02X\n", + reg_01.bits.version); /* * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, @@ -1558,7 +1560,7 @@ __apicdebuginit(void) print_IO_APIC(void) entry = ioapic_read_entry(apic, i); - printk(KERN_DEBUG " %02x %03X ", + printk(KERN_DEBUG " %02x %02X ", i, entry.dest ); -- cgit v1.1 From 7fece83235a59b15d75d6c8ef2225c24abd4505b Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Fri, 8 Jul 2011 18:46:42 +0000 Subject: x86, ioapic: Also print Dest field The code in setup_ioapic_irq() determines the Destination Field, so why not also include it in the debug printk output that gets displayed when the boot parameter "apic=debug" is used. Before the change, "dmesg" will show: IOAPIC[0]: Set routing entry (8-1 -> 0x31 -> IRQ 1 Mode:0 Active:0) IOAPIC[0]: Set routing entry (8-2 -> 0x30 -> IRQ 0 Mode:0 Active:0) IOAPIC[0]: Set routing entry (8-3 -> 0x33 -> IRQ 3 Mode:0 Active:0) ... After the change, you will see: IOAPIC[0]: Set routing entry (8-1 -> 0x31 -> IRQ 1 Mode:0 Active:0 Dest:0) IOAPIC[0]: Set routing entry (8-2 -> 0x30 -> IRQ 0 Mode:0 Active:0 Dest:0) IOAPIC[0]: Set routing entry (8-3 -> 0x33 -> IRQ 3 Mode:0 Active:0 Dest:0) ... Signed-off-by: Naga Chumbalkar Link: http://lkml.kernel.org/r/20110708184603.2734.91071.sendpatchset@nchumbalkar.americas.cpqcorp.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ed4abaa..5cba200 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1337,9 +1337,9 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i)\n", + "IRQ %d Mode:%i Active:%i Dest:%d)\n", apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector, - irq, trigger, polarity); + irq, trigger, polarity, dest); if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry, -- cgit v1.1 From fef6e26208879f76bada77c11c80d56ebacb32e4 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 6 Jul 2011 10:16:21 -0400 Subject: xen/pci: Shuffle code around. The file is hard to read. Move the code around so that the contents of it follows a uniform format: - setup GSIs - PV, HVM, and initial domain case - then MSI/MSI-x setup - PV, HVM and then initial domain case. - then MSI/MSI-x teardown - same order. - lastly, the __init functions in PV, HVM, and initial domain order. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 346 ++++++++++++++++++++++++++--------------------------- 1 file changed, 173 insertions(+), 173 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index f567965..ba4077b 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -19,6 +19,43 @@ #include #include +static int xen_pcifront_enable_irq(struct pci_dev *dev) +{ + int rc; + int share = 1; + int pirq; + u8 gsi; + + rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n", + rc); + return rc; + } + + rc = xen_allocate_pirq_gsi(gsi); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n", + gsi, rc); + return rc; + } + pirq = rc; + + if (gsi < NR_IRQS_LEGACY) + share = 0; + + rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront"); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n", + gsi, pirq, rc); + return rc; + } + + dev->irq = rc; + dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq); + return 0; +} + #ifdef CONFIG_ACPI static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, int trigger, int polarity) @@ -58,6 +95,87 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, } #endif +#ifdef CONFIG_XEN_DOM0 +static int xen_register_pirq(u32 gsi, int gsi_override, int triggering) +{ + int rc, pirq, irq = -1; + struct physdev_map_pirq map_irq; + int shareable = 0; + char *name; + + if (!xen_pv_domain()) + return -1; + + if (triggering == ACPI_EDGE_SENSITIVE) { + shareable = 0; + name = "ioapic-edge"; + } else { + shareable = 1; + name = "ioapic-level"; + } + pirq = xen_allocate_pirq_gsi(gsi); + if (pirq < 0) + goto out; + + if (gsi_override >= 0) + irq = xen_bind_pirq_gsi_to_irq(gsi_override, pirq, shareable, name); + else + irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); + if (irq < 0) + goto out; + + printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi); + + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_GSI; + map_irq.index = gsi; + map_irq.pirq = pirq; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + printk(KERN_WARNING "xen map irq failed %d\n", rc); + return -1; + } + +out: + return irq; +} + +static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity) +{ + int rc, irq; + struct physdev_setup_gsi setup_gsi; + + if (!xen_pv_domain()) + return -1; + + printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", + gsi, triggering, polarity); + + irq = xen_register_pirq(gsi, gsi_override, triggering); + + setup_gsi.gsi = gsi; + setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1); + setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); + if (rc == -EEXIST) + printk(KERN_INFO "Already setup the GSI :%d\n", gsi); + else if (rc) { + printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n", + gsi, rc); + } + + return irq; +} + +static int acpi_register_gsi_xen(struct device *dev, u32 gsi, + int trigger, int polarity) +{ + return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity); +} +#endif + #if defined(CONFIG_PCI_MSI) #include #include @@ -65,6 +183,46 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, struct xen_pci_frontend_ops *xen_pci_frontend; EXPORT_SYMBOL_GPL(xen_pci_frontend); +/* + * For MSI interrupts we have to use drivers/xen/event.s functions to + * allocate an irq_desc and setup the right */ +static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int irq, ret, i; + struct msi_desc *msidesc; + int *v; + + v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL); + if (!v) + return -ENOMEM; + + if (type == PCI_CAP_ID_MSIX) + ret = xen_pci_frontend_enable_msix(dev, v, nvec); + else + ret = xen_pci_frontend_enable_msi(dev, v); + if (ret) + goto error; + i = 0; + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, + (type == PCI_CAP_ID_MSIX) ? + "pcifront-msi-x" : + "pcifront-msi", + DOMID_SELF); + if (irq < 0) + goto free; + i++; + } + kfree(v); + return 0; + +error: + dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); +free: + kfree(v); + return ret; +} + #define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \ MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0)) @@ -123,67 +281,6 @@ error: return -ENODEV; } -/* - * For MSI interrupts we have to use drivers/xen/event.s functions to - * allocate an irq_desc and setup the right */ - - -static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - int irq, ret, i; - struct msi_desc *msidesc; - int *v; - - v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL); - if (!v) - return -ENOMEM; - - if (type == PCI_CAP_ID_MSIX) - ret = xen_pci_frontend_enable_msix(dev, v, nvec); - else - ret = xen_pci_frontend_enable_msi(dev, v); - if (ret) - goto error; - i = 0; - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, - (type == PCI_CAP_ID_MSIX) ? - "pcifront-msi-x" : - "pcifront-msi", - DOMID_SELF); - if (irq < 0) - goto free; - i++; - } - kfree(v); - return 0; - -error: - dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); -free: - kfree(v); - return ret; -} - -static void xen_teardown_msi_irqs(struct pci_dev *dev) -{ - struct msi_desc *msidesc; - - msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); - if (msidesc->msi_attrib.is_msix) - xen_pci_frontend_disable_msix(dev); - else - xen_pci_frontend_disable_msi(dev); - - /* Free the IRQ's and the msidesc using the generic code. */ - default_teardown_msi_irqs(dev); -} - -static void xen_teardown_msi_irq(unsigned int irq) -{ - xen_destroy_irq(irq); -} - #ifdef CONFIG_XEN_DOM0 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { @@ -242,44 +339,28 @@ out: return ret; } #endif -#endif -static int xen_pcifront_enable_irq(struct pci_dev *dev) +static void xen_teardown_msi_irqs(struct pci_dev *dev) { - int rc; - int share = 1; - int pirq; - u8 gsi; + struct msi_desc *msidesc; - rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); - if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n", - rc); - return rc; - } + msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); + if (msidesc->msi_attrib.is_msix) + xen_pci_frontend_disable_msix(dev); + else + xen_pci_frontend_disable_msi(dev); - rc = xen_allocate_pirq_gsi(gsi); - if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n", - gsi, rc); - return rc; - } - pirq = rc; + /* Free the IRQ's and the msidesc using the generic code. */ + default_teardown_msi_irqs(dev); +} - if (gsi < NR_IRQS_LEGACY) - share = 0; +static void xen_teardown_msi_irq(unsigned int irq) +{ + xen_destroy_irq(irq); +} - rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront"); - if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n", - gsi, pirq, rc); - return rc; - } +#endif - dev->irq = rc; - dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq); - return 0; -} int __init pci_xen_init(void) { @@ -327,79 +408,6 @@ int __init pci_xen_hvm_init(void) } #ifdef CONFIG_XEN_DOM0 -static int xen_register_pirq(u32 gsi, int gsi_override, int triggering) -{ - int rc, pirq, irq = -1; - struct physdev_map_pirq map_irq; - int shareable = 0; - char *name; - - if (!xen_pv_domain()) - return -1; - - if (triggering == ACPI_EDGE_SENSITIVE) { - shareable = 0; - name = "ioapic-edge"; - } else { - shareable = 1; - name = "ioapic-level"; - } - pirq = xen_allocate_pirq_gsi(gsi); - if (pirq < 0) - goto out; - - if (gsi_override >= 0) - irq = xen_bind_pirq_gsi_to_irq(gsi_override, pirq, shareable, name); - else - irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); - if (irq < 0) - goto out; - - printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi); - - map_irq.domid = DOMID_SELF; - map_irq.type = MAP_PIRQ_TYPE_GSI; - map_irq.index = gsi; - map_irq.pirq = pirq; - - rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); - if (rc) { - printk(KERN_WARNING "xen map irq failed %d\n", rc); - return -1; - } - -out: - return irq; -} - -static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity) -{ - int rc, irq; - struct physdev_setup_gsi setup_gsi; - - if (!xen_pv_domain()) - return -1; - - printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", - gsi, triggering, polarity); - - irq = xen_register_pirq(gsi, gsi_override, triggering); - - setup_gsi.gsi = gsi; - setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1); - setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - - rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); - if (rc == -EEXIST) - printk(KERN_INFO "Already setup the GSI :%d\n", gsi); - else if (rc) { - printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n", - gsi, rc); - } - - return irq; -} - static __init void xen_setup_acpi_sci(void) { int rc; @@ -447,12 +455,6 @@ static __init void xen_setup_acpi_sci(void) return; } -static int acpi_register_gsi_xen(struct device *dev, u32 gsi, - int trigger, int polarity) -{ - return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity); -} - static int __init pci_xen_initial_domain(void) { #ifdef CONFIG_PCI_MSI @@ -493,9 +495,7 @@ void __init xen_setup_pirqs(void) trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE); } } -#endif -#ifdef CONFIG_XEN_DOM0 struct xen_device_domain_owner { domid_t domain; struct pci_dev *dev; -- cgit v1.1 From 996c34aee3525c0ef91052af0e425e87d83ba6e0 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 6 Jun 2011 12:22:23 -0400 Subject: xen/pci: Update comments and fix empty spaces. Update the out-dated comment at the beginning of the file. Also provide the copyrights of folks who have been contributing to this code lately. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index ba4077b..76b3980 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -1,8 +1,13 @@ /* - * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux - * x86 PCI core to support the Xen PCI Frontend + * Xen PCI - handle PCI (INTx) and MSI infrastructure calls for PV, HVM and + * initial domain support. We also handle the DSDT _PRT callbacks for GSI's + * used in HVM and initial domain mode (PV does not parse ACPI, so it has no + * concept of GSIs). Under PV we hook under the pnbbios API for IRQs and + * 0xcf8 PCI configuration read/write. * * Author: Ryan Wilson + * Konrad Rzeszutek Wilk + * Stefano Stabellini */ #include #include @@ -183,9 +188,6 @@ static int acpi_register_gsi_xen(struct device *dev, u32 gsi, struct xen_pci_frontend_ops *xen_pci_frontend; EXPORT_SYMBOL_GPL(xen_pci_frontend); -/* - * For MSI interrupts we have to use drivers/xen/event.s functions to - * allocate an irq_desc and setup the right */ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { int irq, ret, i; @@ -361,7 +363,6 @@ static void xen_teardown_msi_irq(unsigned int irq) #endif - int __init pci_xen_init(void) { if (!xen_pv_domain() || xen_initial_domain()) @@ -427,7 +428,7 @@ static __init void xen_setup_acpi_sci(void) } trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; - + printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d " "polarity=%d\n", gsi, trigger, polarity); -- cgit v1.1 From d92edd814e3c9d9105de55b14c8958b1f8f20269 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 6 Jul 2011 10:41:47 -0400 Subject: xen/pci: Provide #ifdef CONFIG_ACPI to easy code squashing. In the past we would guard those code segments to be dependent on CONFIG_XEN_DOM0 (which depends on CONFIG_ACPI) so this patch is not stricly necessary. But the next patch will merge common HVM and initial domain code and we want to make sure the CONFIG_ACPI dependency is preserved - as HVM code does not depend on CONFIG_XEN_DOM0. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 76b3980..7ee39cc 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -101,6 +101,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, #endif #ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_ACPI static int xen_register_pirq(u32 gsi, int gsi_override, int triggering) { int rc, pirq, irq = -1; @@ -180,6 +181,7 @@ static int acpi_register_gsi_xen(struct device *dev, u32 gsi, return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity); } #endif +#endif #if defined(CONFIG_PCI_MSI) #include @@ -409,6 +411,7 @@ int __init pci_xen_hvm_init(void) } #ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_ACPI static __init void xen_setup_acpi_sci(void) { int rc; @@ -455,16 +458,17 @@ static __init void xen_setup_acpi_sci(void) return; } - +#endif static int __init pci_xen_initial_domain(void) { #ifdef CONFIG_PCI_MSI x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; #endif +#ifdef CONFIG_ACPI xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; - +#endif return 0; } @@ -484,7 +488,7 @@ void __init xen_setup_pirqs(void) } return; } - +#ifdef CONFIG_ACPI /* Pre-allocate legacy irqs */ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { int trigger, polarity; @@ -495,6 +499,7 @@ void __init xen_setup_pirqs(void) xen_register_pirq(irq, -1 /* no GSI override */, trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE); } +#endif } struct xen_device_domain_owner { -- cgit v1.1 From 30bd35edfd5c82147bdcf0540c6bd3cf92d101f8 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 6 Jul 2011 10:48:22 -0400 Subject: xen/pci: In xen_register_pirq bind the GSI to the IRQ after the hypercall. Not before .. also that code segment starts looking like the HVM one. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 7ee39cc..6b7d849 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -112,26 +112,10 @@ static int xen_register_pirq(u32 gsi, int gsi_override, int triggering) if (!xen_pv_domain()) return -1; - if (triggering == ACPI_EDGE_SENSITIVE) { - shareable = 0; - name = "ioapic-edge"; - } else { - shareable = 1; - name = "ioapic-level"; - } pirq = xen_allocate_pirq_gsi(gsi); if (pirq < 0) goto out; - if (gsi_override >= 0) - irq = xen_bind_pirq_gsi_to_irq(gsi_override, pirq, shareable, name); - else - irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); - if (irq < 0) - goto out; - - printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi); - map_irq.domid = DOMID_SELF; map_irq.type = MAP_PIRQ_TYPE_GSI; map_irq.index = gsi; @@ -143,6 +127,22 @@ static int xen_register_pirq(u32 gsi, int gsi_override, int triggering) return -1; } + if (triggering == ACPI_EDGE_SENSITIVE) { + shareable = 0; + name = "ioapic-edge"; + } else { + shareable = 1; + name = "ioapic-level"; + } + + if (gsi_override >= 0) + gsi = gsi_override; + + irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); + if (irq < 0) + goto out; + + printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi); out: return irq; } -- cgit v1.1 From ed89eb6396b3307bf9aaa4785f6a0914a68040cf Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 6 Jul 2011 12:42:43 -0400 Subject: xen/pci: Use the xen_register_pirq for HVM and initial domain users .. to cut down on the code duplicity. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 75 +++++++++++++++++------------------------------------- 1 file changed, 23 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 6b7d849..55c8cc3 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -62,60 +62,19 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) } #ifdef CONFIG_ACPI -static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, - int trigger, int polarity) +static int xen_register_pirq(u32 gsi, int gsi_override, int triggering, + bool alloc_pirq) { - int rc, irq; + int rc, pirq = -1, irq = -1; struct physdev_map_pirq map_irq; int shareable = 0; char *name; - if (!xen_hvm_domain()) - return -1; - - map_irq.domid = DOMID_SELF; - map_irq.type = MAP_PIRQ_TYPE_GSI; - map_irq.index = gsi; - map_irq.pirq = -1; - - rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); - if (rc) { - printk(KERN_WARNING "xen map irq failed %d\n", rc); - return -1; - } - - if (trigger == ACPI_EDGE_SENSITIVE) { - shareable = 0; - name = "ioapic-edge"; - } else { - shareable = 1; - name = "ioapic-level"; + if (alloc_pirq) { + pirq = xen_allocate_pirq_gsi(gsi); + if (pirq < 0) + goto out; } - - irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name); - - printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); - - return irq; -} -#endif - -#ifdef CONFIG_XEN_DOM0 -#ifdef CONFIG_ACPI -static int xen_register_pirq(u32 gsi, int gsi_override, int triggering) -{ - int rc, pirq, irq = -1; - struct physdev_map_pirq map_irq; - int shareable = 0; - char *name; - - if (!xen_pv_domain()) - return -1; - - pirq = xen_allocate_pirq_gsi(gsi); - if (pirq < 0) - goto out; - map_irq.domid = DOMID_SELF; map_irq.type = MAP_PIRQ_TYPE_GSI; map_irq.index = gsi; @@ -138,15 +97,26 @@ static int xen_register_pirq(u32 gsi, int gsi_override, int triggering) if (gsi_override >= 0) gsi = gsi_override; - irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); + irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name); if (irq < 0) goto out; - printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi); + printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", map_irq.pirq, irq, gsi); out: return irq; } +static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, + int trigger, int polarity) +{ + if (!xen_hvm_domain()) + return -1; + + return xen_register_pirq(gsi, -1 /* no GSI override */, + trigger, false /* no PIRQ allocation */); +} + +#ifdef CONFIG_XEN_DOM0 static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity) { int rc, irq; @@ -158,7 +128,7 @@ static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polar printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", gsi, triggering, polarity); - irq = xen_register_pirq(gsi, gsi_override, triggering); + irq = xen_register_pirq(gsi, gsi_override, triggering, true); setup_gsi.gsi = gsi; setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1); @@ -497,7 +467,8 @@ void __init xen_setup_pirqs(void) continue; xen_register_pirq(irq, -1 /* no GSI override */, - trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE); + trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE, + true /* allocate IRQ */); } #endif } -- cgit v1.1 From a0ee05670915006564962114d4211dd578a8b28a Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 9 Jun 2011 09:49:13 -0400 Subject: xen/pci: Squash pci_xen_initial_domain and xen_setup_pirqs together. Since they are only called once and the rest of the pci_xen_* functions follow the same pattern of setup. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/pci.h | 5 +++-- arch/x86/pci/xen.c | 17 ++++++----------- 2 files changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 4fbda9a..968d57d 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -14,13 +14,14 @@ static inline int pci_xen_hvm_init(void) } #endif #if defined(CONFIG_XEN_DOM0) -void __init xen_setup_pirqs(void); +int __init pci_xen_initial_domain(void); int xen_find_device_domain_owner(struct pci_dev *dev); int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); int xen_unregister_device_domain_owner(struct pci_dev *dev); #else -static inline void __init xen_setup_pirqs(void) +static inline int __init pci_xen_initial_domain(void) { + return -1; } static inline int xen_find_device_domain_owner(struct pci_dev *dev) { diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 55c8cc3..54d5f31 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -429,8 +429,11 @@ static __init void xen_setup_acpi_sci(void) return; } #endif -static int __init pci_xen_initial_domain(void) + +int __init pci_xen_initial_domain(void) { + int pirq, irq; + #ifdef CONFIG_PCI_MSI x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; @@ -439,15 +442,6 @@ static int __init pci_xen_initial_domain(void) xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; #endif - return 0; -} - -void __init xen_setup_pirqs(void) -{ - int pirq, irq; - - pci_xen_initial_domain(); - if (0 == nr_ioapics) { for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { pirq = xen_allocate_pirq_gsi(irq); @@ -456,7 +450,7 @@ void __init xen_setup_pirqs(void) break; irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic"); } - return; + return 0; } #ifdef CONFIG_ACPI /* Pre-allocate legacy irqs */ @@ -471,6 +465,7 @@ void __init xen_setup_pirqs(void) true /* allocate IRQ */); } #endif + return 0; } struct xen_device_domain_owner { -- cgit v1.1 From 9b6519db5e226c0c83acddf788b7091b751fbb75 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 6 Jun 2011 14:20:35 -0400 Subject: xen/pci: Move the allocation of IRQs when there are no IOAPIC's to the end .. which means we can preset of NR_IRQS_LEGACY interrupts using the 'acpi_get_override_irq' API before this loop. This means that we can get the IRQ's polarity (and trigger) from either the ACPI (or MP); or use the default values. This fixes a bug if we did not have an IOAPIC we would not been able to preset the IRQ's polarity if the MP table existed. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 54d5f31..e585bf5 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -441,18 +441,6 @@ int __init pci_xen_initial_domain(void) #ifdef CONFIG_ACPI xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; -#endif - if (0 == nr_ioapics) { - for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { - pirq = xen_allocate_pirq_gsi(irq); - if (WARN(pirq < 0, - "Could not allocate PIRQ for legacy interrupt\n")) - break; - irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic"); - } - return 0; - } -#ifdef CONFIG_ACPI /* Pre-allocate legacy irqs */ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { int trigger, polarity; @@ -465,6 +453,15 @@ int __init pci_xen_initial_domain(void) true /* allocate IRQ */); } #endif + if (0 == nr_ioapics) { + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { + pirq = xen_allocate_pirq_gsi(irq); + if (WARN(pirq < 0, + "Could not allocate PIRQ for legacy interrupt\n")) + break; + irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic"); + } + } return 0; } -- cgit v1.1 From 34b1d1269d9fdaa1558e3014c3cc03ac0967de95 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 15 Jun 2011 14:43:52 -0400 Subject: xen/pci: Retire unnecessary #ifdef CONFIG_ACPI As the code paths that are guarded by CONFIG_XEN_DOM0 already depend on CONFIG_ACPI so the extra #ifdef is not required. The earlier patch that added them in had done its job. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index e585bf5..6eddc52 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -381,7 +381,6 @@ int __init pci_xen_hvm_init(void) } #ifdef CONFIG_XEN_DOM0 -#ifdef CONFIG_ACPI static __init void xen_setup_acpi_sci(void) { int rc; @@ -428,7 +427,6 @@ static __init void xen_setup_acpi_sci(void) return; } -#endif int __init pci_xen_initial_domain(void) { @@ -438,7 +436,6 @@ int __init pci_xen_initial_domain(void) x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; #endif -#ifdef CONFIG_ACPI xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; /* Pre-allocate legacy irqs */ @@ -452,7 +449,6 @@ int __init pci_xen_initial_domain(void) trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE, true /* allocate IRQ */); } -#endif if (0 == nr_ioapics) { for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { pirq = xen_allocate_pirq_gsi(irq); -- cgit v1.1 From 78316ada2222b5e3abc043eea7644e12319042d6 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 6 Jul 2011 15:15:23 -0400 Subject: xen/pci: Remove 'xen_allocate_pirq_gsi'. In the past (2.6.38) the 'xen_allocate_pirq_gsi' would allocate an entry in a Linux IRQ -> {XEN_IRQ, type, event, ..} array. All of that has been removed in 2.6.39 and the Xen IRQ subsystem uses an linked list that is populated when the call to 'xen_allocate_irq_gsi' (universally done from any of the xen_bind_* calls) is done. The 'xen_allocate_pirq_gsi' is a NOP and there is no need for it anymore so lets remove it. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 6eddc52..f07c419 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -37,14 +37,8 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) rc); return rc; } - - rc = xen_allocate_pirq_gsi(gsi); - if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n", - gsi, rc); - return rc; - } - pirq = rc; + /* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/ + pirq = gsi; if (gsi < NR_IRQS_LEGACY) share = 0; @@ -63,18 +57,16 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) #ifdef CONFIG_ACPI static int xen_register_pirq(u32 gsi, int gsi_override, int triggering, - bool alloc_pirq) + bool set_pirq) { int rc, pirq = -1, irq = -1; struct physdev_map_pirq map_irq; int shareable = 0; char *name; - if (alloc_pirq) { - pirq = xen_allocate_pirq_gsi(gsi); - if (pirq < 0) - goto out; - } + if (set_pirq) + pirq = gsi; + map_irq.domid = DOMID_SELF; map_irq.type = MAP_PIRQ_TYPE_GSI; map_irq.index = gsi; @@ -112,8 +104,8 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, if (!xen_hvm_domain()) return -1; - return xen_register_pirq(gsi, -1 /* no GSI override */, - trigger, false /* no PIRQ allocation */); + return xen_register_pirq(gsi, -1 /* no GSI override */, trigger, + false /* no mapping of GSI to PIRQ */); } #ifdef CONFIG_XEN_DOM0 @@ -430,7 +422,7 @@ static __init void xen_setup_acpi_sci(void) int __init pci_xen_initial_domain(void) { - int pirq, irq; + int irq; #ifdef CONFIG_PCI_MSI x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; @@ -447,16 +439,11 @@ int __init pci_xen_initial_domain(void) xen_register_pirq(irq, -1 /* no GSI override */, trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE, - true /* allocate IRQ */); + true /* Map GSI to PIRQ */); } if (0 == nr_ioapics) { - for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { - pirq = xen_allocate_pirq_gsi(irq); - if (WARN(pirq < 0, - "Could not allocate PIRQ for legacy interrupt\n")) - break; - irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic"); - } + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) + xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic"); } return 0; } -- cgit v1.1 From 97ffab1f14638d2c95ad986ce735481d164a0bd2 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 6 Jul 2011 13:03:35 -0400 Subject: xen/pci: Use 'acpi_gsi_to_irq' value unconditionally. In the past we would only use the function's value if the returned value was not equal to 'acpi_sci_override_gsi'. Meaning that the INT_SRV_OVR values for global and source irq were different. But it is OK to use the function's value even when the global and source irq are the same. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index f07c419..1017c7b 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -407,10 +407,9 @@ static __init void xen_setup_acpi_sci(void) * the ACPI interpreter and keels over since IRQ 9 has not been * setup as we had setup IRQ 20 for it). */ - /* Check whether the GSI != IRQ */ if (acpi_gsi_to_irq(gsi, &irq) == 0) { - if (irq >= 0 && irq != gsi) - /* Bugger, we MUST have that IRQ. */ + /* Use the provided value if it's valid. */ + if (irq >= 0) gsi_override = irq; } -- cgit v1.1 From 3c52b7bf6967e53dec3fbbcf99fee65e49e600df Mon Sep 17 00:00:00 2001 From: Raghavendra D Prabhu Date: Sat, 9 Jul 2011 21:59:07 +0530 Subject: xen:pvhvm: Modpost section mismatch fix Removing __init from check_platform_magic since it is called by xen_unplug_emulated_devices in non-init contexts (It probably gets inlined because of -finline-functions-called-once, removing __init is more to avoid mismatch being reported). Signed-off-by: Raghavendra D Prabhu Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/platform-pci-unplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 25c52f9..ffcf261 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); #ifdef CONFIG_XEN_PVHVM static int xen_emul_unplug; -static int __init check_platform_magic(void) +static int check_platform_magic(void) { short magic; char protocol; -- cgit v1.1 From a63fdc5156f2ef5690b6cf03d72b0c4917efbba7 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 14 Jun 2011 10:57:50 +1000 Subject: mm: Move definition of MIN_MEMORY_BLOCK_SIZE to a header The macro MIN_MEMORY_BLOCK_SIZE is currently defined twice in two .c files, and I need it in a third one to fix a powerpc bug, so let's first move it into a header Signed-off-by: Benjamin Herrenschmidt Acked-by: Ingo Molnar --- arch/x86/mm/init_64.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d865c4ae..bbaaa00 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -895,8 +896,6 @@ const char *arch_vma_name(struct vm_area_struct *vma) } #ifdef CONFIG_X86_UV -#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS) - unsigned long memory_block_size_bytes(void) { if (is_uv_system()) { -- cgit v1.1 From 67cbc90db5c0f03aa5e54204c388403ec8c25862 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 15 May 2011 00:54:58 +0900 Subject: KVM: x86 emulator: Place insn_fetch helpers together The two macros need special care to use: Assume rc, ctxt, ops and done exist outside of them. Can goto outside. Considering the fact that these are used only in decode functions, moving these right after do_insn_fetch() seems to be a right thing to improve the readability. We also rename do_fetch_insn_byte() to do_insn_fetch_byte() to be consistent. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index adc9867..6e4722c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -407,23 +407,6 @@ struct gprefix { } \ } while (0) -/* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _eip) \ -({ unsigned long _x; \ - rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ - if (rc != X86EMUL_CONTINUE) \ - goto done; \ - (_eip) += (_size); \ - (_type)_x; \ -}) - -#define insn_fetch_arr(_arr, _size, _eip) \ -({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ - if (rc != X86EMUL_CONTINUE) \ - goto done; \ - (_eip) += (_size); \ -}) - static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) @@ -671,7 +654,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); } -static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, +static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) { @@ -707,13 +690,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, if (eip + size - ctxt->eip > 15) return X86EMUL_UNHANDLEABLE; while (size--) { - rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); + rc = do_insn_fetch_byte(ctxt, ops, eip++, dest++); if (rc != X86EMUL_CONTINUE) return rc; } return X86EMUL_CONTINUE; } +/* Fetch next part of the instruction being emulated. */ +#define insn_fetch(_type, _size, _eip) \ +({ unsigned long _x; \ + rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ + if (rc != X86EMUL_CONTINUE) \ + goto done; \ + (_eip) += (_size); \ + (_type)_x; \ +}) + +#define insn_fetch_arr(_arr, _size, _eip) \ +({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ + if (rc != X86EMUL_CONTINUE) \ + goto done; \ + (_eip) += (_size); \ +}) + /* * Given the 'reg' portion of a ModRM byte, and a register block, return a * pointer into the block that addresses the relevant register. -- cgit v1.1 From ef5d75cc9af2bca7c525158666b5f9696846ffb6 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 15 May 2011 00:57:43 +0900 Subject: KVM: x86 emulator: Stop passing ctxt->ops as arg of decode helpers Dereference it in the actual users: only do_insn_fetch_byte(). This is consistent with the way __linearize() dereferences it. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6e4722c..df9082c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -655,7 +655,6 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, } static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) { struct fetch_cache *fc = &ctxt->decode.fetch; @@ -670,8 +669,8 @@ static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, rc = __linearize(ctxt, addr, size, false, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->fetch(ctxt, linear, fc->data + cur_size, - size, &ctxt->exception); + rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, + size, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; fc->end += size; @@ -681,7 +680,6 @@ static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, } static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, unsigned long eip, void *dest, unsigned size) { int rc; @@ -690,7 +688,7 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, if (eip + size - ctxt->eip > 15) return X86EMUL_UNHANDLEABLE; while (size--) { - rc = do_insn_fetch_byte(ctxt, ops, eip++, dest++); + rc = do_insn_fetch_byte(ctxt, eip++, dest++); if (rc != X86EMUL_CONTINUE) return rc; } @@ -700,7 +698,7 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ - rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ + rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ (_eip) += (_size); \ @@ -708,7 +706,7 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, }) #define insn_fetch_arr(_arr, _size, _eip) \ -({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ +({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ (_eip) += (_size); \ @@ -887,7 +885,6 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, } static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct operand *op) { struct decode_cache *c = &ctxt->decode; @@ -1014,7 +1011,6 @@ done: } static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct operand *op) { struct decode_cache *c = &ctxt->decode; @@ -3327,7 +3323,6 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, unsigned size, bool sign_extension) { struct decode_cache *c = &ctxt->decode; - struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; op->type = OP_IMM; @@ -3362,10 +3357,8 @@ done: return rc; } -int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) { - struct x86_emulate_ops *ops = ctxt->ops; struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; @@ -3531,11 +3524,11 @@ done_prefixes: /* ModRM and SIB bytes. */ if (c->d & ModRM) { - rc = decode_modrm(ctxt, ops, &memop); + rc = decode_modrm(ctxt, &memop); if (!c->has_seg_override) set_seg_override(c, c->modrm_seg); } else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops, &memop); + rc = decode_abs(ctxt, &memop); if (rc != X86EMUL_CONTINUE) goto done; -- cgit v1.1 From 7b105ca2903b84f023c49965d9a511c5e55256dc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 15 May 2011 01:00:52 +0900 Subject: KVM: x86 emulator: Stop passing ctxt->ops as arg of emul functions Dereference it in the actual users. This not only cleans up the emulator but also makes it easy to convert the old emulation functions to the new em_xxx() form later. Note: Remove some inline keywords to let the compiler decide inlining. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 3 +- arch/x86/kvm/emulate.c | 259 +++++++++++++++++-------------------- arch/x86/kvm/x86.c | 2 +- 3 files changed, 118 insertions(+), 146 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0049211..ab09ba2 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -373,6 +373,5 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, bool has_error_code, u32 error_code); -int emulate_int_real(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int irq); +int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index df9082c..d3f4466 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -475,13 +475,12 @@ static void set_seg_override(struct decode_cache *c, int seg) c->seg_override = seg; } -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; - return ops->get_cached_segment_base(ctxt, seg); + return ctxt->ops->get_cached_segment_base(ctxt, seg); } static unsigned seg_override(struct x86_emulate_ctxt *ctxt, @@ -570,7 +569,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, u16 sel; unsigned cpl, rpl; - la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; + la = seg_base(ctxt, addr.seg) + addr.ea; switch (ctxt->mode) { case X86EMUL_MODE_REAL: break; @@ -1052,7 +1051,6 @@ static void fetch_bit_operand(struct decode_cache *c) } static int read_emulated(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, unsigned long addr, void *dest, unsigned size) { int rc; @@ -1064,8 +1062,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n, - &ctxt->exception); + rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n, + &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -1090,7 +1088,7 @@ static int segmented_read(struct x86_emulate_ctxt *ctxt, rc = linearize(ctxt, addr, size, false, &linear); if (rc != X86EMUL_CONTINUE) return rc; - return read_emulated(ctxt, ctxt->ops, linear, data, size); + return read_emulated(ctxt, linear, data, size); } static int segmented_write(struct x86_emulate_ctxt *ctxt, @@ -1124,7 +1122,6 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, } static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, unsigned int size, unsigned short port, void *dest) { @@ -1143,7 +1140,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (n == 0) n = 1; rc->pos = rc->end = 0; - if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n)) + if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n)) return 0; rc->end = n * size; } @@ -1154,9 +1151,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, } static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 selector, struct desc_ptr *dt) { + struct x86_emulate_ops *ops = ctxt->ops; + if (selector & 1 << 2) { struct desc_struct desc; u16 sel; @@ -1173,48 +1171,42 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, /* allowed just for 8 bytes segments */ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 selector, struct desc_struct *desc) { struct desc_ptr dt; u16 index = selector >> 3; - int ret; ulong addr; - get_descriptor_table_ptr(ctxt, ops, selector, &dt); + get_descriptor_table_ptr(ctxt, selector, &dt); if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); - addr = dt.address + index * 8; - ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); - return ret; + addr = dt.address + index * 8; + return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + &ctxt->exception); } /* allowed just for 8 bytes segments */ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 selector, struct desc_struct *desc) { struct desc_ptr dt; u16 index = selector >> 3; ulong addr; - int ret; - get_descriptor_table_ptr(ctxt, ops, selector, &dt); + get_descriptor_table_ptr(ctxt, selector, &dt); if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); addr = dt.address + index * 8; - ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); - - return ret; + return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, + &ctxt->exception); } /* Does not support long mode */ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 selector, int seg) { struct desc_struct seg_desc; @@ -1249,7 +1241,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (null_selector) /* for NULL selector skip all following checks */ goto load; - ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); + ret = read_segment_descriptor(ctxt, selector, &seg_desc); if (ret != X86EMUL_CONTINUE) return ret; @@ -1267,7 +1259,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, rpl = selector & 3; dpl = seg_desc.dpl; - cpl = ops->cpl(ctxt); + cpl = ctxt->ops->cpl(ctxt); switch (seg) { case VCPU_SREG_SS: @@ -1318,12 +1310,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (seg_desc.s) { /* mark segment as accessed */ seg_desc.type |= 1; - ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); + ret = write_segment_descriptor(ctxt, selector, &seg_desc); if (ret != X86EMUL_CONTINUE) return ret; } load: - ops->set_segment(ctxt, selector, &seg_desc, 0, seg); + ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg); return X86EMUL_CONTINUE; exception: emulate_exception(ctxt, err_vec, err_code, true); @@ -1424,13 +1416,12 @@ static int em_pop(struct x86_emulate_ctxt *ctxt) } static int emulate_popf(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *dest, int len) + void *dest, int len) { int rc; unsigned long val, change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - int cpl = ops->cpl(ctxt); + int cpl = ctxt->ops->cpl(ctxt); rc = emulate_pop(ctxt, &val, len); if (rc != X86EMUL_CONTINUE) @@ -1471,11 +1462,10 @@ static int em_popf(struct x86_emulate_ctxt *ctxt) c->dst.type = OP_REG; c->dst.addr.reg = &ctxt->eflags; c->dst.bytes = c->op_bytes; - return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); + return emulate_popf(ctxt, &c->dst.val, c->op_bytes); } -static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) { struct decode_cache *c = &ctxt->decode; @@ -1484,8 +1474,7 @@ static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, return em_push(ctxt); } -static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg) { struct decode_cache *c = &ctxt->decode; unsigned long selector; @@ -1495,7 +1484,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) return rc; - rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg); + rc = load_segment_descriptor(ctxt, (u16)selector, seg); return rc; } @@ -1549,10 +1538,10 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) return rc; } -int emulate_int_real(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int irq) +int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) { struct decode_cache *c = &ctxt->decode; + struct x86_emulate_ops *ops = ctxt->ops; int rc; struct desc_ptr dt; gva_t cs_addr; @@ -1590,7 +1579,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) return rc; - rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); + rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS); if (rc != X86EMUL_CONTINUE) return rc; @@ -1599,12 +1588,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, return rc; } -static int emulate_int(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int irq) +static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) { switch(ctxt->mode) { case X86EMUL_MODE_REAL: - return emulate_int_real(ctxt, ops, irq); + return emulate_int_real(ctxt, irq); case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: case X86EMUL_MODE_PROT32: @@ -1615,8 +1603,7 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt, } } -static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; @@ -1648,7 +1635,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) return rc; - rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); + rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); if (rc != X86EMUL_CONTINUE) return rc; @@ -1669,12 +1656,11 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, return rc; } -static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops* ops) +static int emulate_iret(struct x86_emulate_ctxt *ctxt) { switch(ctxt->mode) { case X86EMUL_MODE_REAL: - return emulate_iret_real(ctxt, ops); + return emulate_iret_real(ctxt); case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: case X86EMUL_MODE_PROT32: @@ -1693,7 +1679,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) memcpy(&sel, c->src.valptr + c->op_bytes, 2); - rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS); + rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS); if (rc != X86EMUL_CONTINUE) return rc; @@ -1830,8 +1816,7 @@ static int em_grp9(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int emulate_ret_far(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; int rc; @@ -1845,12 +1830,11 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, rc = emulate_pop(ctxt, &cs, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); + rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); return rc; } -static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg) { struct decode_cache *c = &ctxt->decode; unsigned short sel; @@ -1858,7 +1842,7 @@ static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, memcpy(&sel, c->src.valptr + c->op_bytes, 2); - rc = load_segment_descriptor(ctxt, ops, sel, seg); + rc = load_segment_descriptor(ctxt, sel, seg); if (rc != X86EMUL_CONTINUE) return rc; @@ -1866,15 +1850,14 @@ static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, return rc; } -static inline void +static void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct desc_struct *cs, - struct desc_struct *ss) + struct desc_struct *cs, struct desc_struct *ss) { u16 selector; memset(cs, 0, sizeof(struct desc_struct)); - ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); + ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ @@ -1897,10 +1880,10 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, ss->p = 1; } -static int -emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +static int emulate_syscall(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; + struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; @@ -1912,7 +1895,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return emulate_ud(ctxt); ops->get_msr(ctxt, MSR_EFER, &efer); - setup_syscalls_segments(ctxt, ops, &cs, &ss); + setup_syscalls_segments(ctxt, &cs, &ss); ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; @@ -1950,16 +1933,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return X86EMUL_CONTINUE; } -static int -emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +static int emulate_sysenter(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; + struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; u64 efer = 0; - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + ops->get_msr(ctxt, MSR_EFER, &efer); /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) return emulate_gp(ctxt, 0); @@ -1970,7 +1953,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (ctxt->mode == X86EMUL_MODE_PROT64) return emulate_ud(ctxt); - setup_syscalls_segments(ctxt, ops, &cs, &ss); + setup_syscalls_segments(ctxt, &cs, &ss); ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { @@ -2006,10 +1989,10 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return X86EMUL_CONTINUE; } -static int -emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +static int emulate_sysexit(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; + struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; int usermode; @@ -2020,7 +2003,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ctxt->mode == X86EMUL_MODE_VM86) return emulate_gp(ctxt, 0); - setup_syscalls_segments(ctxt, ops, &cs, &ss); + setup_syscalls_segments(ctxt, &cs, &ss); if ((c->rex_prefix & 0x8) != 0x0) usermode = X86EMUL_MODE_PROT64; @@ -2058,8 +2041,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return X86EMUL_CONTINUE; } -static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) { int iopl; if (ctxt->mode == X86EMUL_MODE_REAL) @@ -2067,13 +2049,13 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, if (ctxt->mode == X86EMUL_MODE_VM86) return true; iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - return ops->cpl(ctxt) > iopl; + return ctxt->ops->cpl(ctxt) > iopl; } static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 port, u16 len) { + struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct tr_seg; u32 base3; int r; @@ -2104,14 +2086,13 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, } static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 port, u16 len) { if (ctxt->perm_ok) return true; - if (emulator_bad_iopl(ctxt, ops)) - if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) + if (emulator_bad_iopl(ctxt)) + if (!emulator_io_port_access_allowed(ctxt, port, len)) return false; ctxt->perm_ok = true; @@ -2120,7 +2101,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, } static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct tss_segment_16 *tss) { struct decode_cache *c = &ctxt->decode; @@ -2144,7 +2124,6 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, } static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct tss_segment_16 *tss) { struct decode_cache *c = &ctxt->decode; @@ -2175,19 +2154,19 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task */ - ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR); + ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); + ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); + ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); + ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); + ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); if (ret != X86EMUL_CONTINUE) return ret; @@ -2195,10 +2174,10 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, } static int task_switch_16(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 tss_selector, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { + struct x86_emulate_ops *ops = ctxt->ops; struct tss_segment_16 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); @@ -2209,7 +2188,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, /* FIXME: need to provide precise fault address */ return ret; - save_state_to_tss16(ctxt, ops, &tss_seg); + save_state_to_tss16(ctxt, &tss_seg); ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); @@ -2235,16 +2214,15 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, return ret; } - return load_state_from_tss16(ctxt, ops, &tss_seg); + return load_state_from_tss16(ctxt, &tss_seg); } static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct tss_segment_32 *tss) { struct decode_cache *c = &ctxt->decode; - tss->cr3 = ops->get_cr(ctxt, 3); + tss->cr3 = ctxt->ops->get_cr(ctxt, 3); tss->eip = c->eip; tss->eflags = ctxt->eflags; tss->eax = c->regs[VCPU_REGS_RAX]; @@ -2266,13 +2244,12 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, } static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct tss_segment_32 *tss) { struct decode_cache *c = &ctxt->decode; int ret; - if (ops->set_cr(ctxt, 3, tss->cr3)) + if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); c->eip = tss->eip; ctxt->eflags = tss->eflags | 2; @@ -2301,25 +2278,25 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task */ - ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR); + ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); + ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); + ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); + ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); + ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS); + ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS); if (ret != X86EMUL_CONTINUE) return ret; - ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS); + ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS); if (ret != X86EMUL_CONTINUE) return ret; @@ -2327,10 +2304,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, } static int task_switch_32(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 tss_selector, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { + struct x86_emulate_ops *ops = ctxt->ops; struct tss_segment_32 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); @@ -2341,7 +2318,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, /* FIXME: need to provide precise fault address */ return ret; - save_state_to_tss32(ctxt, ops, &tss_seg); + save_state_to_tss32(ctxt, &tss_seg); ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); @@ -2367,14 +2344,14 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, return ret; } - return load_state_from_tss32(ctxt, ops, &tss_seg); + return load_state_from_tss32(ctxt, &tss_seg); } static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { + struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct curr_tss_desc, next_tss_desc; int ret; u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); @@ -2384,10 +2361,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, /* FIXME: old_tss_base == ~0 ? */ - ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc); + ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); if (ret != X86EMUL_CONTINUE) return ret; - ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc); + ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); if (ret != X86EMUL_CONTINUE) return ret; @@ -2409,8 +2386,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ - write_segment_descriptor(ctxt, ops, old_tss_sel, - &curr_tss_desc); + write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); } if (reason == TASK_SWITCH_IRET) @@ -2422,10 +2398,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, old_tss_sel = 0xffff; if (next_tss_desc.type & 8) - ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel, + ret = task_switch_32(ctxt, tss_selector, old_tss_sel, old_tss_base, &next_tss_desc); else - ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel, + ret = task_switch_16(ctxt, tss_selector, old_tss_sel, old_tss_base, &next_tss_desc); if (ret != X86EMUL_CONTINUE) return ret; @@ -2435,8 +2411,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (reason != TASK_SWITCH_IRET) { next_tss_desc.type |= (1 << 1); /* set busy flag */ - write_segment_descriptor(ctxt, ops, tss_selector, - &next_tss_desc); + write_segment_descriptor(ctxt, tss_selector, &next_tss_desc); } ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); @@ -2458,14 +2433,13 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { - struct x86_emulate_ops *ops = ctxt->ops; struct decode_cache *c = &ctxt->decode; int rc; c->eip = ctxt->eip; c->dst.type = OP_NONE; - rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, + rc = emulator_do_task_switch(ctxt, tss_selector, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) @@ -2535,7 +2509,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) old_eip = c->eip; memcpy(&sel, c->src.valptr + c->op_bytes, 2); - if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) + if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS)) return X86EMUL_CONTINUE; c->eip = 0; @@ -2973,7 +2947,7 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt) struct decode_cache *c = &ctxt->decode; c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes)) + if (!emulator_io_permited(ctxt, c->src.val, c->dst.bytes)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -2984,7 +2958,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) struct decode_cache *c = &ctxt->decode; c->src.bytes = min(c->src.bytes, 4u); - if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes)) + if (!emulator_io_permited(ctxt, c->dst.val, c->src.bytes)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -3724,8 +3698,7 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) return false; } -int -x86_emulate_insn(struct x86_emulate_ctxt *ctxt) +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; u64 msr_data; @@ -3854,25 +3827,25 @@ special_insn: switch (c->b) { case 0x06: /* push es */ - rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); + rc = emulate_push_sreg(ctxt, VCPU_SREG_ES); break; case 0x07: /* pop es */ - rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); + rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES); break; case 0x0e: /* push cs */ - rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); + rc = emulate_push_sreg(ctxt, VCPU_SREG_CS); break; case 0x16: /* push ss */ - rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); + rc = emulate_push_sreg(ctxt, VCPU_SREG_SS); break; case 0x17: /* pop ss */ - rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); + rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS); break; case 0x1e: /* push ds */ - rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); + rc = emulate_push_sreg(ctxt, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ - rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); + rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS); break; case 0x40 ... 0x47: /* inc r16/r32 */ emulate_1op("inc", c->dst, ctxt->eflags); @@ -3938,7 +3911,7 @@ special_insn: if (c->modrm_reg == VCPU_SREG_SS) ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; - rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); + rc = load_segment_descriptor(ctxt, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -3969,13 +3942,13 @@ special_insn: rc = em_pop(ctxt); break; case 0xc4: /* les */ - rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); + rc = emulate_load_segment(ctxt, VCPU_SREG_ES); break; case 0xc5: /* lds */ - rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); + rc = emulate_load_segment(ctxt, VCPU_SREG_DS); break; case 0xcb: /* ret far */ - rc = emulate_ret_far(ctxt, ops); + rc = emulate_ret_far(ctxt); break; case 0xcc: /* int3 */ irq = 3; @@ -3983,7 +3956,7 @@ special_insn: case 0xcd: /* int n */ irq = c->src.val; do_interrupt: - rc = emulate_int(ctxt, ops, irq); + rc = emulate_int(ctxt, irq); break; case 0xce: /* into */ if (ctxt->eflags & EFLG_OF) { @@ -3992,7 +3965,7 @@ special_insn: } break; case 0xcf: /* iret */ - rc = emulate_iret(ctxt, ops); + rc = emulate_iret(ctxt); break; case 0xd0 ... 0xd1: /* Grp2 */ rc = em_grp2(ctxt); @@ -4037,7 +4010,7 @@ special_insn: case 0xec: /* in al,dx */ case 0xed: /* in (e/r)ax,dx */ do_io_in: - if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, + if (!pio_in_emulated(ctxt, c->dst.bytes, c->src.val, &c->dst.val)) goto done; /* IO is needed */ break; @@ -4065,14 +4038,14 @@ special_insn: ctxt->eflags |= EFLG_CF; break; case 0xfa: /* cli */ - if (emulator_bad_iopl(ctxt, ops)) { + if (emulator_bad_iopl(ctxt)) { rc = emulate_gp(ctxt, 0); goto done; } else ctxt->eflags &= ~X86_EFLAGS_IF; break; case 0xfb: /* sti */ - if (emulator_bad_iopl(ctxt, ops)) { + if (emulator_bad_iopl(ctxt)) { rc = emulate_gp(ctxt, 0); goto done; } else { @@ -4154,7 +4127,7 @@ done: twobyte_insn: switch (c->b) { case 0x05: /* syscall */ - rc = emulate_syscall(ctxt, ops); + rc = emulate_syscall(ctxt); break; case 0x06: rc = em_clts(ctxt); @@ -4216,10 +4189,10 @@ twobyte_insn: rc = X86EMUL_CONTINUE; break; case 0x34: /* sysenter */ - rc = emulate_sysenter(ctxt, ops); + rc = emulate_sysenter(ctxt); break; case 0x35: /* sysexit */ - rc = emulate_sysexit(ctxt, ops); + rc = emulate_sysexit(ctxt); break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; @@ -4234,10 +4207,10 @@ twobyte_insn: c->dst.val = test_cc(c->b, ctxt->eflags); break; case 0xa0: /* push fs */ - rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); + rc = emulate_push_sreg(ctxt, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ - rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); + rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS); break; case 0xa3: bt: /* bt */ @@ -4251,10 +4224,10 @@ twobyte_insn: emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; case 0xa8: /* push gs */ - rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); + rc = emulate_push_sreg(ctxt, VCPU_SREG_GS); break; case 0xa9: /* pop gs */ - rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); + rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS); break; case 0xab: bts: /* bts */ @@ -4284,17 +4257,17 @@ twobyte_insn: } break; case 0xb2: /* lss */ - rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); + rc = emulate_load_segment(ctxt, VCPU_SREG_SS); break; case 0xb3: btr: /* btr */ emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); break; case 0xb4: /* lfs */ - rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); + rc = emulate_load_segment(ctxt, VCPU_SREG_FS); break; case 0xb5: /* lgs */ - rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); + rc = emulate_load_segment(ctxt, VCPU_SREG_GS); break; case 0xb6 ... 0xb7: /* movzx */ c->dst.bytes = c->op_bytes; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 77c9d86..51df0b6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4513,7 +4513,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + inc_eip; - ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); + ret = emulate_int_real(&vcpu->arch.emulate_ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; -- cgit v1.1 From 5e520e62787afd8fc28626fd8d4f77491135119d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 15 May 2011 10:13:12 -0400 Subject: KVM: VMX: Move VMREAD cleanup to exception handler We clean up a failed VMREAD by clearing the output register. Do it in the exception handler instead of unconditionally. This is worthwhile since there are more than a hundred call sites. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 6 +++++- arch/x86/kvm/vmx.c | 8 +++++--- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d2ac8e2..db4b654 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -830,11 +830,12 @@ enum { asmlinkage void kvm_spurious_fault(void); extern bool kvm_rebooting; -#define __kvm_handle_fault_on_reboot(insn) \ +#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ "666: " insn "\n\t" \ "668: \n\t" \ ".pushsection .fixup, \"ax\" \n" \ "667: \n\t" \ + cleanup_insn "\n\t" \ "cmpb $0, kvm_rebooting \n\t" \ "jne 668b \n\t" \ __ASM_SIZE(push) " $666b \n\t" \ @@ -844,6 +845,9 @@ extern bool kvm_rebooting; _ASM_PTR " 666b, 667b \n\t" \ ".popsection" +#define __kvm_handle_fault_on_reboot(insn) \ + ____kvm_handle_fault_on_reboot(insn, "") + #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d48ec60..7d22d6c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -43,6 +43,8 @@ #include "trace.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) +#define __ex_clear(x, reg) \ + ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg) MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -587,10 +589,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) static unsigned long vmcs_readl(unsigned long field) { - unsigned long value = 0; + unsigned long value; - asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) - : "+a"(value) : "d"(field) : "cc"); + asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0") + : "=a"(value) : "d"(field) : "cc"); return value; } -- cgit v1.1 From 96304217a783a65c0923a26f54141cfe7a2a71b5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 15 May 2011 10:13:13 -0400 Subject: KVM: VMX: always_inline VMREADs vmcs_readl() and friends are really short, but gcc thinks they are long because of the out-of-line exception handlers. Mark them always_inline to clear the misunderstanding. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7d22d6c..3365e5d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -587,7 +587,7 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) } } -static unsigned long vmcs_readl(unsigned long field) +static __always_inline unsigned long vmcs_readl(unsigned long field) { unsigned long value; @@ -596,17 +596,17 @@ static unsigned long vmcs_readl(unsigned long field) return value; } -static u16 vmcs_read16(unsigned long field) +static __always_inline u16 vmcs_read16(unsigned long field) { return vmcs_readl(field); } -static u32 vmcs_read32(unsigned long field) +static __always_inline u32 vmcs_read32(unsigned long field) { return vmcs_readl(field); } -static u64 vmcs_read64(unsigned long field) +static __always_inline u64 vmcs_read64(unsigned long field) { #ifdef CONFIG_X86_64 return vmcs_readl(field); -- cgit v1.1 From 332b207d65c1d7982489dbb83e5071c95e19eb75 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 15 May 2011 23:20:27 +0800 Subject: KVM: MMU: optimize pte write path if don't have protected sp Simply return from kvm_mmu_pte_write path if no shadow page is write-protected, then we can avoid to walk all shadow pages and hold mmu-lock Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index db4b654..387780e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -441,6 +441,7 @@ struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; + unsigned int indirect_shadow_pages; atomic_t invlpg_counter; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aee3862..b4ae7af 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -498,6 +498,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) linfo = lpage_info_slot(gfn, slot, i); linfo->write_count += 1; } + kvm->arch.indirect_shadow_pages++; } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) @@ -513,6 +514,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) linfo->write_count -= 1; WARN_ON(linfo->write_count < 0); } + kvm->arch.indirect_shadow_pages--; } static int has_wrprotected_page(struct kvm *kvm, @@ -3233,6 +3235,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int level, npte, invlpg_counter, r, flooded = 0; bool remote_flush, local_flush, zap_page; + /* + * If we don't have indirect shadow pages, it means no page is + * write-protected, so we can exit simply. + */ + if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + return; + zap_page = remote_flush = local_flush = false; offset = offset_in_page(gpa); -- cgit v1.1 From 8b0cedff040b652f3d36b1368778667581b0c140 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 15 May 2011 23:22:04 +0800 Subject: KVM: use __copy_to_user/__clear_user to write guest page Simply use __copy_to_user/__clear_user to write guest page since we have already verified the user address when the memslot is set Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51df0b6..0b08986 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1388,7 +1388,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 1; kvm_x86_ops->patch_hypercall(vcpu, instructions); ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (copy_to_user((void __user *)addr, instructions, 4)) + if (__copy_to_user((void __user *)addr, instructions, 4)) return 1; kvm->arch.hv_hypercall = data; break; @@ -1415,7 +1415,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); if (kvm_is_error_hva(addr)) return 1; - if (clear_user((void __user *)addr, PAGE_SIZE)) + if (__clear_user((void __user *)addr, PAGE_SIZE)) return 1; vcpu->arch.hv_vapic = data; break; -- cgit v1.1 From 1249b96e72533ffdb2fa25b5d7471918b065ccc8 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 15 May 2011 23:25:10 +0800 Subject: KVM: fix uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: warning: ‘cs_sel’ may be used uninitialized in this function warning: ‘ss_sel’ may be used uninitialized in this function Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d3f4466..bc916bd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1996,7 +1996,7 @@ static int emulate_sysexit(struct x86_emulate_ctxt *ctxt) struct desc_struct cs, ss; u64 msr_data; int usermode; - u16 cs_sel, ss_sel; + u16 cs_sel = 0, ss_sel = 0; /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || -- cgit v1.1 From 53c07b18787d564a105e1aa678795d67eeb27447 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 15 May 2011 23:26:20 +0800 Subject: KVM: MMU: abstract the operation of rmap Abstract the operation of rmap to spte_list, then we can use it for the reverse mapping of parent pte in the later patch Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 260 +++++++++++++++++++++------------------- 2 files changed, 140 insertions(+), 122 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 387780e..e6a4a57 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -347,7 +347,7 @@ struct kvm_vcpu_arch { struct kvm_pv_mmu_op_buffer mmu_op_buffer; struct kvm_mmu_memory_cache mmu_pte_chain_cache; - struct kvm_mmu_memory_cache mmu_rmap_desc_cache; + struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_page_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b4ae7af..a6811cb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -148,7 +148,7 @@ module_param(oos_shadow, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define RMAP_EXT 4 +#define PTE_LIST_EXT 4 #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK @@ -164,9 +164,9 @@ module_param(oos_shadow, bool, 0644); #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) -struct kvm_rmap_desc { - u64 *sptes[RMAP_EXT]; - struct kvm_rmap_desc *more; +struct pte_list_desc { + u64 *sptes[PTE_LIST_EXT]; + struct pte_list_desc *more; }; struct kvm_shadow_walk_iterator { @@ -185,7 +185,7 @@ struct kvm_shadow_walk_iterator { typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); static struct kmem_cache *pte_chain_cache; -static struct kmem_cache *rmap_desc_cache; +static struct kmem_cache *pte_list_desc_cache; static struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; @@ -401,8 +401,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) pte_chain_cache, 4); if (r) goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, - rmap_desc_cache, 4 + PTE_PREFETCH_NUM); + r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, + pte_list_desc_cache, 4 + PTE_PREFETCH_NUM); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); @@ -416,8 +416,10 @@ out: static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, + pte_chain_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, + pte_list_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, mmu_page_header_cache); @@ -444,15 +446,15 @@ static void mmu_free_pte_chain(struct kvm_pte_chain *pc) kmem_cache_free(pte_chain_cache, pc); } -static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) +static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, - sizeof(struct kvm_rmap_desc)); + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, + sizeof(struct pte_list_desc)); } -static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) +static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { - kmem_cache_free(rmap_desc_cache, rd); + kmem_cache_free(pte_list_desc_cache, pte_list_desc); } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) @@ -590,67 +592,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) } /* - * Take gfn and return the reverse mapping to it. - */ - -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) -{ - struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; - - slot = gfn_to_memslot(kvm, gfn); - if (likely(level == PT_PAGE_TABLE_LEVEL)) - return &slot->rmap[gfn - slot->base_gfn]; - - linfo = lpage_info_slot(gfn, slot, level); - - return &linfo->rmap_pde; -} - -/* - * Reverse mapping data structures: + * Pte mapping structures: * - * If rmapp bit zero is zero, then rmapp point to the shadw page table entry - * that points to page_address(page). + * If pte_list bit zero is zero, then pte_list point to the spte. * - * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc - * containing more mappings. + * If pte_list bit zero is one, (then pte_list & ~1) points to a struct + * pte_list_desc containing more mappings. * - * Returns the number of rmap entries before the spte was added or zero if + * Returns the number of pte entries before the spte was added or zero if * the spte was not added. * */ -static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, + unsigned long *pte_list) { - struct kvm_mmu_page *sp; - struct kvm_rmap_desc *desc; - unsigned long *rmapp; + struct pte_list_desc *desc; int i, count = 0; - if (!is_rmap_spte(*spte)) - return count; - sp = page_header(__pa(spte)); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); - if (!*rmapp) { - rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); - *rmapp = (unsigned long)spte; - } else if (!(*rmapp & 1)) { - rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_rmap_desc(vcpu); - desc->sptes[0] = (u64 *)*rmapp; + if (!*pte_list) { + rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); + *pte_list = (unsigned long)spte; + } else if (!(*pte_list & 1)) { + rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); + desc = mmu_alloc_pte_list_desc(vcpu); + desc->sptes[0] = (u64 *)*pte_list; desc->sptes[1] = spte; - *rmapp = (unsigned long)desc | 1; + *pte_list = (unsigned long)desc | 1; ++count; } else { - rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->sptes[RMAP_EXT-1] && desc->more) { + rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); + desc = (struct pte_list_desc *)(*pte_list & ~1ul); + while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { desc = desc->more; - count += RMAP_EXT; + count += PTE_LIST_EXT; } - if (desc->sptes[RMAP_EXT-1]) { - desc->more = mmu_alloc_rmap_desc(vcpu); + if (desc->sptes[PTE_LIST_EXT-1]) { + desc->more = mmu_alloc_pte_list_desc(vcpu); desc = desc->more; } for (i = 0; desc->sptes[i]; ++i) @@ -660,59 +637,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return count; } -static void rmap_desc_remove_entry(unsigned long *rmapp, - struct kvm_rmap_desc *desc, - int i, - struct kvm_rmap_desc *prev_desc) +static u64 *pte_list_next(unsigned long *pte_list, u64 *spte) +{ + struct pte_list_desc *desc; + u64 *prev_spte; + int i; + + if (!*pte_list) + return NULL; + else if (!(*pte_list & 1)) { + if (!spte) + return (u64 *)*pte_list; + return NULL; + } + desc = (struct pte_list_desc *)(*pte_list & ~1ul); + prev_spte = NULL; + while (desc) { + for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { + if (prev_spte == spte) + return desc->sptes[i]; + prev_spte = desc->sptes[i]; + } + desc = desc->more; + } + return NULL; +} + +static void +pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, + int i, struct pte_list_desc *prev_desc) { int j; - for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) + for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) ; desc->sptes[i] = desc->sptes[j]; desc->sptes[j] = NULL; if (j != 0) return; if (!prev_desc && !desc->more) - *rmapp = (unsigned long)desc->sptes[0]; + *pte_list = (unsigned long)desc->sptes[0]; else if (prev_desc) prev_desc->more = desc->more; else - *rmapp = (unsigned long)desc->more | 1; - mmu_free_rmap_desc(desc); + *pte_list = (unsigned long)desc->more | 1; + mmu_free_pte_list_desc(desc); } -static void rmap_remove(struct kvm *kvm, u64 *spte) +static void pte_list_remove(u64 *spte, unsigned long *pte_list) { - struct kvm_rmap_desc *desc; - struct kvm_rmap_desc *prev_desc; - struct kvm_mmu_page *sp; - gfn_t gfn; - unsigned long *rmapp; + struct pte_list_desc *desc; + struct pte_list_desc *prev_desc; int i; - sp = page_header(__pa(spte)); - gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); - if (!*rmapp) { - printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); + if (!*pte_list) { + printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte); BUG(); - } else if (!(*rmapp & 1)) { - rmap_printk("rmap_remove: %p 1->0\n", spte); - if ((u64 *)*rmapp != spte) { - printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); + } else if (!(*pte_list & 1)) { + rmap_printk("pte_list_remove: %p 1->0\n", spte); + if ((u64 *)*pte_list != spte) { + printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte); BUG(); } - *rmapp = 0; + *pte_list = 0; } else { - rmap_printk("rmap_remove: %p many->many\n", spte); - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + rmap_printk("pte_list_remove: %p many->many\n", spte); + desc = (struct pte_list_desc *)(*pte_list & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) + for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) if (desc->sptes[i] == spte) { - rmap_desc_remove_entry(rmapp, + pte_list_desc_remove_entry(pte_list, desc, i, prev_desc); return; @@ -720,11 +716,59 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } - pr_err("rmap_remove: %p many->many\n", spte); + pr_err("pte_list_remove: %p many->many\n", spte); BUG(); } } +/* + * Take gfn and return the reverse mapping to it. + */ +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) +{ + struct kvm_memory_slot *slot; + struct kvm_lpage_info *linfo; + + slot = gfn_to_memslot(kvm, gfn); + if (likely(level == PT_PAGE_TABLE_LEVEL)) + return &slot->rmap[gfn - slot->base_gfn]; + + linfo = lpage_info_slot(gfn, slot, level); + + return &linfo->rmap_pde; +} + +static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +{ + struct kvm_mmu_page *sp; + unsigned long *rmapp; + + if (!is_rmap_spte(*spte)) + return 0; + + sp = page_header(__pa(spte)); + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + return pte_list_add(vcpu, spte, rmapp); +} + +static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) +{ + return pte_list_next(rmapp, spte); +} + +static void rmap_remove(struct kvm *kvm, u64 *spte) +{ + struct kvm_mmu_page *sp; + gfn_t gfn; + unsigned long *rmapp; + + sp = page_header(__pa(spte)); + gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); + pte_list_remove(spte, rmapp); +} + static int set_spte_track_bits(u64 *sptep, u64 new_spte) { pfn_t pfn; @@ -752,32 +796,6 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) rmap_remove(kvm, sptep); } -static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) -{ - struct kvm_rmap_desc *desc; - u64 *prev_spte; - int i; - - if (!*rmapp) - return NULL; - else if (!(*rmapp & 1)) { - if (!spte) - return (u64 *)*rmapp; - return NULL; - } - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - prev_spte = NULL; - while (desc) { - for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { - if (prev_spte == spte) - return desc->sptes[i]; - prev_spte = desc->sptes[i]; - } - desc = desc->more; - } - return NULL; -} - static int rmap_write_protect(struct kvm *kvm, u64 gfn) { unsigned long *rmapp; @@ -3601,8 +3619,8 @@ static void mmu_destroy_caches(void) { if (pte_chain_cache) kmem_cache_destroy(pte_chain_cache); - if (rmap_desc_cache) - kmem_cache_destroy(rmap_desc_cache); + if (pte_list_desc_cache) + kmem_cache_destroy(pte_list_desc_cache); if (mmu_page_header_cache) kmem_cache_destroy(mmu_page_header_cache); } @@ -3614,10 +3632,10 @@ int kvm_mmu_module_init(void) 0, 0, NULL); if (!pte_chain_cache) goto nomem; - rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", - sizeof(struct kvm_rmap_desc), + pte_list_desc_cache = kmem_cache_create("pte_list_desc", + sizeof(struct pte_list_desc), 0, 0, NULL); - if (!rmap_desc_cache) + if (!pte_list_desc_cache) goto nomem; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", -- cgit v1.1 From 67052b3508f09956427d6476fd35e8fddde6c618 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 15 May 2011 23:27:08 +0800 Subject: KVM: MMU: remove the arithmetic of parent pte rmap Parent pte rmap and page rmap are very similar, so use the same arithmetic for them Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 7 +- arch/x86/kvm/mmu.c | 189 ++++++++++------------------------------ 2 files changed, 46 insertions(+), 150 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e6a4a57..ff17deb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -227,14 +227,10 @@ struct kvm_mmu_page { * in this shadow page. */ DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); - bool multimapped; /* More than one parent_pte? */ bool unsync; int root_count; /* Currently serving as active root */ unsigned int unsync_children; - union { - u64 *parent_pte; /* !multimapped */ - struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ - }; + unsigned long parent_ptes; /* Reverse mapping for parent_pte */ DECLARE_BITMAP(unsync_child_bitmap, 512); }; @@ -346,7 +342,6 @@ struct kvm_vcpu_arch { * put it here to avoid allocation */ struct kvm_pv_mmu_op_buffer mmu_op_buffer; - struct kvm_mmu_memory_cache mmu_pte_chain_cache; struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_page_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a6811cb..9eaca1c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -182,9 +182,6 @@ struct kvm_shadow_walk_iterator { shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) -typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); - -static struct kmem_cache *pte_chain_cache; static struct kmem_cache *pte_list_desc_cache; static struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; @@ -397,12 +394,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) { int r; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, - pte_chain_cache, 4); - if (r) - goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache, 4 + PTE_PREFETCH_NUM); + pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); @@ -416,8 +409,6 @@ out: static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, - pte_chain_cache); mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, pte_list_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); @@ -435,17 +426,6 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, return p; } -static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) -{ - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, - sizeof(struct kvm_pte_chain)); -} - -static void mmu_free_pte_chain(struct kvm_pte_chain *pc) -{ - kmem_cache_free(pte_chain_cache, pc); -} - static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, @@ -721,6 +701,26 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list) } } +typedef void (*pte_list_walk_fn) (u64 *spte); +static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) +{ + struct pte_list_desc *desc; + int i; + + if (!*pte_list) + return; + + if (!(*pte_list & 1)) + return fn((u64 *)*pte_list); + + desc = (struct pte_list_desc *)(*pte_list & ~1ul); + while (desc) { + for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) + fn(desc->sptes[i]); + desc = desc->more; + } +} + /* * Take gfn and return the reverse mapping to it. */ @@ -1069,134 +1069,52 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte, int direct) -{ - struct kvm_mmu_page *sp; - - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, - PAGE_SIZE); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); - sp->multimapped = 0; - sp->parent_pte = parent_pte; - kvm_mod_used_mmu_pages(vcpu->kvm, +1); - return sp; -} - static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *parent_pte) { - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; - if (!parent_pte) return; - if (!sp->multimapped) { - u64 *old = sp->parent_pte; - if (!old) { - sp->parent_pte = parent_pte; - return; - } - sp->multimapped = 1; - pte_chain = mmu_alloc_pte_chain(vcpu); - INIT_HLIST_HEAD(&sp->parent_ptes); - hlist_add_head(&pte_chain->link, &sp->parent_ptes); - pte_chain->parent_ptes[0] = old; - } - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { - if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) - continue; - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) - if (!pte_chain->parent_ptes[i]) { - pte_chain->parent_ptes[i] = parent_pte; - return; - } - } - pte_chain = mmu_alloc_pte_chain(vcpu); - BUG_ON(!pte_chain); - hlist_add_head(&pte_chain->link, &sp->parent_ptes); - pte_chain->parent_ptes[0] = parent_pte; + pte_list_add(vcpu, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; - - if (!sp->multimapped) { - BUG_ON(sp->parent_pte != parent_pte); - sp->parent_pte = NULL; - return; - } - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) - break; - if (pte_chain->parent_ptes[i] != parent_pte) - continue; - while (i + 1 < NR_PTE_CHAIN_ENTRIES - && pte_chain->parent_ptes[i + 1]) { - pte_chain->parent_ptes[i] - = pte_chain->parent_ptes[i + 1]; - ++i; - } - pte_chain->parent_ptes[i] = NULL; - if (i == 0) { - hlist_del(&pte_chain->link); - mmu_free_pte_chain(pte_chain); - if (hlist_empty(&sp->parent_ptes)) { - sp->multimapped = 0; - sp->parent_pte = NULL; - } - } - return; - } - BUG(); + pte_list_remove(parent_pte, &sp->parent_ptes); } -static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, + u64 *parent_pte, int direct) { - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - struct kvm_mmu_page *parent_sp; - int i; - - if (!sp->multimapped && sp->parent_pte) { - parent_sp = page_header(__pa(sp->parent_pte)); - fn(parent_sp, sp->parent_pte); - return; - } - - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - u64 *spte = pte_chain->parent_ptes[i]; - - if (!spte) - break; - parent_sp = page_header(__pa(spte)); - fn(parent_sp, spte); - } + struct kvm_mmu_page *sp; + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, + sizeof *sp); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + if (!direct) + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, + PAGE_SIZE); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); + bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); + sp->parent_ptes = 0; + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + kvm_mod_used_mmu_pages(vcpu->kvm, +1); + return sp; } -static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); +static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { - mmu_parent_walk(sp, mark_unsync); + pte_list_walk(&sp->parent_ptes, mark_unsync); } -static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) +static void mark_unsync(u64 *spte) { + struct kvm_mmu_page *sp; unsigned int index; + sp = page_header(__pa(spte)); index = spte - sp->spt; if (__test_and_set_bit(index, sp->unsync_child_bitmap)) return; @@ -1694,17 +1612,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *parent_pte; - while (sp->multimapped || sp->parent_pte) { - if (!sp->multimapped) - parent_pte = sp->parent_pte; - else { - struct kvm_pte_chain *chain; - - chain = container_of(sp->parent_ptes.first, - struct kvm_pte_chain, link); - parent_pte = chain->parent_ptes[0]; - } - BUG_ON(!parent_pte); + while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) { kvm_mmu_put_page(sp, parent_pte); __set_spte(parent_pte, shadow_trap_nonpresent_pte); } @@ -3617,8 +3525,6 @@ static struct shrinker mmu_shrinker = { static void mmu_destroy_caches(void) { - if (pte_chain_cache) - kmem_cache_destroy(pte_chain_cache); if (pte_list_desc_cache) kmem_cache_destroy(pte_list_desc_cache); if (mmu_page_header_cache) @@ -3627,11 +3533,6 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { - pte_chain_cache = kmem_cache_create("kvm_pte_chain", - sizeof(struct kvm_pte_chain), - 0, 0, NULL); - if (!pte_chain_cache) - goto nomem; pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, 0, NULL); -- cgit v1.1 From 38e3b2b28c5f8fe7914172f4ba631ef4552824d6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 15 May 2011 23:27:52 +0800 Subject: KVM: MMU: cleanup for kvm_mmu_page_unlink_children Cleanup the same operation between kvm_mmu_page_unlink_children and mmu_pte_write_zap_pte Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 66 +++++++++++++++++++----------------------------------- 1 file changed, 23 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9eaca1c..71eddc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1566,32 +1566,33 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } +static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *spte) +{ + u64 pte; + struct kvm_mmu_page *child; + + pte = *spte; + if (is_shadow_present_pte(pte)) { + if (is_last_spte(pte, sp->role.level)) + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); + else { + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, spte); + } + } + __set_spte(spte, shadow_trap_nonpresent_pte); + if (is_large_pte(pte)) + --kvm->stat.lpages; +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { unsigned i; - u64 *pt; - u64 ent; - - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - ent = pt[i]; - - if (is_shadow_present_pte(ent)) { - if (!is_last_spte(ent, sp->role.level)) { - ent &= PT64_BASE_ADDR_MASK; - mmu_page_remove_parent_pte(page_header(ent), - &pt[i]); - } else { - if (is_large_pte(ent)) - --kvm->stat.lpages; - drop_spte(kvm, &pt[i], - shadow_trap_nonpresent_pte); - } - } - pt[i] = shadow_trap_nonpresent_pte; - } + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + mmu_page_zap_pte(kvm, sp, sp->spt + i); } static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) @@ -3069,27 +3070,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mmu_unload); -static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *spte) -{ - u64 pte; - struct kvm_mmu_page *child; - - pte = *spte; - if (is_shadow_present_pte(pte)) { - if (is_last_spte(pte, sp->role.level)) - drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); - else { - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, spte); - } - } - __set_spte(spte, shadow_trap_nonpresent_pte); - if (is_large_pte(pte)) - --vcpu->kvm->stat.lpages; -} - static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *new) @@ -3271,7 +3251,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, spte = &sp->spt[page_offset / sizeof(*spte)]; while (npte--) { entry = *spte; - mmu_pte_write_zap_pte(vcpu, sp, spte); + mmu_page_zap_pte(vcpu->kvm, sp, spte); if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) & mask.word)) -- cgit v1.1 From bcdd9a93c57571652c887cb522176db741893581 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 15 May 2011 23:28:29 +0800 Subject: KVM: MMU: cleanup for dropping parent pte Introduce drop_parent_pte to remove the rmap of parent pte and clear parent pte Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 71eddc4..15afa1e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1084,6 +1084,13 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, pte_list_remove(parent_pte, &sp->parent_ptes); } +static void drop_parent_pte(struct kvm_mmu_page *sp, + u64 *parent_pte) +{ + mmu_page_remove_parent_pte(sp, parent_pte); + __set_spte(parent_pte, shadow_trap_nonpresent_pte); +} + static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte, int direct) { @@ -1560,8 +1567,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (child->role.access == direct_access) return; - mmu_page_remove_parent_pte(child, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); + drop_parent_pte(child, sptep); kvm_flush_remote_tlbs(vcpu->kvm); } } @@ -1578,7 +1584,7 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, drop_spte(kvm, spte, shadow_trap_nonpresent_pte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, spte); + drop_parent_pte(child, spte); } } __set_spte(spte, shadow_trap_nonpresent_pte); @@ -1613,10 +1619,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *parent_pte; - while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) { - kvm_mmu_put_page(sp, parent_pte); - __set_spte(parent_pte, shadow_trap_nonpresent_pte); - } + while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) + drop_parent_pte(sp, parent_pte); } static int mmu_zap_unsync_children(struct kvm *kvm, @@ -2046,8 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 pte = *sptep; child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); + drop_parent_pte(child, sptep); kvm_flush_remote_tlbs(vcpu->kvm); } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %llx new %llx\n", -- cgit v1.1 From 24c82e576b7860a4f02a21103e9df39e11e97006 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 May 2011 05:56:07 -0400 Subject: KVM: Sanitize cpuid Instead of blacklisting known-unsupported cpuid leaves, whitelist known- supported leaves. This is more conservative and prevents us from reporting features we don't support. Also whitelist a few more leaves while at it. Signed-off-by: Avi Kivity Acked-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b08986..da48622 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2283,6 +2283,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } +static bool supported_xcr0_bit(unsigned bit) +{ + u64 mask = ((u64)1 << bit); + + return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; +} + #define F(x) bit(X86_FEATURE_##x) static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, @@ -2393,6 +2400,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 9: + break; case 0xb: { int i, level_type; @@ -2414,7 +2423,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (i = 1; *nent < maxnent && i < 64; ++i) { - if (entry[i].eax == 0) + if (entry[i].eax == 0 || !supported_xcr0_bit(i)) continue; do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= @@ -2451,6 +2460,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; cpuid_mask(&entry->ecx, 6); break; + case 0x80000008: { + unsigned g_phys_as = (entry->eax >> 16) & 0xff; + unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); + unsigned phys_as = entry->eax & 0xff; + + if (!g_phys_as) + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); + entry->ebx = entry->edx = 0; + break; + } + case 0x80000019: + entry->ecx = entry->edx = 0; + break; + case 0x8000001a: + break; + case 0x8000001d: + break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ @@ -2460,10 +2487,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx &= kvm_supported_word5_x86_features; cpuid_mask(&entry->edx, 5); break; + case 3: /* Processor serial number */ + case 5: /* MONITOR/MWAIT */ + case 6: /* Thermal management */ + case 0xA: /* Architectural Performance Monitoring */ + case 0x80000007: /* Advanced power management */ case 0xC0000002: case 0xC0000003: case 0xC0000004: - /*Now nothing to do, reserved for the future*/ + default: + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } -- cgit v1.1 From d462b8192368f10e979250377930f9695a4039d0 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Tue, 24 May 2011 15:26:10 +0300 Subject: KVM: VMX: Keep list of loaded VMCSs, instead of vcpus In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it because (at least in theory) the processor might not have written all of its content back to memory. Since a patch from June 26, 2008, this is done using a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU. The problem is that with nested VMX, we no longer have the concept of a vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for L2s), and each of those may be have been last loaded on a different cpu. So instead of linking the vcpus, we link the VMCSs, using a new structure loaded_vmcs. This structure contains the VMCS, and the information pertaining to its loading on a specific cpu (namely, the cpu number, and whether it was already launched on this cpu once). In nested we will also use the same structure to hold L2 VMCSs, and vmx->loaded_vmcs is a pointer to the currently active VMCS. Signed-off-by: Nadav Har'El Acked-by: Acked-by: Kevin Tian Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 150 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 86 insertions(+), 64 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3365e5d..1444e41 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -118,6 +118,18 @@ struct vmcs { char data[0]; }; +/* + * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also + * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs + * loaded on this CPU (so we can clear them if the CPU goes down). + */ +struct loaded_vmcs { + struct vmcs *vmcs; + int cpu; + int launched; + struct list_head loaded_vmcss_on_cpu_link; +}; + struct shared_msr_entry { unsigned index; u64 data; @@ -126,9 +138,7 @@ struct shared_msr_entry { struct vcpu_vmx { struct kvm_vcpu vcpu; - struct list_head local_vcpus_link; unsigned long host_rsp; - int launched; u8 fail; u8 cpl; bool nmi_known_unmasked; @@ -142,7 +152,14 @@ struct vcpu_vmx { u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif - struct vmcs *vmcs; + /* + * loaded_vmcs points to the VMCS currently used in this vcpu. For a + * non-nested (L1) guest, it always points to vmcs01. For a nested + * guest (L2), it points to a different VMCS. + */ + struct loaded_vmcs vmcs01; + struct loaded_vmcs *loaded_vmcs; + bool __launched; /* temporary, used in vmx_vcpu_run */ struct msr_autoload { unsigned nr; struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; @@ -202,7 +219,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); -static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); +/* + * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed + * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. + */ +static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static unsigned long *vmx_io_bitmap_a; @@ -503,6 +524,13 @@ static void vmcs_clear(struct vmcs *vmcs) vmcs, phys_addr); } +static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) +{ + vmcs_clear(loaded_vmcs->vmcs); + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; +} + static void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); @@ -516,25 +544,24 @@ static void vmcs_load(struct vmcs *vmcs) vmcs, phys_addr); } -static void __vcpu_clear(void *arg) +static void __loaded_vmcs_clear(void *arg) { - struct vcpu_vmx *vmx = arg; + struct loaded_vmcs *loaded_vmcs = arg; int cpu = raw_smp_processor_id(); - if (vmx->vcpu.cpu == cpu) - vmcs_clear(vmx->vmcs); - if (per_cpu(current_vmcs, cpu) == vmx->vmcs) + if (loaded_vmcs->cpu != cpu) + return; /* vcpu migration can race with cpu offline */ + if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; - list_del(&vmx->local_vcpus_link); - vmx->vcpu.cpu = -1; - vmx->launched = 0; + list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); + loaded_vmcs_init(loaded_vmcs); } -static void vcpu_clear(struct vcpu_vmx *vmx) +static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { - if (vmx->vcpu.cpu == -1) - return; - smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); + if (loaded_vmcs->cpu != -1) + smp_call_function_single( + loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); } static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) @@ -973,22 +1000,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (!vmm_exclusive) kvm_cpu_vmxon(phys_addr); - else if (vcpu->cpu != cpu) - vcpu_clear(vmx); + else if (vmx->loaded_vmcs->cpu != cpu) + loaded_vmcs_clear(vmx->loaded_vmcs); - if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->vmcs; - vmcs_load(vmx->vmcs); + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); } - if (vcpu->cpu != cpu) { + if (vmx->loaded_vmcs->cpu != cpu) { struct desc_ptr *gdt = &__get_cpu_var(host_gdt); unsigned long sysenter_esp; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); - list_add(&vmx->local_vcpus_link, - &per_cpu(vcpus_on_cpu, cpu)); + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, + &per_cpu(loaded_vmcss_on_cpu, cpu)); local_irq_enable(); /* @@ -1000,6 +1027,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + vmx->loaded_vmcs->cpu = cpu; } } @@ -1007,7 +1035,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { __vmx_load_host_state(to_vmx(vcpu)); if (!vmm_exclusive) { - __vcpu_clear(to_vmx(vcpu)); + __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); + vcpu->cpu = -1; kvm_cpu_vmxoff(); } } @@ -1471,7 +1500,7 @@ static int hardware_enable(void *garbage) if (read_cr4() & X86_CR4_VMXE) return -EBUSY; - INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); test_bits = FEATURE_CONTROL_LOCKED; @@ -1495,14 +1524,14 @@ static int hardware_enable(void *garbage) return 0; } -static void vmclear_local_vcpus(void) +static void vmclear_local_loaded_vmcss(void) { int cpu = raw_smp_processor_id(); - struct vcpu_vmx *vmx, *n; + struct loaded_vmcs *v, *n; - list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), - local_vcpus_link) - __vcpu_clear(vmx); + list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + __loaded_vmcs_clear(v); } @@ -1517,7 +1546,7 @@ static void kvm_cpu_vmxoff(void) static void hardware_disable(void *garbage) { if (vmm_exclusive) { - vmclear_local_vcpus(); + vmclear_local_loaded_vmcss(); kvm_cpu_vmxoff(); } write_cr4(read_cr4() & ~X86_CR4_VMXE); @@ -1698,6 +1727,18 @@ static void free_vmcs(struct vmcs *vmcs) free_pages((unsigned long)vmcs, vmcs_config.order); } +/* + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded + */ +static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) +{ + if (!loaded_vmcs->vmcs) + return; + loaded_vmcs_clear(loaded_vmcs); + free_vmcs(loaded_vmcs->vmcs); + loaded_vmcs->vmcs = NULL; +} + static void free_kvm_area(void) { int cpu; @@ -4169,6 +4210,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -4239,7 +4281,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, launched)), + [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), @@ -4279,7 +4321,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - vmx->launched = 1; + vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); @@ -4291,41 +4333,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #undef R #undef Q -static void vmx_free_vmcs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->vmcs) { - vcpu_clear(vmx); - free_vmcs(vmx->vmcs); - vmx->vmcs = NULL; - } -} - static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); free_vpid(vmx); - vmx_free_vmcs(vcpu); + free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); } -static inline void vmcs_init(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); - - if (!vmm_exclusive) - kvm_cpu_vmxon(phys_addr); - - vmcs_clear(vmcs); - - if (!vmm_exclusive) - kvm_cpu_vmxoff(); -} - static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; @@ -4347,11 +4365,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit_vcpu; } - vmx->vmcs = alloc_vmcs(); - if (!vmx->vmcs) + vmx->loaded_vmcs = &vmx->vmcs01; + vmx->loaded_vmcs->vmcs = alloc_vmcs(); + if (!vmx->loaded_vmcs->vmcs) goto free_msrs; - - vmcs_init(vmx->vmcs); + if (!vmm_exclusive) + kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); + loaded_vmcs_init(vmx->loaded_vmcs); + if (!vmm_exclusive) + kvm_cpu_vmxoff(); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); @@ -4380,7 +4402,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: - free_vmcs(vmx->vmcs); + free_vmcs(vmx->loaded_vmcs->vmcs); free_msrs: kfree(vmx->guest_msrs); uninit_vcpu: -- cgit v1.1 From d780592b99d7d8a5ff905f6bacca519d4a342c76 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 23 May 2011 10:33:05 +0200 Subject: KVM: Clean up error handling during VCPU creation So far kvm_arch_vcpu_setup is responsible for freeing the vcpu struct if it fails. Move this confusing resonsibility back into the hands of kvm_vm_ioctl_create_vcpu. Only kvm_arch_vcpu_setup of x86 is affected, all other archs cannot fail. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index da48622..aaa3735 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6126,12 +6126,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (r == 0) r = kvm_mmu_setup(vcpu); vcpu_put(vcpu); - if (r < 0) - goto free_vcpu; - return 0; -free_vcpu: - kvm_x86_ops->vcpu_free(vcpu); return r; } -- cgit v1.1 From adf52235b4082e67f31bf1fba36f1dce312633d6 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 25 May 2011 11:06:16 +0900 Subject: KVM: x86 emulator: Clean up init_emulate_ctxt() Use a local pointer to the emulate_ctxt for simplicity. Then, arrange the hard-to-read mode selection lines neatly. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aaa3735..ae2353c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4508,7 +4508,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; int cs_db, cs_l; /* @@ -4521,15 +4522,15 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); + ctxt->eflags = kvm_get_rflags(vcpu); + ctxt->eip = kvm_rip_read(vcpu); + ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : + cs_l ? X86EMUL_MODE_PROT64 : + cs_db ? X86EMUL_MODE_PROT32 : + X86EMUL_MODE_PROT16; + ctxt->guest_mode = is_guest_mode(vcpu); + memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; -- cgit v1.1 From b5c9ff731f3cee5a2f2d7154f48f8006b48eb66d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 25 May 2011 11:09:38 +0900 Subject: KVM: x86 emulator: Avoid clearing the whole decode_cache During tracing the emulator, we noticed that init_emulate_ctxt() sometimes took a bit longer time than we expected. This patch is for mitigating the problem by some degree. By looking into the function, we soon notice that it clears the whole decode_cache whose size is about 2.5K bytes now. Furthermore, most of the bytes are taken for the two read_cache arrays, which are used only by a few instructions. Considering the fact that we are not assuming the cache arrays have been cleared when we store actual data, we do not need to clear the arrays: 2K bytes elimination. In addition, we can avoid clearing the fetch_cache and regs arrays. This patch changes the initialization not to clear the arrays. On our 64-bit host, init_emulate_ctxt() becomes 0.3 to 0.5us faster with this patch applied. Signed-off-by: Takuya Yoshikawa Cc: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 5 +++-- arch/x86/kvm/x86.c | 17 +++++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index ab09ba2..c0f77e0 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -246,8 +246,6 @@ struct decode_cache { unsigned int d; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); - unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; /* modrm */ u8 modrm; u8 modrm_mod; @@ -255,6 +253,9 @@ struct decode_cache { u8 modrm_rm; u8 modrm_seg; bool rip_relative; + unsigned long eip; + /* Fields above regs are cleared together. */ + unsigned long regs[NR_VCPU_REGS]; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ae2353c..d88de56 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4506,6 +4506,20 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } +static void init_decode_cache(struct decode_cache *c, + const unsigned long *regs) +{ + memset(c, 0, offsetof(struct decode_cache, regs)); + memcpy(c->regs, regs, sizeof(c->regs)); + + c->fetch.start = 0; + c->fetch.end = 0; + c->io_read.pos = 0; + c->io_read.end = 0; + c->mem_read.pos = 0; + c->mem_read.end = 0; +} + static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; @@ -4531,8 +4545,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) X86EMUL_MODE_PROT16; ctxt->guest_mode = is_guest_mode(vcpu); - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + init_decode_cache(c, vcpu->arch.regs); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -- cgit v1.1 From 801d342432190947928e18f893f073fd87cd8bdf Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:02:23 +0300 Subject: KVM: nVMX: Add "nested" module option to kvm_intel This patch adds to kvm_intel a module option "nested". This option controls whether the guest can use VMX instructions, i.e., whether we allow nested virtualization. A similar, but separate, option already exists for the SVM module. This option currently defaults to 0, meaning that nested VMX must be explicitly enabled by giving nested=1. When nested VMX matures, the default should probably be changed to enable nested VMX by default - just like nested SVM is currently enabled by default. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1444e41..263be2d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -74,6 +74,14 @@ module_param(vmm_exclusive, bool, S_IRUGO); static int __read_mostly yield_on_hlt = 1; module_param(yield_on_hlt, bool, S_IRUGO); +/* + * If nested=1, nested virtualization is supported, i.e., guests may use + * VMX and be a hypervisor for its own guests. If nested=0, guests may not + * use VMX instructions. + */ +static int __read_mostly nested = 0; +module_param(nested, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) #define KVM_GUEST_CR0_MASK \ @@ -1292,6 +1300,23 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - native_read_tsc(); } +static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); +} + +/* + * nested_vmx_allowed() checks whether a guest should be allowed to use VMX + * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for + * all guests if the "nested" module option is off, and can also be disabled + * for a single guest by disabling its VMX cpuid bit. + */ +static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) +{ + return nested && guest_cpuid_has_vmx(vcpu); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. -- cgit v1.1 From ec378aeef9dfc7c4ba72e9bd6cd4bd6f7d5fd0cc Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:02:54 +0300 Subject: KVM: nVMX: Implement VMXON and VMXOFF This patch allows a guest to use the VMXON and VMXOFF instructions, and emulates them accordingly. Basically this amounts to checking some prerequisites, and then remembering whether the guest has enabled or disabled VMX operation. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 263be2d..3a727ca 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -144,6 +144,15 @@ struct shared_msr_entry { u64 mask; }; +/* + * The nested_vmx structure is part of vcpu_vmx, and holds information we need + * for correct emulation of VMX (i.e., nested VMX) on this vcpu. + */ +struct nested_vmx { + /* Has the level1 guest done vmxon? */ + bool vmxon; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -203,6 +212,9 @@ struct vcpu_vmx { u32 exit_reason; bool rdtscp_enabled; + + /* Support for a guest hypervisor (nested VMX) */ + struct nested_vmx nested; }; enum segment_cache_field { @@ -3934,6 +3946,99 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu) } /* + * Emulate the VMXON instruction. + * Currently, we just remember that VMX is active, and do not save or even + * inspect the argument to VMXON (the so-called "VMXON pointer") because we + * do not currently need to store anything in that guest-allocated memory + * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their + * argument is different from the VMXON pointer (which the spec says they do). + */ +static int handle_vmon(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* The Intel VMX Instruction Reference lists a bunch of bits that + * are prerequisite to running VMXON, most notably cr4.VMXE must be + * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). + * Otherwise, we should fail with #UD. We test these now: + */ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || + !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || + (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + if (is_long_mode(vcpu) && !cs.l) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + vmx->nested.vmxon = true; + + skip_emulated_instruction(vcpu); + return 1; +} + +/* + * Intel's VMX Instruction Reference specifies a common set of prerequisites + * for running VMX instructions (except VMXON, whose prerequisites are + * slightly different). It also specifies what exception to inject otherwise. + */ +static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->nested.vmxon) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || + (is_long_mode(vcpu) && !cs.l)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); + return 0; + } + + return 1; +} + +/* + * Free whatever needs to be freed from vmx->nested when L1 goes down, or + * just stops using VMX. + */ +static void free_nested(struct vcpu_vmx *vmx) +{ + if (!vmx->nested.vmxon) + return; + vmx->nested.vmxon = false; +} + +/* Emulate the VMXOFF instruction */ +static int handle_vmoff(struct kvm_vcpu *vcpu) +{ + if (!nested_vmx_check_permission(vcpu)) + return 1; + free_nested(to_vmx(vcpu)); + skip_emulated_instruction(vcpu); + return 1; +} + +/* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. @@ -3961,8 +4066,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMREAD] = handle_vmx_insn, [EXIT_REASON_VMRESUME] = handle_vmx_insn, [EXIT_REASON_VMWRITE] = handle_vmx_insn, - [EXIT_REASON_VMOFF] = handle_vmx_insn, - [EXIT_REASON_VMON] = handle_vmx_insn, + [EXIT_REASON_VMOFF] = handle_vmoff, + [EXIT_REASON_VMON] = handle_vmon, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, @@ -4363,6 +4468,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); free_vpid(vmx); + free_nested(vmx); free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); -- cgit v1.1 From 5e1746d6205d1efa3193cc0c67aa2d15e54799bd Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:03:24 +0300 Subject: KVM: nVMX: Allow setting the VMXE bit in CR4 This patch allows the guest to enable the VMXE bit in CR4, which is a prerequisite to running VMXON. Whether to allow setting the VMXE bit now depends on the architecture (svm or vmx), so its checking has moved to kvm_x86_ops->set_cr4(). This function now returns an int: If kvm_x86_ops->set_cr4() returns 1, __kvm_set_cr4() will also return 1, and this will cause kvm_set_cr4() will throw a #GP. Turning on the VMXE bit is allowed only when the nested VMX feature is enabled, and turning it off is forbidden after a vmxon. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 6 +++++- arch/x86/kvm/vmx.c | 17 +++++++++++++++-- arch/x86/kvm/x86.c | 4 +--- 4 files changed, 22 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ff17deb..d167039 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -555,7 +555,7 @@ struct kvm_x86_ops { void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); + int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 506e4fe..475d1c9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1496,11 +1496,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) update_cr0_intercept(svm); } -static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; + if (cr4 & X86_CR4_VMXE) + return 1; + if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) svm_flush_tlb(vcpu); @@ -1510,6 +1513,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); + return 0; } static void svm_set_segment(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3a727ca..eda2cb6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2121,7 +2121,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_dirty); } -static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, unsigned long cr0, @@ -2219,11 +2219,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmcs_writel(GUEST_CR3, guest_cr3); } -static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + if (cr4 & X86_CR4_VMXE) { + /* + * To use VMXON (and later other VMX instructions), a guest + * must first be able to turn on cr4.VMXE (see handle_vmon()). + * So basically the check on whether to allow nested VMX + * is here. + */ + if (!nested_vmx_allowed(vcpu)) + return 1; + } else if (to_vmx(vcpu)->nested.vmxon) + return 1; + vcpu->arch.cr4 = cr4; if (enable_ept) { if (!is_paging(vcpu)) { @@ -2236,6 +2248,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); + return 0; } static void vmx_get_segment(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d88de56..460932b6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -615,11 +615,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_read_cr3(vcpu))) return 1; - if (cr4 & X86_CR4_VMXE) + if (kvm_x86_ops->set_cr4(vcpu, cr4)) return 1; - kvm_x86_ops->set_cr4(vcpu, cr4); - if ((cr4 ^ old_cr4) & pdptr_bits) kvm_mmu_reset_context(vcpu); -- cgit v1.1 From a9d30f33dd21b67b2f4db09f3dfe63a7c390d1b3 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:03:55 +0300 Subject: KVM: nVMX: Introduce vmcs12: a VMCS structure for L1 An implementation of VMX needs to define a VMCS structure. This structure is kept in guest memory, but is opaque to the guest (who can only read or write it with VMX instructions). This patch starts to define the VMCS structure which our nested VMX implementation will present to L1. We call it "vmcs12", as it is the VMCS that L1 keeps for its L2 guest. We will add more content to this structure in later patches. This patch also adds the notion (as required by the VMX spec) of L1's "current VMCS", and finally includes utility functions for mapping the guest-allocated VMCSs in host memory. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index eda2cb6..914dc4e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -145,12 +145,53 @@ struct shared_msr_entry { }; /* + * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a + * single nested guest (L2), hence the name vmcs12. Any VMX implementation has + * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is + * stored in guest memory specified by VMPTRLD, but is opaque to the guest, + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. + * More than one of these structures may exist, if L1 runs multiple L2 guests. + * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the + * underlying hardware which will be used to run L2. + * This structure is packed to ensure that its layout is identical across + * machines (necessary for live migration). + * If there are changes in this struct, VMCS12_REVISION must be changed. + */ +struct __packed vmcs12 { + /* According to the Intel spec, a VMCS region must start with the + * following two fields. Then follow implementation-specific data. + */ + u32 revision_id; + u32 abort; +}; + +/* + * VMCS12_REVISION is an arbitrary id that should be changed if the content or + * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and + * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. + */ +#define VMCS12_REVISION 0x11e57ed0 + +/* + * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region + * and any VMCS region. Although only sizeof(struct vmcs12) are used by the + * current implementation, 4K are reserved to avoid future complications. + */ +#define VMCS12_SIZE 0x1000 + +/* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. */ struct nested_vmx { /* Has the level1 guest done vmxon? */ bool vmxon; + + /* The guest-physical address of the current VMCS L1 keeps for L2 */ + gpa_t current_vmptr; + /* The host-usable pointer to the above */ + struct page *current_vmcs12_page; + struct vmcs12 *current_vmcs12; }; struct vcpu_vmx { @@ -231,6 +272,31 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } +static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.current_vmcs12; +} + +static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) +{ + struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); + if (is_error_page(page)) { + kvm_release_page_clean(page); + return NULL; + } + return page; +} + +static void nested_release_page(struct page *page) +{ + kvm_release_page_dirty(page); +} + +static void nested_release_page_clean(struct page *page) +{ + kvm_release_page_clean(page); +} + static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); @@ -4039,6 +4105,12 @@ static void free_nested(struct vcpu_vmx *vmx) if (!vmx->nested.vmxon) return; vmx->nested.vmxon = false; + if (vmx->nested.current_vmptr != -1ull) { + kunmap(vmx->nested.current_vmcs12_page); + nested_release_page(vmx->nested.current_vmcs12_page); + vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmcs12 = NULL; + } } /* Emulate the VMXOFF instruction */ @@ -4543,6 +4615,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } + vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmcs12 = NULL; + return &vmx->vcpu; free_vmcs: -- cgit v1.1 From b87a51ae2893a5907f796eadb4beb60747a69209 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:04:25 +0300 Subject: KVM: nVMX: Implement reading and writing of VMX MSRs When the guest can use VMX instructions (when the "nested" module option is on), it should also be able to read and write VMX MSRs, e.g., to query about VMX capabilities. This patch adds this support. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/msr-index.h | 12 +++ arch/x86/kvm/vmx.c | 219 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 485b4f1..e3022cc 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -438,6 +438,18 @@ #define MSR_IA32_VMX_VMCS_ENUM 0x0000048a #define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b #define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c +#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d +#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e +#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f +#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 + +/* VMX_BASIC bits and bitmasks */ +#define VMX_BASIC_VMCS_SIZE_SHIFT 32 +#define VMX_BASIC_64 0x0001000000000000LLU +#define VMX_BASIC_MEM_TYPE_SHIFT 50 +#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU +#define VMX_BASIC_MEM_TYPE_WB 6LLU +#define VMX_BASIC_INOUT 0x0040000000000000LLU /* AMD-V MSRs */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 914dc4e..487952b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1396,6 +1396,218 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) } /* + * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be + * returned for the various VMX controls MSRs when nested VMX is enabled. + * The same values should also be used to verify that vmcs12 control fields are + * valid during nested entry from L1 to L2. + * Each of these control msrs has a low and high 32-bit half: A low bit is on + * if the corresponding bit in the (32-bit) control field *must* be on, and a + * bit in the high half is on if the corresponding bit in the control field + * may be on. See also vmx_control_verify(). + * TODO: allow these variables to be modified (downgraded) by module options + * or other means. + */ +static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; +static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; +static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; +static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; +static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; +static __init void nested_vmx_setup_ctls_msrs(void) +{ + /* + * Note that as a general rule, the high half of the MSRs (bits in + * the control fields which may be 1) should be initialized by the + * intersection of the underlying hardware's MSR (i.e., features which + * can be supported) and the list of features we want to expose - + * because they are known to be properly supported in our code. + * Also, usually, the low half of the MSRs (bits which must be 1) can + * be set to 0, meaning that L1 may turn off any of these bits. The + * reason is that if one of these bits is necessary, it will appear + * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control + * fields of vmcs01 and vmcs02, will turn these bits off - and + * nested_vmx_exit_handled() will not pass related exits to L1. + * These rules have exceptions below. + */ + + /* pin-based controls */ + /* + * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is + * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. + */ + nested_vmx_pinbased_ctls_low = 0x16 ; + nested_vmx_pinbased_ctls_high = 0x16 | + PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + + /* exit controls */ + nested_vmx_exit_ctls_low = 0; +#ifdef CONFIG_X86_64 + nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; +#else + nested_vmx_exit_ctls_high = 0; +#endif + + /* entry controls */ + rdmsr(MSR_IA32_VMX_ENTRY_CTLS, + nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); + nested_vmx_entry_ctls_low = 0; + nested_vmx_entry_ctls_high &= + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; + + /* cpu-based controls */ + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, + nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); + nested_vmx_procbased_ctls_low = 0; + nested_vmx_procbased_ctls_high &= + CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | + CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | +#ifdef CONFIG_X86_64 + CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | +#endif + CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | + CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + /* + * We can allow some features even when not supported by the + * hardware. For example, L1 can specify an MSR bitmap - and we + * can use it to avoid exits to L1 - even when L0 runs L2 + * without MSR bitmaps. + */ + nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS; + + /* secondary cpu-based controls */ + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); + nested_vmx_secondary_ctls_low = 0; + nested_vmx_secondary_ctls_high &= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +} + +static inline bool vmx_control_verify(u32 control, u32 low, u32 high) +{ + /* + * Bits 0 in high must be 0, and bits 1 in low must be 1. + */ + return ((control & high) | low) == control; +} + +static inline u64 vmx_control_msr(u32 low, u32 high) +{ + return low | ((u64)high << 32); +} + +/* + * If we allow our guest to use VMX instructions (i.e., nested VMX), we should + * also let it use VMX-specific MSRs. + * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a + * VMX-specific MSR, or 0 when we haven't (and the caller should handle it + * like all other MSRs). + */ +static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC && + msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) { + /* + * According to the spec, processors which do not support VMX + * should throw a #GP(0) when VMX capability MSRs are read. + */ + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + + switch (msr_index) { + case MSR_IA32_FEATURE_CONTROL: + *pdata = 0; + break; + case MSR_IA32_VMX_BASIC: + /* + * This MSR reports some information about VMX support. We + * should return information about the VMX we emulate for the + * guest, and the VMCS structure we give it - not about the + * VMX support of the underlying hardware. + */ + *pdata = VMCS12_REVISION | + ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | + (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + break; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_PINBASED_CTLS: + *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, + nested_vmx_pinbased_ctls_high); + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, + nested_vmx_procbased_ctls_high); + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, + nested_vmx_exit_ctls_high); + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, + nested_vmx_entry_ctls_high); + break; + case MSR_IA32_VMX_MISC: + *pdata = 0; + break; + /* + * These MSRs specify bits which the guest must keep fixed (on or off) + * while L1 is in VMXON mode (in L1's root mode, or running an L2). + * We picked the standard core2 setting. + */ +#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) +#define VMXON_CR4_ALWAYSON X86_CR4_VMXE + case MSR_IA32_VMX_CR0_FIXED0: + *pdata = VMXON_CR0_ALWAYSON; + break; + case MSR_IA32_VMX_CR0_FIXED1: + *pdata = -1ULL; + break; + case MSR_IA32_VMX_CR4_FIXED0: + *pdata = VMXON_CR4_ALWAYSON; + break; + case MSR_IA32_VMX_CR4_FIXED1: + *pdata = -1ULL; + break; + case MSR_IA32_VMX_VMCS_ENUM: + *pdata = 0x1f; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, + nested_vmx_secondary_ctls_high); + break; + case MSR_IA32_VMX_EPT_VPID_CAP: + /* Currently, no nested ept or nested vpid */ + *pdata = 0; + break; + default: + return 0; + } + + return 1; +} + +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + if (!nested_vmx_allowed(vcpu)) + return 0; + + if (msr_index == MSR_IA32_FEATURE_CONTROL) + /* TODO: the right thing. */ + return 1; + /* + * No need to treat VMX capability MSRs specially: If we don't handle + * them, handle_wrmsr will #GP(0), which is correct (they are readonly) + */ + return 0; +} + +/* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. @@ -1443,6 +1655,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) /* Otherwise falls through */ default: vmx_load_host_state(to_vmx(vcpu)); + if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) + return 0; msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { vmx_load_host_state(to_vmx(vcpu)); @@ -1514,6 +1728,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) return 1; /* Otherwise falls through */ default: + if (vmx_set_vmx_msr(vcpu, msr_index, data)) + break; msr = find_msr_entry(vmx, msr_index); if (msr) { vmx_load_host_state(vmx); @@ -1902,6 +2118,9 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; + if (nested) + nested_vmx_setup_ctls_msrs(); + return alloc_kvm_area(); } -- cgit v1.1 From 064aea774768749c6fd308b37818ea3a9600583d Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:04:56 +0300 Subject: KVM: nVMX: Decoding memory operands of VMX instructions This patch includes a utility function for decoding pointer operands of VMX instructions issued by L1 (a guest hypervisor) Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 3 ++- arch/x86/kvm/x86.h | 4 ++++ 3 files changed, 59 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 487952b..74afd86 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4343,6 +4343,59 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) } /* + * Decode the memory-address operand of a vmx instruction, as recorded on an + * exit caused by such an instruction (run by a guest hypervisor). + * On success, returns 0. When the operand is invalid, returns 1 and throws + * #UD or #GP. + */ +static int get_vmx_mem_address(struct kvm_vcpu *vcpu, + unsigned long exit_qualification, + u32 vmx_instruction_info, gva_t *ret) +{ + /* + * According to Vol. 3B, "Information for VM Exits Due to Instruction + * Execution", on an exit, vmx_instruction_info holds most of the + * addressing components of the operand. Only the displacement part + * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). + * For how an actual address is calculated from all these components, + * refer to Vol. 1, "Operand Addressing". + */ + int scaling = vmx_instruction_info & 3; + int addr_size = (vmx_instruction_info >> 7) & 7; + bool is_reg = vmx_instruction_info & (1u << 10); + int seg_reg = (vmx_instruction_info >> 15) & 7; + int index_reg = (vmx_instruction_info >> 18) & 0xf; + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); + int base_reg = (vmx_instruction_info >> 23) & 0xf; + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); + + if (is_reg) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* Addr = segment_base + offset */ + /* offset = base + [index * scale] + displacement */ + *ret = vmx_get_segment_base(vcpu, seg_reg); + if (base_is_valid) + *ret += kvm_register_read(vcpu, base_reg); + if (index_is_valid) + *ret += kvm_register_read(vcpu, index_reg)< Date: Wed, 25 May 2011 23:05:27 +0300 Subject: KVM: nVMX: Introduce vmcs02: VMCS used to run L2 We saw in a previous patch that L1 controls its L2 guest with a vcms12. L0 needs to create a real VMCS for running L2. We call that "vmcs02". A later patch will contain the code, prepare_vmcs02(), for filling the vmcs02 fields. This patch only contains code for allocating vmcs02. In this version, prepare_vmcs02() sets *all* of vmcs02's fields each time we enter from L1 to L2, so keeping just one vmcs02 for the vcpu is enough: It can be reused even when L1 runs multiple L2 guests. However, in future versions we'll probably want to add an optimization where vmcs02 fields that rarely change will not be set each time. For that, we may want to keep around several vmcs02s of L2 guests that have recently run, so that potentially we could run these L2s again more quickly because less vmwrites to vmcs02 will be needed. This patch adds to each vcpu a vmcs02 pool, vmx->nested.vmcs02_pool, which remembers the vmcs02s last used to run up to VMCS02_POOL_SIZE L2s. As explained above, in the current version we choose VMCS02_POOL_SIZE=1, I.e., one vmcs02 is allocated (and loaded onto the processor), and it is reused to enter any L2 guest. In the future, when prepare_vmcs02() is optimized not to set all fields every time, VMCS02_POOL_SIZE should be increased. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 74afd86..03b29ac 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -119,6 +119,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); #define NR_AUTOLOAD_MSRS 1 +#define VMCS02_POOL_SIZE 1 struct vmcs { u32 revision_id; @@ -179,6 +180,13 @@ struct __packed vmcs12 { */ #define VMCS12_SIZE 0x1000 +/* Used to remember the last vmcs02 used for some recently used vmcs12s */ +struct vmcs02_list { + struct list_head list; + gpa_t vmptr; + struct loaded_vmcs vmcs02; +}; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -192,6 +200,10 @@ struct nested_vmx { /* The host-usable pointer to the above */ struct page *current_vmcs12_page; struct vmcs12 *current_vmcs12; + + /* vmcs02_list cache of VMCSs recently used to run L2 guests */ + struct list_head vmcs02_pool; + int vmcs02_num; }; struct vcpu_vmx { @@ -4244,6 +4256,89 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu) } /* + * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. + * We could reuse a single VMCS for all the L2 guests, but we also want the + * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this + * allows keeping them loaded on the processor, and in the future will allow + * optimizations where prepare_vmcs02 doesn't need to set all the fields on + * every entry if they never change. + * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE + * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. + * + * The following functions allocate and free a vmcs02 in this pool. + */ + +/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ +static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) +{ + struct vmcs02_list *item; + list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) + if (item->vmptr == vmx->nested.current_vmptr) { + list_move(&item->list, &vmx->nested.vmcs02_pool); + return &item->vmcs02; + } + + if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { + /* Recycle the least recently used VMCS. */ + item = list_entry(vmx->nested.vmcs02_pool.prev, + struct vmcs02_list, list); + item->vmptr = vmx->nested.current_vmptr; + list_move(&item->list, &vmx->nested.vmcs02_pool); + return &item->vmcs02; + } + + /* Create a new VMCS */ + item = (struct vmcs02_list *) + kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); + if (!item) + return NULL; + item->vmcs02.vmcs = alloc_vmcs(); + if (!item->vmcs02.vmcs) { + kfree(item); + return NULL; + } + loaded_vmcs_init(&item->vmcs02); + item->vmptr = vmx->nested.current_vmptr; + list_add(&(item->list), &(vmx->nested.vmcs02_pool)); + vmx->nested.vmcs02_num++; + return &item->vmcs02; +} + +/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ +static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) +{ + struct vmcs02_list *item; + list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) + if (item->vmptr == vmptr) { + free_loaded_vmcs(&item->vmcs02); + list_del(&item->list); + kfree(item); + vmx->nested.vmcs02_num--; + return; + } +} + +/* + * Free all VMCSs saved for this vcpu, except the one pointed by + * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one + * currently used, if running L2), and vmcs01 when running L2. + */ +static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) +{ + struct vmcs02_list *item, *n; + list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { + if (vmx->loaded_vmcs != &item->vmcs02) + free_loaded_vmcs(&item->vmcs02); + list_del(&item->list); + kfree(item); + } + vmx->nested.vmcs02_num = 0; + + if (vmx->loaded_vmcs != &vmx->vmcs01) + free_loaded_vmcs(&vmx->vmcs01); +} + +/* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even * inspect the argument to VMXON (the so-called "VMXON pointer") because we @@ -4279,6 +4374,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); + vmx->nested.vmcs02_num = 0; + vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); @@ -4330,6 +4428,8 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; } + + nested_free_all_saved_vmcss(vmx); } /* Emulate the VMXOFF instruction */ -- cgit v1.1 From 22bd035868b06a614debf7352c09fb3efdc7c269 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:05:57 +0300 Subject: KVM: nVMX: Add VMCS fields to the vmcs12 In this patch we add to vmcs12 (the VMCS that L1 keeps for L2) all the standard VMCS fields. Later patches will enable L1 to read and write these fields using VMREAD/ VMWRITE, and they will be used during a VMLAUNCH/VMRESUME in preparing vmcs02, a hardware VMCS for running L2. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 281 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 281 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 03b29ac..33476eb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -158,12 +158,150 @@ struct shared_msr_entry { * machines (necessary for live migration). * If there are changes in this struct, VMCS12_REVISION must be changed. */ +typedef u64 natural_width; struct __packed vmcs12 { /* According to the Intel spec, a VMCS region must start with the * following two fields. Then follow implementation-specific data. */ u32 revision_id; u32 abort; + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 apic_access_addr; + u64 ept_pointer; + u64 guest_physical_address; + u64 vmcs_link_pointer; + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + u64 guest_ia32_perf_global_ctrl; + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + u64 host_ia32_pat; + u64 host_ia32_efer; + u64 host_ia32_perf_global_ctrl; + u64 padding64[8]; /* room for future expansion */ + /* + * To allow migration of L1 (complete with its L2 guests) between + * machines of different natural widths (32 or 64 bit), we cannot have + * unsigned long fields with no explict size. We use u64 (aliased + * natural_width) instead. Luckily, x86 is little-endian. + */ + natural_width cr0_guest_host_mask; + natural_width cr4_guest_host_mask; + natural_width cr0_read_shadow; + natural_width cr4_read_shadow; + natural_width cr3_target_value0; + natural_width cr3_target_value1; + natural_width cr3_target_value2; + natural_width cr3_target_value3; + natural_width exit_qualification; + natural_width guest_linear_address; + natural_width guest_cr0; + natural_width guest_cr3; + natural_width guest_cr4; + natural_width guest_es_base; + natural_width guest_cs_base; + natural_width guest_ss_base; + natural_width guest_ds_base; + natural_width guest_fs_base; + natural_width guest_gs_base; + natural_width guest_ldtr_base; + natural_width guest_tr_base; + natural_width guest_gdtr_base; + natural_width guest_idtr_base; + natural_width guest_dr7; + natural_width guest_rsp; + natural_width guest_rip; + natural_width guest_rflags; + natural_width guest_pending_dbg_exceptions; + natural_width guest_sysenter_esp; + natural_width guest_sysenter_eip; + natural_width host_cr0; + natural_width host_cr3; + natural_width host_cr4; + natural_width host_fs_base; + natural_width host_gs_base; + natural_width host_tr_base; + natural_width host_gdtr_base; + natural_width host_idtr_base; + natural_width host_ia32_sysenter_esp; + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; + natural_width paddingl[8]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + u32 cr3_target_count; + u32 vm_exit_controls; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_controls; + u32 vm_entry_msr_load_count; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + u32 secondary_vm_exec_control; + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + u32 guest_interruptibility_info; + u32 guest_activity_state; + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 padding32[8]; /* room for future expansion */ + u16 virtual_processor_id; + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; }; /* @@ -284,6 +422,149 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } +#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) +#define FIELD(number, name) [number] = VMCS12_OFFSET(name) +#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ + [number##_HIGH] = VMCS12_OFFSET(name)+4 + +static unsigned short vmcs_field_to_offset_table[] = { + FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), + FIELD(GUEST_ES_SELECTOR, guest_es_selector), + FIELD(GUEST_CS_SELECTOR, guest_cs_selector), + FIELD(GUEST_SS_SELECTOR, guest_ss_selector), + FIELD(GUEST_DS_SELECTOR, guest_ds_selector), + FIELD(GUEST_FS_SELECTOR, guest_fs_selector), + FIELD(GUEST_GS_SELECTOR, guest_gs_selector), + FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), + FIELD(GUEST_TR_SELECTOR, guest_tr_selector), + FIELD(HOST_ES_SELECTOR, host_es_selector), + FIELD(HOST_CS_SELECTOR, host_cs_selector), + FIELD(HOST_SS_SELECTOR, host_ss_selector), + FIELD(HOST_DS_SELECTOR, host_ds_selector), + FIELD(HOST_FS_SELECTOR, host_fs_selector), + FIELD(HOST_GS_SELECTOR, host_gs_selector), + FIELD(HOST_TR_SELECTOR, host_tr_selector), + FIELD64(IO_BITMAP_A, io_bitmap_a), + FIELD64(IO_BITMAP_B, io_bitmap_b), + FIELD64(MSR_BITMAP, msr_bitmap), + FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), + FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), + FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), + FIELD64(TSC_OFFSET, tsc_offset), + FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), + FIELD64(APIC_ACCESS_ADDR, apic_access_addr), + FIELD64(EPT_POINTER, ept_pointer), + FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), + FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), + FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), + FIELD64(GUEST_IA32_PAT, guest_ia32_pat), + FIELD64(GUEST_IA32_EFER, guest_ia32_efer), + FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), + FIELD64(GUEST_PDPTR0, guest_pdptr0), + FIELD64(GUEST_PDPTR1, guest_pdptr1), + FIELD64(GUEST_PDPTR2, guest_pdptr2), + FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(HOST_IA32_PAT, host_ia32_pat), + FIELD64(HOST_IA32_EFER, host_ia32_efer), + FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), + FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), + FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), + FIELD(EXCEPTION_BITMAP, exception_bitmap), + FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), + FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), + FIELD(CR3_TARGET_COUNT, cr3_target_count), + FIELD(VM_EXIT_CONTROLS, vm_exit_controls), + FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), + FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), + FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), + FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), + FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), + FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), + FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), + FIELD(TPR_THRESHOLD, tpr_threshold), + FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), + FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), + FIELD(VM_EXIT_REASON, vm_exit_reason), + FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), + FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), + FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), + FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), + FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), + FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), + FIELD(GUEST_ES_LIMIT, guest_es_limit), + FIELD(GUEST_CS_LIMIT, guest_cs_limit), + FIELD(GUEST_SS_LIMIT, guest_ss_limit), + FIELD(GUEST_DS_LIMIT, guest_ds_limit), + FIELD(GUEST_FS_LIMIT, guest_fs_limit), + FIELD(GUEST_GS_LIMIT, guest_gs_limit), + FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), + FIELD(GUEST_TR_LIMIT, guest_tr_limit), + FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), + FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), + FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), + FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), + FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), + FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), + FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), + FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), + FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), + FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), + FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), + FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), + FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), + FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), + FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), + FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), + FIELD(CR0_READ_SHADOW, cr0_read_shadow), + FIELD(CR4_READ_SHADOW, cr4_read_shadow), + FIELD(CR3_TARGET_VALUE0, cr3_target_value0), + FIELD(CR3_TARGET_VALUE1, cr3_target_value1), + FIELD(CR3_TARGET_VALUE2, cr3_target_value2), + FIELD(CR3_TARGET_VALUE3, cr3_target_value3), + FIELD(EXIT_QUALIFICATION, exit_qualification), + FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), + FIELD(GUEST_CR0, guest_cr0), + FIELD(GUEST_CR3, guest_cr3), + FIELD(GUEST_CR4, guest_cr4), + FIELD(GUEST_ES_BASE, guest_es_base), + FIELD(GUEST_CS_BASE, guest_cs_base), + FIELD(GUEST_SS_BASE, guest_ss_base), + FIELD(GUEST_DS_BASE, guest_ds_base), + FIELD(GUEST_FS_BASE, guest_fs_base), + FIELD(GUEST_GS_BASE, guest_gs_base), + FIELD(GUEST_LDTR_BASE, guest_ldtr_base), + FIELD(GUEST_TR_BASE, guest_tr_base), + FIELD(GUEST_GDTR_BASE, guest_gdtr_base), + FIELD(GUEST_IDTR_BASE, guest_idtr_base), + FIELD(GUEST_DR7, guest_dr7), + FIELD(GUEST_RSP, guest_rsp), + FIELD(GUEST_RIP, guest_rip), + FIELD(GUEST_RFLAGS, guest_rflags), + FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), + FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), + FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), + FIELD(HOST_CR0, host_cr0), + FIELD(HOST_CR3, host_cr3), + FIELD(HOST_CR4, host_cr4), + FIELD(HOST_FS_BASE, host_fs_base), + FIELD(HOST_GS_BASE, host_gs_base), + FIELD(HOST_TR_BASE, host_tr_base), + FIELD(HOST_GDTR_BASE, host_gdtr_base), + FIELD(HOST_IDTR_BASE, host_idtr_base), + FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), + FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), + FIELD(HOST_RSP, host_rsp), + FIELD(HOST_RIP, host_rip), +}; +static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table); + +static inline short vmcs_field_to_offset(unsigned long field) +{ + if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0) + return -1; + return vmcs_field_to_offset_table[field]; +} + static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.current_vmcs12; -- cgit v1.1 From 0140caea3b9972f826416a796271f17b42cbe827 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:06:28 +0300 Subject: KVM: nVMX: Success/failure of VMX instructions. VMX instructions specify success or failure by setting certain RFLAGS bits. This patch contains common functions to do this, and they will be used in the following patches which emulate the various VMX instructions. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 31 +++++++++++++++++++++++++++++++ arch/x86/kvm/vmx.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 84471b8..37690bd 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -426,4 +426,35 @@ struct vmx_msr_entry { u64 value; } __aligned(16); +/* + * VM-instruction error numbers + */ +enum vm_instruction_error_number { + VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1, + VMXERR_VMCLEAR_INVALID_ADDRESS = 2, + VMXERR_VMCLEAR_VMXON_POINTER = 3, + VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4, + VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5, + VMXERR_VMRESUME_AFTER_VMXOFF = 6, + VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7, + VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8, + VMXERR_VMPTRLD_INVALID_ADDRESS = 9, + VMXERR_VMPTRLD_VMXON_POINTER = 10, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11, + VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12, + VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13, + VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15, + VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16, + VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17, + VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18, + VMXERR_VMCALL_NONCLEAR_VMCS = 19, + VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20, + VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22, + VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23, + VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24, + VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25, + VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28, +}; + #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 33476eb..8432448 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4777,6 +4777,44 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, } /* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction, as specified + * by Vol 2B, VMX Instruction Reference, "Conventions". + */ +static void nested_vmx_succeed(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); +} + +static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_CF); +} + +static void nested_vmx_failValid(struct kvm_vcpu *vcpu, + u32 vm_instruction_error) +{ + if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { + /* + * failValid writes the error number to the current VMCS, which + * can't be done there isn't a current VMCS. + */ + nested_vmx_failInvalid(vcpu); + return; + } + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_ZF); + get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; +} + +/* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. -- cgit v1.1 From 27d6c865211662721e6cf305706e4a3da35f12b4 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:06:59 +0300 Subject: KVM: nVMX: Implement VMCLEAR This patch implements the VMCLEAR instruction. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 1 + 2 files changed, 65 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8432448..c760000 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -166,6 +166,9 @@ struct __packed vmcs12 { u32 revision_id; u32 abort; + u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ + u32 padding[7]; /* room for future expansion */ + u64 io_bitmap_a; u64 io_bitmap_b; u64 msr_bitmap; @@ -4814,6 +4817,66 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; } +/* Emulate the VMCLEAR instruction */ +static int handle_vmclear(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + gva_t gva; + gpa_t vmptr; + struct vmcs12 *vmcs12; + struct page *page; + struct x86_exception e; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + return 1; + + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, + sizeof(vmptr), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { + nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + + if (vmptr == vmx->nested.current_vmptr) { + kunmap(vmx->nested.current_vmcs12_page); + nested_release_page(vmx->nested.current_vmcs12_page); + vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmcs12 = NULL; + } + + page = nested_get_page(vcpu, vmptr); + if (page == NULL) { + /* + * For accurate processor emulation, VMCLEAR beyond available + * physical memory should do nothing at all. However, it is + * possible that a nested vmx bug, not a guest hypervisor bug, + * resulted in this case, so let's shut down before doing any + * more damage: + */ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return 1; + } + vmcs12 = kmap(page); + vmcs12->launch_state = 0; + kunmap(page); + nested_release_page(page); + + nested_free_vmcs02(vmx, vmptr); + + skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -4835,7 +4898,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_VMCALL] = handle_vmcall, - [EXIT_REASON_VMCLEAR] = handle_vmx_insn, + [EXIT_REASON_VMCLEAR] = handle_vmclear, [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, [EXIT_REASON_VMPTRLD] = handle_vmx_insn, [EXIT_REASON_VMPTRST] = handle_vmx_insn, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27d12a3..c1b5a18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -347,6 +347,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) vcpu->arch.cr2 = fault->address; kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); } +EXPORT_SYMBOL_GPL(kvm_inject_page_fault); void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { -- cgit v1.1 From 63846663eac783eabbc1ab7c325e41a3627d986f Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:07:29 +0300 Subject: KVM: nVMX: Implement VMPTRLD This patch implements the VMPTRLD instruction. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c760000..54866f5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4877,6 +4877,66 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return 1; } +/* Emulate the VMPTRLD instruction */ +static int handle_vmptrld(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + gva_t gva; + gpa_t vmptr; + struct x86_exception e; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + return 1; + + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, + sizeof(vmptr), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { + nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); + skip_emulated_instruction(vcpu); + return 1; + } + + if (vmx->nested.current_vmptr != vmptr) { + struct vmcs12 *new_vmcs12; + struct page *page; + page = nested_get_page(vcpu, vmptr); + if (page == NULL) { + nested_vmx_failInvalid(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + new_vmcs12 = kmap(page); + if (new_vmcs12->revision_id != VMCS12_REVISION) { + kunmap(page); + nested_release_page_clean(page); + nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + skip_emulated_instruction(vcpu); + return 1; + } + if (vmx->nested.current_vmptr != -1ull) { + kunmap(vmx->nested.current_vmcs12_page); + nested_release_page(vmx->nested.current_vmcs12_page); + } + + vmx->nested.current_vmptr = vmptr; + vmx->nested.current_vmcs12 = new_vmcs12; + vmx->nested.current_vmcs12_page = page; + } + + nested_vmx_succeed(vcpu); + skip_emulated_instruction(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -4900,7 +4960,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_VMCLEAR] = handle_vmclear, [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, - [EXIT_REASON_VMPTRLD] = handle_vmx_insn, + [EXIT_REASON_VMPTRLD] = handle_vmptrld, [EXIT_REASON_VMPTRST] = handle_vmx_insn, [EXIT_REASON_VMREAD] = handle_vmx_insn, [EXIT_REASON_VMRESUME] = handle_vmx_insn, -- cgit v1.1 From 6a4d7550601b5b17df227959bdbec208384f729c Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:08:00 +0300 Subject: KVM: nVMX: Implement VMPTRST This patch implements the VMPTRST instruction. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 28 +++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 3 ++- arch/x86/kvm/x86.h | 4 ++++ 3 files changed, 33 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 54866f5..2bc521c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4937,6 +4937,32 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return 1; } +/* Emulate the VMPTRST instruction */ +static int handle_vmptrst(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gva_t vmcs_gva; + struct x86_exception e; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (get_vmx_mem_address(vcpu, exit_qualification, + vmx_instruction_info, &vmcs_gva)) + return 1; + /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ + if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, + (void *)&to_vmx(vcpu)->nested.current_vmptr, + sizeof(u64), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + nested_vmx_succeed(vcpu); + skip_emulated_instruction(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -4961,7 +4987,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMCLEAR] = handle_vmclear, [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, [EXIT_REASON_VMPTRLD] = handle_vmptrld, - [EXIT_REASON_VMPTRST] = handle_vmx_insn, + [EXIT_REASON_VMPTRST] = handle_vmptrst, [EXIT_REASON_VMREAD] = handle_vmx_insn, [EXIT_REASON_VMRESUME] = handle_vmx_insn, [EXIT_REASON_VMWRITE] = handle_vmx_insn, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1b5a18..de262a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3869,7 +3869,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } -static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, +int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) @@ -3901,6 +3901,7 @@ static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, out: return r; } +EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6f15053..256da82 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -85,4 +85,8 @@ int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, + struct x86_exception *exception); + #endif -- cgit v1.1 From 49f705c5324aa13bb5623b392c23996e23eabc23 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:08:30 +0300 Subject: KVM: nVMX: Implement VMREAD and VMWRITE Implement the VMREAD and VMWRITE instructions. With these instructions, L1 can read and write to the VMCS it is holding. The values are read or written to the fields of the vmcs12 structure introduced in a previous patch. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 191 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2bc521c..84d9c93 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4877,6 +4877,195 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return 1; } +enum vmcs_field_type { + VMCS_FIELD_TYPE_U16 = 0, + VMCS_FIELD_TYPE_U64 = 1, + VMCS_FIELD_TYPE_U32 = 2, + VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 +}; + +static inline int vmcs_field_type(unsigned long field) +{ + if (0x1 & field) /* the *_HIGH fields are all 32 bit */ + return VMCS_FIELD_TYPE_U32; + return (field >> 13) & 0x3 ; +} + +static inline int vmcs_field_readonly(unsigned long field) +{ + return (((field >> 10) & 0x3) == 1); +} + +/* + * Read a vmcs12 field. Since these can have varying lengths and we return + * one type, we chose the biggest type (u64) and zero-extend the return value + * to that size. Note that the caller, handle_vmread, might need to use only + * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of + * 64-bit fields are to be returned). + */ +static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, + unsigned long field, u64 *ret) +{ + short offset = vmcs_field_to_offset(field); + char *p; + + if (offset < 0) + return 0; + + p = ((char *)(get_vmcs12(vcpu))) + offset; + + switch (vmcs_field_type(field)) { + case VMCS_FIELD_TYPE_NATURAL_WIDTH: + *ret = *((natural_width *)p); + return 1; + case VMCS_FIELD_TYPE_U16: + *ret = *((u16 *)p); + return 1; + case VMCS_FIELD_TYPE_U32: + *ret = *((u32 *)p); + return 1; + case VMCS_FIELD_TYPE_U64: + *ret = *((u64 *)p); + return 1; + default: + return 0; /* can never happen. */ + } +} + +/* + * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was + * used before) all generate the same failure when it is missing. + */ +static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vmx->nested.current_vmptr == -1ull) { + nested_vmx_failInvalid(vcpu); + skip_emulated_instruction(vcpu); + return 0; + } + return 1; +} + +static int handle_vmread(struct kvm_vcpu *vcpu) +{ + unsigned long field; + u64 field_value; + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gva_t gva = 0; + + if (!nested_vmx_check_permission(vcpu) || + !nested_vmx_check_vmcs12(vcpu)) + return 1; + + /* Decode instruction info and find the field to read */ + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + /* Read the field, zero-extended to a u64 field_value */ + if (!vmcs12_read_any(vcpu, field, &field_value)) { + nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + skip_emulated_instruction(vcpu); + return 1; + } + /* + * Now copy part of this value to register or memory, as requested. + * Note that the number of bits actually copied is 32 or 64 depending + * on the guest's mode (32 or 64 bit), not on the given field's length. + */ + if (vmx_instruction_info & (1u << 10)) { + kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf), + field_value); + } else { + if (get_vmx_mem_address(vcpu, exit_qualification, + vmx_instruction_info, &gva)) + return 1; + /* _system ok, as nested_vmx_check_permission verified cpl=0 */ + kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, + &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); + } + + nested_vmx_succeed(vcpu); + skip_emulated_instruction(vcpu); + return 1; +} + + +static int handle_vmwrite(struct kvm_vcpu *vcpu) +{ + unsigned long field; + gva_t gva; + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + char *p; + short offset; + /* The value to write might be 32 or 64 bits, depending on L1's long + * mode, and eventually we need to write that into a field of several + * possible lengths. The code below first zero-extends the value to 64 + * bit (field_value), and then copies only the approriate number of + * bits into the vmcs12 field. + */ + u64 field_value = 0; + struct x86_exception e; + + if (!nested_vmx_check_permission(vcpu) || + !nested_vmx_check_vmcs12(vcpu)) + return 1; + + if (vmx_instruction_info & (1u << 10)) + field_value = kvm_register_read(vcpu, + (((vmx_instruction_info) >> 3) & 0xf)); + else { + if (get_vmx_mem_address(vcpu, exit_qualification, + vmx_instruction_info, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, + &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + } + + + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + if (vmcs_field_readonly(field)) { + nested_vmx_failValid(vcpu, + VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); + skip_emulated_instruction(vcpu); + return 1; + } + + offset = vmcs_field_to_offset(field); + if (offset < 0) { + nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + skip_emulated_instruction(vcpu); + return 1; + } + p = ((char *) get_vmcs12(vcpu)) + offset; + + switch (vmcs_field_type(field)) { + case VMCS_FIELD_TYPE_U16: + *(u16 *)p = field_value; + break; + case VMCS_FIELD_TYPE_U32: + *(u32 *)p = field_value; + break; + case VMCS_FIELD_TYPE_U64: + *(u64 *)p = field_value; + break; + case VMCS_FIELD_TYPE_NATURAL_WIDTH: + *(natural_width *)p = field_value; + break; + default: + nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + skip_emulated_instruction(vcpu); + return 1; + } + + nested_vmx_succeed(vcpu); + skip_emulated_instruction(vcpu); + return 1; +} + /* Emulate the VMPTRLD instruction */ static int handle_vmptrld(struct kvm_vcpu *vcpu) { @@ -4988,9 +5177,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, [EXIT_REASON_VMPTRLD] = handle_vmptrld, [EXIT_REASON_VMPTRST] = handle_vmptrst, - [EXIT_REASON_VMREAD] = handle_vmx_insn, + [EXIT_REASON_VMREAD] = handle_vmread, [EXIT_REASON_VMRESUME] = handle_vmx_insn, - [EXIT_REASON_VMWRITE] = handle_vmx_insn, + [EXIT_REASON_VMWRITE] = handle_vmwrite, [EXIT_REASON_VMOFF] = handle_vmoff, [EXIT_REASON_VMON] = handle_vmon, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, -- cgit v1.1 From a3a8ff8ebf87cbd828f54276d902c3c8ee0c4781 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:09:01 +0300 Subject: KVM: nVMX: Move host-state field setup to a function Move the setting of constant host-state fields (fields that do not change throughout the life of the guest) from vmx_vcpu_setup to a new common function vmx_set_constant_host_state(). This function will also be used to set the host state when running L2 guests. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 74 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 84d9c93..4b97161 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3395,17 +3395,51 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) } /* + * Set up the vmcs's constant host-state fields, i.e., host-state fields that + * will not change in the lifetime of the guest. + * Note that host-state that does change is set elsewhere. E.g., host-state + * that is set differently for each CPU is set in vmx_vcpu_load(), not here. + */ +static void vmx_set_constant_host_state(void) +{ + u32 low32, high32; + unsigned long tmpl; + struct desc_ptr dt; + + vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ + vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ + vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ + vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ + + native_store_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ + + asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); + vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */ + + rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); + vmcs_write32(HOST_IA32_SYSENTER_CS, low32); + rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); + vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ + + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, low32, high32); + vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); + } +} + +/* * Sets up the vmcs for emulated real mode. */ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { - u32 host_sysenter_cs, msr_low, msr_high; - u32 junk; - u64 host_pat; unsigned long a; - struct desc_ptr dt; int i; - unsigned long kvm_vmx_return; u32 exec_control; /* I/O */ @@ -3462,16 +3496,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ - vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ - vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ - - vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ - vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ - vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmx_set_constant_host_state(); #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ @@ -3482,32 +3509,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ #endif - vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - - native_store_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - - asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); - vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); - rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); - vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); - rdmsrl(MSR_IA32_SYSENTER_ESP, a); - vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ - rdmsrl(MSR_IA32_SYSENTER_EIP, a); - vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ - - if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { - rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); - host_pat = msr_low | ((u64) msr_high << 32); - vmcs_write64(HOST_IA32_PAT, host_pat); - } if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + u32 msr_low, msr_high; + u64 host_pat; rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); host_pat = msr_low | ((u64) msr_high << 32); /* Write the default value follow host pat */ -- cgit v1.1 From bf8179a011d40e7322d34440039b5e86a0d03aed Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:09:31 +0300 Subject: KVM: nVMX: Move control field setup to functions Move some of the control field setup to common functions. These functions will also be needed for running L2 guests - L0's desires (expressed in these functions) will be appropriately merged with L1's desires. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 80 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4b97161..3134638 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3433,6 +3433,49 @@ static void vmx_set_constant_host_state(void) } } +static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) +{ + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); +} + +static u32 vmx_exec_control(struct vcpu_vmx *vmx) +{ + u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + exec_control &= ~CPU_BASED_TPR_SHADOW; +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_STORE_EXITING | + CPU_BASED_CR8_LOAD_EXITING; +#endif + } + if (!enable_ept) + exec_control |= CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_INVLPG_EXITING; + return exec_control; +} + +static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) +{ + u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (vmx->vpid == 0) + exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; + if (!enable_ept) { + exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + enable_unrestricted_guest = 0; + } + if (!enable_unrestricted_guest) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (!ple_gap) + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + return exec_control; +} + /* * Sets up the vmcs for emulated real mode. */ @@ -3440,7 +3483,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { unsigned long a; int i; - u32 exec_control; /* I/O */ vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); @@ -3455,36 +3497,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmcs_config.pin_based_exec_ctrl); - exec_control = vmcs_config.cpu_based_exec_ctrl; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { - exec_control &= ~CPU_BASED_TPR_SHADOW; -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_STORE_EXITING | - CPU_BASED_CR8_LOAD_EXITING; -#endif - } - if (!enable_ept) - exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_INVLPG_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - exec_control &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - if (vmx->vpid == 0) - exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!enable_ept) { - exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; - enable_unrestricted_guest = 0; - } - if (!enable_unrestricted_guest) - exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; - if (!ple_gap) - exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + vmx_secondary_exec_control(vmx)); } if (ple_gap) { @@ -3547,10 +3564,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; - if (enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); + set_cr4_guest_host_mask(vmx); kvm_write_tsc(&vmx->vcpu, 0); -- cgit v1.1 From fe3ef05c7572d68721c1ddd4d36009611f565ba2 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:10:02 +0300 Subject: KVM: nVMX: Prepare vmcs02 from vmcs01 and vmcs12 This patch contains code to prepare the VMCS which can be used to actually run the L2 guest, vmcs02. prepare_vmcs02 appropriately merges the information in vmcs12 (the vmcs that L1 built for L2) and in vmcs01 (our desires for our own guests). Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 279 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 279 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3134638..506c914 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -345,6 +345,12 @@ struct nested_vmx { /* vmcs02_list cache of VMCSs recently used to run L2 guests */ struct list_head vmcs02_pool; int vmcs02_num; + u64 vmcs01_tsc_offset; + /* + * Guest pages referred to in vmcs02 with host-physical pointers, so + * we must keep them pinned while L2 runs. + */ + struct page *apic_access_page; }; struct vcpu_vmx { @@ -847,6 +853,18 @@ static inline bool report_flexpriority(void) return flexpriority_enabled; } +static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) +{ + return vmcs12->cpu_based_vm_exec_control & bit; +} + +static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) +{ + return (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + (vmcs12->secondary_vm_exec_control & bit); +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -1441,6 +1459,22 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); +/* + * Return the cr0 value that a nested guest would read. This is a combination + * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by + * its hypervisor (cr0_read_shadow). + */ +static inline unsigned long nested_read_cr0(struct vmcs12 *fields) +{ + return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | + (fields->cr0_read_shadow & fields->cr0_guest_host_mask); +} +static inline unsigned long nested_read_cr4(struct vmcs12 *fields) +{ + return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | + (fields->cr4_read_shadow & fields->cr4_guest_host_mask); +} + static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { vmx_decache_cr0_guest_bits(vcpu); @@ -3438,6 +3472,9 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; if (enable_ept) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + if (is_guest_mode(&vmx->vcpu)) + vmx->vcpu.arch.cr4_guest_owned_bits &= + ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); } @@ -4736,6 +4773,11 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; } + /* Unpin physical memory we referred to in current vmcs02 */ + if (vmx->nested.apic_access_page) { + nested_release_page(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = 0; + } nested_free_all_saved_vmcss(vmx); } @@ -5810,6 +5852,243 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { } +/* + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it + * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2 + * guest in a way that will both be appropriate to L1's requests, and our + * needs. In addition to modifying the active vmcs (which is vmcs02), this + * function also has additional necessary side-effects, like setting various + * vcpu->arch fields. + */ +static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control; + + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); + vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); + vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); + vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); + vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); + vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); + vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); + vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); + vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); + vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); + vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); + vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); + vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); + vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); + vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); + vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); + vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); + vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); + vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); + vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); + vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); + vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); + vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); + vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); + vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); + vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); + vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); + vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); + vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); + vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); + vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + + vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs12->vm_entry_intr_info_field); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs12->vm_entry_exception_error_code); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs12->vm_entry_instruction_len); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs12->guest_interruptibility_info); + vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state); + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + vmcs_writel(GUEST_DR7, vmcs12->guest_dr7); + vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs12->guest_pending_dbg_exceptions); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); + + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, + (vmcs_config.pin_based_exec_ctrl | + vmcs12->pin_based_vm_exec_control)); + + /* + * Whether page-faults are trapped is determined by a combination of + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. + * If enable_ept, L0 doesn't care about page faults and we should + * set all of these to L1's desires. However, if !enable_ept, L0 does + * care about (at least some) page faults, and because it is not easy + * (if at all possible?) to merge L0 and L1's desires, we simply ask + * to exit on each and every L2 page fault. This is done by setting + * MASK=MATCH=0 and (see below) EB.PF=1. + * Note that below we don't need special code to set EB.PF beyond the + * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, + * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when + * !enable_ept, EB.PF is 1, so the "or" will always be 1. + * + * A problem with this approach (when !enable_ept) is that L1 may be + * injected with more page faults than it asked for. This could have + * caused problems, but in practice existing hypervisors don't care. + * To fix this, we will need to emulate the PFEC checking (on the L1 + * page tables), using walk_addr(), when injecting PFs to L1. + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, + enable_ept ? vmcs12->page_fault_error_code_mask : 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, + enable_ept ? vmcs12->page_fault_error_code_match : 0); + + if (cpu_has_secondary_exec_ctrls()) { + u32 exec_control = vmx_secondary_exec_control(vmx); + if (!vmx->rdtscp_enabled) + exec_control &= ~SECONDARY_EXEC_RDTSCP; + /* Take the following fields only from vmcs12 */ + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (nested_cpu_has(vmcs12, + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) + exec_control |= vmcs12->secondary_vm_exec_control; + + if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { + /* + * Translate L1 physical address to host physical + * address for vmcs02. Keep the page pinned, so this + * physical address remains valid. We keep a reference + * to it so we can release it later. + */ + if (vmx->nested.apic_access_page) /* shouldn't happen */ + nested_release_page(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = + nested_get_page(vcpu, vmcs12->apic_access_addr); + /* + * If translation failed, no matter: This feature asks + * to exit when accessing the given address, and if it + * can never be accessed, this feature won't do + * anything anyway. + */ + if (!vmx->nested.apic_access_page) + exec_control &= + ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + else + vmcs_write64(APIC_ACCESS_ADDR, + page_to_phys(vmx->nested.apic_access_page)); + } + + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + } + + + /* + * Set host-state according to L0's settings (vmcs12 is irrelevant here) + * Some constant fields are set here by vmx_set_constant_host_state(). + * Other fields are different per CPU, and will be set later when + * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. + */ + vmx_set_constant_host_state(); + + /* + * HOST_RSP is normally set correctly in vmx_vcpu_run() just before + * entry, but only if the current (host) sp changed from the value + * we wrote last (vmx->host_rsp). This cache is no longer relevant + * if we switch vmcs, and rather than hold a separate cache per vmcs, + * here we just force the write to happen on entry. + */ + vmx->host_rsp = 0; + + exec_control = vmx_exec_control(vmx); /* L0's desires */ + exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + exec_control &= ~CPU_BASED_TPR_SHADOW; + exec_control |= vmcs12->cpu_based_vm_exec_control; + /* + * Merging of IO and MSR bitmaps not currently supported. + * Rather, exit every time. + */ + exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; + exec_control &= ~CPU_BASED_USE_IO_BITMAPS; + exec_control |= CPU_BASED_UNCOND_IO_EXITING; + + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + + /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the + * bitwise-or of what L1 wants to trap for L2, and what we want to + * trap. Note that CR0.TS also needs updating - we do this later. + */ + update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + + /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ + vmcs_write32(VM_EXIT_CONTROLS, + vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); + vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | + (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); + + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) + vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); + else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); + + + set_cr4_guest_host_mask(vmx); + + vmcs_write64(TSC_OFFSET, + vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); + + if (enable_vpid) { + /* + * Trivially support vpid by letting L2s share their parent + * L1's vpid. TODO: move to a more elaborate solution, giving + * each L2 its own vpid and exposing the vpid feature to L1. + */ + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx_flush_tlb(vcpu); + } + + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) + vcpu->arch.efer = vmcs12->guest_ia32_efer; + if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + vcpu->arch.efer |= (EFER_LMA | EFER_LME); + else + vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); + /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ + vmx_set_efer(vcpu, vcpu->arch.efer); + + /* + * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified + * TS bit (for lazy fpu) and bits which we consider mandatory enabled. + * The CR0_READ_SHADOW is what L2 should have expected to read given + * the specifications by L1; It's not enough to take + * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we + * have more bits than L1 expected. + */ + vmx_set_cr0(vcpu, vmcs12->guest_cr0); + vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); + + vmx_set_cr4(vcpu, vmcs12->guest_cr4); + vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); + + /* shadow page tables on either EPT or shadow page tables */ + kvm_set_cr3(vcpu, vmcs12->guest_cr3); + kvm_mmu_reset_context(vcpu); + + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); + kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); +} + static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage) -- cgit v1.1 From cd232ad02f00286c3f8c9df30948da17212ef905 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:10:33 +0300 Subject: KVM: nVMX: Implement VMLAUNCH and VMRESUME Implement the VMLAUNCH and VMRESUME instructions, allowing a guest hypervisor to run its own guests. This patch does not include some of the necessary validity checks on vmcs12 fields before the entry. These will appear in a separate patch below. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 506c914..1a8b328 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4943,6 +4943,21 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return 1; } +static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); + +/* Emulate the VMLAUNCH instruction */ +static int handle_vmlaunch(struct kvm_vcpu *vcpu) +{ + return nested_vmx_run(vcpu, true); +} + +/* Emulate the VMRESUME instruction */ +static int handle_vmresume(struct kvm_vcpu *vcpu) +{ + + return nested_vmx_run(vcpu, false); +} + enum vmcs_field_type { VMCS_FIELD_TYPE_U16 = 0, VMCS_FIELD_TYPE_U64 = 1, @@ -5240,11 +5255,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_VMCLEAR] = handle_vmclear, - [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, + [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, [EXIT_REASON_VMPTRLD] = handle_vmptrld, [EXIT_REASON_VMPTRST] = handle_vmptrst, [EXIT_REASON_VMREAD] = handle_vmread, - [EXIT_REASON_VMRESUME] = handle_vmx_insn, + [EXIT_REASON_VMRESUME] = handle_vmresume, [EXIT_REASON_VMWRITE] = handle_vmwrite, [EXIT_REASON_VMOFF] = handle_vmoff, [EXIT_REASON_VMON] = handle_vmon, @@ -6089,6 +6104,52 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); } +/* + * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 + * for running an L2 nested guest. + */ +static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + struct loaded_vmcs *vmcs02; + + if (!nested_vmx_check_permission(vcpu) || + !nested_vmx_check_vmcs12(vcpu)) + return 1; + + skip_emulated_instruction(vcpu); + vmcs12 = get_vmcs12(vcpu); + + vmcs02 = nested_get_current_vmcs02(vmx); + if (!vmcs02) + return -ENOMEM; + + enter_guest_mode(vcpu); + + vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); + + cpu = get_cpu(); + vmx->loaded_vmcs = vmcs02; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; + put_cpu(); + + vmcs12->launch_state = 1; + + prepare_vmcs02(vcpu, vmcs12); + + /* + * Note no nested_vmx_succeed or nested_vmx_fail here. At this point + * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet + * returned as far as L1 is concerned. It will only return (and set + * the success flag) when L2 exits (see nested_vmx_vmexit()). + */ + return 1; +} + static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage) -- cgit v1.1 From 99e65e805dea4df061aa4038211112aa96416412 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:11:03 +0300 Subject: KVM: nVMX: No need for handle_vmx_insn function any more Before nested VMX support, the exit handler for a guest executing a VMX instruction (vmclear, vmlaunch, vmptrld, vmptrst, vmread, vmread, vmresume, vmwrite, vmon, vmoff), was handle_vmx_insn(). This handler simply threw a #UD exception. Now that all these exit reasons are properly handled (and emulate the respective VMX instruction), nothing calls this dummy handler and it can be removed. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1a8b328..f8dccb9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4309,12 +4309,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) return 1; } -static int handle_vmx_insn(struct kvm_vcpu *vcpu) -{ - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; -} - static int handle_invd(struct kvm_vcpu *vcpu) { return emulate_instruction(vcpu, 0) == EMULATE_DONE; -- cgit v1.1 From 4704d0befb0721274bda863192c4782febb6b94c Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:11:34 +0300 Subject: KVM: nVMX: Exiting from L2 to L1 This patch implements nested_vmx_vmexit(), called when the nested L2 guest exits and we want to run its L1 parent and let it handle this exit. Note that this will not necessarily be called on every L2 exit. L0 may decide to handle a particular exit on its own, without L1's involvement; In that case, L0 will handle the exit, and resume running L2, without running L1 and without calling nested_vmx_vmexit(). The logic for deciding whether to handle a particular exit in L1 or in L0, i.e., whether to call nested_vmx_vmexit(), will appear in a separate patch below. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 4 + arch/x86/kvm/vmx.c | 262 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 37690bd..b747773 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -132,6 +132,8 @@ enum vmcs_field { GUEST_IA32_PAT_HIGH = 0x00002805, GUEST_IA32_EFER = 0x00002806, GUEST_IA32_EFER_HIGH = 0x00002807, + GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809, GUEST_PDPTR0 = 0x0000280a, GUEST_PDPTR0_HIGH = 0x0000280b, GUEST_PDPTR1 = 0x0000280c, @@ -144,6 +146,8 @@ enum vmcs_field { HOST_IA32_PAT_HIGH = 0x00002c01, HOST_IA32_EFER = 0x00002c02, HOST_IA32_EFER_HIGH = 0x00002c03, + HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f8dccb9..7d778e4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6144,6 +6144,268 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } +/* + * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date + * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). + * This function returns the new value we should put in vmcs12.guest_cr0. + * It's not enough to just return the vmcs02 GUEST_CR0. Rather, + * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now + * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 + * didn't trap the bit, because if L1 did, so would L0). + * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have + * been modified by L2, and L1 knows it. So just leave the old value of + * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 + * isn't relevant, because if L0 traps this bit it can set it to anything. + * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have + * changed these bits, and therefore they need to be updated, but L0 + * didn't necessarily allow them to be changed in GUEST_CR0 - and rather + * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. + */ +static inline unsigned long +vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + return + /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | + /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | + /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | + vcpu->arch.cr0_guest_owned_bits)); +} + +static inline unsigned long +vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + return + /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | + /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | + /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | + vcpu->arch.cr4_guest_owned_bits)); +} + +/* + * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits + * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), + * and this function updates it to reflect the changes to the guest state while + * L2 was running (and perhaps made some exits which were handled directly by L0 + * without going back to L1), and to reflect the exit reason. + * Note that we do not have to copy here all VMCS fields, just those that + * could have changed by the L2 guest or the exit - i.e., the guest-state and + * exit-information fields only. Other fields are modified by L1 with VMWRITE, + * which already writes to vmcs12 directly. + */ +void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + /* update guest state fields: */ + vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); + vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); + + kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); + vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); + + vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); + vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); + vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); + vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); + vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); + vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); + vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); + vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); + vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); + vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); + vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); + vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); + vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); + vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); + vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); + vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); + vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); + vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); + vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); + vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); + vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); + vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); + vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); + vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); + vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); + vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); + vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); + vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); + vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); + vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); + vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); + vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); + vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); + vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); + vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); + + vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE); + vmcs12->guest_interruptibility_info = + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + vmcs12->guest_pending_dbg_exceptions = + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + + /* TODO: These cannot have changed unless we have MSR bitmaps and + * the relevant bit asks not to trap the change */ + vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT) + vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); + vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); + vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + + /* update exit information fields: */ + + vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); + vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + vmcs12->idt_vectoring_info_field = + vmcs_read32(IDT_VECTORING_INFO_FIELD); + vmcs12->idt_vectoring_error_code = + vmcs_read32(IDT_VECTORING_ERROR_CODE); + vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + + /* clear vm-entry fields which are to be cleared on exit */ + if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; +} + +/* + * A part of what we need to when the nested L2 guest exits and we want to + * run its L1 parent, is to reset L1's guest state to the host state specified + * in vmcs12. + * This function is to be called not only on normal nested exit, but also on + * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry + * Failures During or After Loading Guest State"). + * This function should be called when the active VMCS is L1's (vmcs01). + */ +void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) + vcpu->arch.efer = vmcs12->host_ia32_efer; + if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + vcpu->arch.efer |= (EFER_LMA | EFER_LME); + else + vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); + vmx_set_efer(vcpu, vcpu->arch.efer); + + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); + kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); + /* + * Note that calling vmx_set_cr0 is important, even if cr0 hasn't + * actually changed, because it depends on the current state of + * fpu_active (which may have changed). + * Note that vmx_set_cr0 refers to efer set above. + */ + kvm_set_cr0(vcpu, vmcs12->host_cr0); + /* + * If we did fpu_activate()/fpu_deactivate() during L2's run, we need + * to apply the same changes to L1's vmcs. We just set cr0 correctly, + * but we also need to update cr0_guest_host_mask and exception_bitmap. + */ + update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + + /* + * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 + * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); + */ + vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); + kvm_set_cr4(vcpu, vmcs12->host_cr4); + + /* shadow page tables on either EPT or shadow page tables */ + kvm_set_cr3(vcpu, vmcs12->host_cr3); + kvm_mmu_reset_context(vcpu); + + if (enable_vpid) { + /* + * Trivially support vpid by letting L2s share their parent + * L1's vpid. TODO: move to a more elaborate solution, giving + * each L2 its own vpid and exposing the vpid feature to L1. + */ + vmx_flush_tlb(vcpu); + } + + + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); + vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); + vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); + vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); + vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); + vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); + vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); + vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); + vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); + vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); + vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) + vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl); +} + +/* + * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 + * and modify vmcs12 to make it see what it would expect to see there if + * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) + */ +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + leave_guest_mode(vcpu); + prepare_vmcs12(vcpu, vmcs12); + + cpu = get_cpu(); + vmx->loaded_vmcs = &vmx->vmcs01; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; + put_cpu(); + + /* if no vmcs02 cache requested, remove the one we used */ + if (VMCS02_POOL_SIZE == 0) + nested_free_vmcs02(vmx, vmx->nested.current_vmptr); + + load_vmcs12_host_state(vcpu, vmcs12); + + /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */ + vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + + /* This is needed for same reason as it was needed in prepare_vmcs02 */ + vmx->host_rsp = 0; + + /* Unpin physical memory we referred to in vmcs02 */ + if (vmx->nested.apic_access_page) { + nested_release_page(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = 0; + } + + /* + * Exiting from L2 to L1, we're now back to L1 which thinks it just + * finished a VMLAUNCH or VMRESUME instruction, so we need to set the + * success or failure flag accordingly. + */ + if (unlikely(vmx->fail)) { + vmx->fail = 0; + nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); + } else + nested_vmx_succeed(vcpu); +} + static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage) -- cgit v1.1 From 7c1779384a2b2479722e90778721c40811e1b7a7 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:12:04 +0300 Subject: KVM: nVMX: vmcs12 checks on nested entry This patch adds a bunch of tests of the validity of the vmcs12 fields, according to what the VMX spec and our implementation allows. If fields we cannot (or don't want to) honor are discovered, an entry failure is emulated. According to the spec, there are two types of entry failures: If the problem was in vmcs12's host state or control fields, the VMLAUNCH instruction simply fails. But a problem is found in the guest state, the behavior is more similar to that of an exit. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 8 ++++ arch/x86/kvm/vmx.c | 101 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index b747773..2caf290 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -431,6 +431,14 @@ struct vmx_msr_entry { } __aligned(16); /* + * Exit Qualifications for entry failure during or after loading guest state + */ +#define ENTRY_FAIL_DEFAULT 0 +#define ENTRY_FAIL_PDPTE 2 +#define ENTRY_FAIL_NMI 3 +#define ENTRY_FAIL_VMCS_LINK_PTR 4 + +/* * VM-instruction error numbers */ enum vm_instruction_error_number { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7d778e4..ee25b9f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -865,6 +865,10 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12->secondary_vm_exec_control & bit); } +static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + u32 reason, unsigned long qualification); + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -6116,6 +6120,86 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) skip_emulated_instruction(vcpu); vmcs12 = get_vmcs12(vcpu); + /* + * The nested entry process starts with enforcing various prerequisites + * on vmcs12 as required by the Intel SDM, and act appropriately when + * they fail: As the SDM explains, some conditions should cause the + * instruction to fail, while others will cause the instruction to seem + * to succeed, but return an EXIT_REASON_INVALID_STATE. + * To speed up the normal (success) code path, we should avoid checking + * for misconfigurations which will anyway be caught by the processor + * when using the merged vmcs02. + */ + if (vmcs12->launch_state == launch) { + nested_vmx_failValid(vcpu, + launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS + : VMXERR_VMRESUME_NONLAUNCHED_VMCS); + return 1; + } + + if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && + !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { + /*TODO: Also verify bits beyond physical address width are 0*/ + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && + !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) { + /*TODO: Also verify bits beyond physical address width are 0*/ + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + + if (vmcs12->vm_entry_msr_load_count > 0 || + vmcs12->vm_exit_msr_load_count > 0 || + vmcs12->vm_exit_msr_store_count > 0) { + if (printk_ratelimit()) + printk(KERN_WARNING + "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__); + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, + nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) || + !vmx_control_verify(vmcs12->secondary_vm_exec_control, + nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || + !vmx_control_verify(vmcs12->pin_based_vm_exec_control, + nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || + !vmx_control_verify(vmcs12->vm_exit_controls, + nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) || + !vmx_control_verify(vmcs12->vm_entry_controls, + nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high)) + { + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + + if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || + ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { + nested_vmx_failValid(vcpu, + VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); + return 1; + } + + if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || + ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + return 1; + } + if (vmcs12->vmcs_link_pointer != -1ull) { + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); + return 1; + } + + /* + * We're finally done with prerequisite checking, and can start with + * the nested entry. + */ + vmcs02 = nested_get_current_vmcs02(vmx); if (!vmcs02) return -ENOMEM; @@ -6406,6 +6490,23 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) nested_vmx_succeed(vcpu); } +/* + * L1's failure to enter L2 is a subset of a normal exit, as explained in + * 23.7 "VM-entry failures during or after loading guest state" (this also + * lists the acceptable exit-reason and exit-qualification parameters). + * It should only be called before L2 actually succeeded to run, and when + * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss). + */ +static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + u32 reason, unsigned long qualification) +{ + load_vmcs12_host_state(vcpu, vmcs12); + vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; + vmcs12->exit_qualification = qualification; + nested_vmx_succeed(vcpu); +} + static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage) -- cgit v1.1 From 644d711aa0e16111d8aba6d289caebec013e26ea Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:12:35 +0300 Subject: KVM: nVMX: Deciding if L0 or L1 should handle an L2 exit This patch contains the logic of whether an L2 exit should be handled by L0 and then L2 should be resumed, or whether L1 should be run to handle this exit (using the nested_vmx_vmexit() function of the previous patch). The basic idea is to let L1 handle the exit only if it actually asked to trap this sort of event. For example, when L2 exits on a change to CR0, we check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any bit which changed; If it did, we exit to L1. But if it didn't it means that it is we (L0) that wished to trap this event, so we handle it ourselves. The next two patches add additional logic of what to do when an interrupt or exception is injected: Does L0 need to do it, should we exit to L1 to do it, or should we resume L2 and keep the exception to be injected later. We keep a new flag, "nested_run_pending", which can override the decision of which should run next, L1 or L2. nested_run_pending=1 means that we *must* run L2 next, not L1. This is necessary in particular when L1 did a VMLAUNCH of L2 and therefore expects L2 to be run (and perhaps be injected with an event it specified, etc.). Nested_run_pending is especially intended to avoid switching to L1 in the injection decision-point described above. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 253 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 252 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee25b9f..7f62dc3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -346,6 +346,8 @@ struct nested_vmx { struct list_head vmcs02_pool; int vmcs02_num; u64 vmcs01_tsc_offset; + /* L2 must run next, and mustn't decide to exit to L1. */ + bool nested_run_pending; /* * Guest pages referred to in vmcs02 with host-physical pointers, so * we must keep them pinned while L2 runs. @@ -865,6 +867,19 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12->secondary_vm_exec_control & bit); } +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, + struct kvm_vcpu *vcpu) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; +} + +static inline bool is_exception(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); +} + +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu); static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 reason, unsigned long qualification); @@ -5277,6 +5292,229 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); +/* + * Return 1 if we should exit from L2 to L1 to handle an MSR access access, + * rather than handle it ourselves in L0. I.e., check whether L1 expressed + * disinterest in the current event (read or write a specific MSR) by using an + * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. + */ +static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, u32 exit_reason) +{ + u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; + gpa_t bitmap; + + if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) + return 1; + + /* + * The MSR_BITMAP page is divided into four 1024-byte bitmaps, + * for the four combinations of read/write and low/high MSR numbers. + * First we need to figure out which of the four to use: + */ + bitmap = vmcs12->msr_bitmap; + if (exit_reason == EXIT_REASON_MSR_WRITE) + bitmap += 2048; + if (msr_index >= 0xc0000000) { + msr_index -= 0xc0000000; + bitmap += 1024; + } + + /* Then read the msr_index'th bit from this bitmap: */ + if (msr_index < 1024*8) { + unsigned char b; + kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1); + return 1 & (b >> (msr_index & 7)); + } else + return 1; /* let L1 handle the wrong parameter */ +} + +/* + * Return 1 if we should exit from L2 to L1 to handle a CR access exit, + * rather than handle it ourselves in L0. I.e., check if L1 wanted to + * intercept (via guest_host_mask etc.) the current event. + */ +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int cr = exit_qualification & 15; + int reg = (exit_qualification >> 8) & 15; + unsigned long val = kvm_register_read(vcpu, reg); + + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + switch (cr) { + case 0: + if (vmcs12->cr0_guest_host_mask & + (val ^ vmcs12->cr0_read_shadow)) + return 1; + break; + case 3: + if ((vmcs12->cr3_target_count >= 1 && + vmcs12->cr3_target_value0 == val) || + (vmcs12->cr3_target_count >= 2 && + vmcs12->cr3_target_value1 == val) || + (vmcs12->cr3_target_count >= 3 && + vmcs12->cr3_target_value2 == val) || + (vmcs12->cr3_target_count >= 4 && + vmcs12->cr3_target_value3 == val)) + return 0; + if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) + return 1; + break; + case 4: + if (vmcs12->cr4_guest_host_mask & + (vmcs12->cr4_read_shadow ^ val)) + return 1; + break; + case 8: + if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) + return 1; + break; + } + break; + case 2: /* clts */ + if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && + (vmcs12->cr0_read_shadow & X86_CR0_TS)) + return 1; + break; + case 1: /* mov from cr */ + switch (cr) { + case 3: + if (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_CR3_STORE_EXITING) + return 1; + break; + case 8: + if (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_CR8_STORE_EXITING) + return 1; + break; + } + break; + case 3: /* lmsw */ + /* + * lmsw can change bits 1..3 of cr0, and only set bit 0 of + * cr0. Other attempted changes are ignored, with no exit. + */ + if (vmcs12->cr0_guest_host_mask & 0xe & + (val ^ vmcs12->cr0_read_shadow)) + return 1; + if ((vmcs12->cr0_guest_host_mask & 0x1) && + !(vmcs12->cr0_read_shadow & 0x1) && + (val & 0x1)) + return 1; + break; + } + return 0; +} + +/* + * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we + * should handle it ourselves in L0 (and then continue L2). Only call this + * when in is_guest_mode (L2). + */ +static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) +{ + u32 exit_reason = vmcs_read32(VM_EXIT_REASON); + u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (vmx->nested.nested_run_pending) + return 0; + + if (unlikely(vmx->fail)) { + printk(KERN_INFO "%s failed vm entry %x\n", + __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); + return 1; + } + + switch (exit_reason) { + case EXIT_REASON_EXCEPTION_NMI: + if (!is_exception(intr_info)) + return 0; + else if (is_page_fault(intr_info)) + return enable_ept; + return vmcs12->exception_bitmap & + (1u << (intr_info & INTR_INFO_VECTOR_MASK)); + case EXIT_REASON_EXTERNAL_INTERRUPT: + return 0; + case EXIT_REASON_TRIPLE_FAULT: + return 1; + case EXIT_REASON_PENDING_INTERRUPT: + case EXIT_REASON_NMI_WINDOW: + /* + * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit + * (aka Interrupt Window Exiting) only when L1 turned it on, + * so if we got a PENDING_INTERRUPT exit, this must be for L1. + * Same for NMI Window Exiting. + */ + return 1; + case EXIT_REASON_TASK_SWITCH: + return 1; + case EXIT_REASON_CPUID: + return 1; + case EXIT_REASON_HLT: + return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); + case EXIT_REASON_INVD: + return 1; + case EXIT_REASON_INVLPG: + return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); + case EXIT_REASON_RDPMC: + return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); + case EXIT_REASON_RDTSC: + return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); + case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: + /* + * VMX instructions trap unconditionally. This allows L1 to + * emulate them for its L2 guest, i.e., allows 3-level nesting! + */ + return 1; + case EXIT_REASON_CR_ACCESS: + return nested_vmx_exit_handled_cr(vcpu, vmcs12); + case EXIT_REASON_DR_ACCESS: + return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); + case EXIT_REASON_IO_INSTRUCTION: + /* TODO: support IO bitmaps */ + return 1; + case EXIT_REASON_MSR_READ: + case EXIT_REASON_MSR_WRITE: + return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); + case EXIT_REASON_INVALID_STATE: + return 1; + case EXIT_REASON_MWAIT_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); + case EXIT_REASON_MONITOR_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); + case EXIT_REASON_PAUSE_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || + nested_cpu_has2(vmcs12, + SECONDARY_EXEC_PAUSE_LOOP_EXITING); + case EXIT_REASON_MCE_DURING_VMENTRY: + return 0; + case EXIT_REASON_TPR_BELOW_THRESHOLD: + return 1; + case EXIT_REASON_APIC_ACCESS: + return nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + case EXIT_REASON_EPT_VIOLATION: + case EXIT_REASON_EPT_MISCONFIG: + return 0; + case EXIT_REASON_WBINVD: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); + case EXIT_REASON_XSETBV: + return 1; + default: + return 1; + } +} + static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { *info1 = vmcs_readl(EXIT_QUALIFICATION); @@ -5299,6 +5537,17 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (vmx->emulation_required && emulate_invalid_guest_state) return handle_invalid_guest_state(vcpu); + if (exit_reason == EXIT_REASON_VMLAUNCH || + exit_reason == EXIT_REASON_VMRESUME) + vmx->nested.nested_run_pending = 1; + else + vmx->nested.nested_run_pending = 0; + + if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { + nested_vmx_vmexit(vcpu); + return 1; + } + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason @@ -5321,7 +5570,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) "(0x%x) and exit reason is 0x%x\n", __func__, vectoring_info, exit_reason); - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && + !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( + get_vmcs12(vcpu), vcpu)))) { if (vmx_interrupt_allowed(vcpu)) { vmx->soft_vnmi_blocked = 0; } else if (vmx->vnmi_blocked_time > 1000000000LL && -- cgit v1.1 From b6f1250edb4462e38d72c7f6cce35911df21d31b Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:13:06 +0300 Subject: KVM: nVMX: Correct handling of interrupt injection The code in this patch correctly emulates external-interrupt injection while a nested guest L2 is running. Because of this code's relative un-obviousness, I include here a longer-than- usual justification for what it does - much longer than the code itself ;-) To understand how to correctly emulate interrupt injection while L2 is running, let's look first at what we need to emulate: How would things look like if the extra L0 hypervisor layer is removed, and instead of L0 injecting an interrupt, we had hardware delivering an interrupt? Now we have L1 running on bare metal with a guest L2, and the hardware generates an interrupt. Assuming that L1 set PIN_BASED_EXT_INTR_MASK to 1, and VM_EXIT_ACK_INTR_ON_EXIT to 0 (we'll revisit these assumptions below), what happens now is this: The processor exits from L2 to L1, with an external- interrupt exit reason but without an interrupt vector. L1 runs, with interrupts disabled, and it doesn't yet know what the interrupt was. Soon after, it enables interrupts and only at that moment, it gets the interrupt from the processor. when L1 is KVM, Linux handles this interrupt. Now we need exactly the same thing to happen when that L1->L2 system runs on top of L0, instead of real hardware. This is how we do this: When L0 wants to inject an interrupt, it needs to exit from L2 to L1, with external-interrupt exit reason (with an invalid interrupt vector), and run L1. Just like in the bare metal case, it likely can't deliver the interrupt to L1 now because L1 is running with interrupts disabled, in which case it turns on the interrupt window when running L1 after the exit. L1 will soon enable interrupts, and at that point L0 will gain control again and inject the interrupt to L1. Finally, there is an extra complication in the code: when nested_run_pending, we cannot return to L1 now, and must launch L2. We need to remember the interrupt we wanted to inject (and not clear it now), and do it on the next exit. The above explanation shows that the relative strangeness of the nested interrupt injection code in this patch, and the extra interrupt-window exit incurred, are in fact necessary for accurate emulation, and are not just an unoptimized implementation. Let's revisit now the two assumptions made above: If L1 turns off PIN_BASED_EXT_INTR_MASK (no hypervisor that I know does, by the way), things are simple: L0 may inject the interrupt directly to the L2 guest - using the normal code path that injects to any guest. We support this case in the code below. If L1 turns on VM_EXIT_ACK_INTR_ON_EXIT, things look very different from the description above: L1 expects to see an exit from L2 with the interrupt vector already filled in the exit information, and does not expect to be interrupted again with this interrupt. The current code does not (yet) support this case, so we do not allow the VM_EXIT_ACK_INTR_ON_EXIT exit-control to be turned on by L1. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7f62dc3..ab218da 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1790,6 +1790,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* exit controls */ nested_vmx_exit_ctls_low = 0; + /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ #ifdef CONFIG_X86_64 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; #else @@ -3744,9 +3745,25 @@ out: return ret; } +/* + * In nested virtualization, check if L1 asked to exit on external interrupts. + * For most existing hypervisors, this will always return true. + */ +static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->pin_based_vm_exec_control & + PIN_BASED_EXT_INTR_MASK; +} + static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + /* We can get here when nested_run_pending caused + * vmx_interrupt_allowed() to return false. In this case, do + * nothing - the interrupt will be injected later. + */ + return; cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; @@ -3869,6 +3886,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { + struct vmcs12 *vmcs12; + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; + nested_vmx_vmexit(vcpu); + vmcs12 = get_vmcs12(vcpu); + vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; + vmcs12->vm_exit_intr_info = 0; + /* fall through to normal code, but now in L1, not L2 */ + } + return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); @@ -5537,6 +5565,14 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (vmx->emulation_required && emulate_invalid_guest_state) return handle_invalid_guest_state(vcpu); + /* + * the KVM_REQ_EVENT optimization bit is only on for one entry, and if + * we did not inject a still-pending event to L1 now because of + * nested_run_pending, we need to re-enable this bit. + */ + if (vmx->nested.nested_run_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); + if (exit_reason == EXIT_REASON_VMLAUNCH || exit_reason == EXIT_REASON_VMRESUME) vmx->nested.nested_run_pending = 1; -- cgit v1.1 From 0b6ac343fc8e120b7d32fd2d51a8f81354086fa0 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:13:36 +0300 Subject: KVM: nVMX: Correct handling of exception injection Similar to the previous patch, but concerning injection of exceptions rather than external interrupts. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ab218da..9604af7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1585,6 +1585,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); } +/* + * KVM wants to inject page-faults which it got to the guest. This function + * checks whether in a nested guest, we need to inject them to L1 or L2. + * This function assumes it is called with the exit reason in vmcs02 being + * a #PF exception (this is the only case in which KVM injects a #PF when L2 + * is running). + */ +static int nested_pf_handled(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ + if (!(vmcs12->exception_bitmap & PF_VECTOR)) + return 0; + + nested_vmx_vmexit(vcpu); + return 1; +} + static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code, bool reinject) @@ -1592,6 +1611,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info = nr | INTR_INFO_VALID_MASK; + if (nr == PF_VECTOR && is_guest_mode(vcpu) && + nested_pf_handled(vcpu)) + return; + if (has_error_code) { vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; @@ -3820,6 +3843,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (is_guest_mode(vcpu)) + return; + if (!cpu_has_virtual_nmis()) { /* * Tracking the NMI-blocked state in software is built upon -- cgit v1.1 From 66c78ae40cd0a7258d01ef433ede74e33e4adbbe Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:14:07 +0300 Subject: KVM: nVMX: Correct handling of idt vectoring info This patch adds correct handling of IDT_VECTORING_INFO_FIELD for the nested case. When a guest exits while delivering an interrupt or exception, we get this information in IDT_VECTORING_INFO_FIELD in the VMCS. When L2 exits to L1, there's nothing we need to do, because L1 will see this field in vmcs12, and handle it itself. However, when L2 exits and L0 handles the exit itself and plans to return to L2, L0 must inject this event to L2. In the normal non-nested case, the idt_vectoring_info case is discovered after the exit, and the decision to inject (though not the injection itself) is made at that point. However, in the nested case a decision of whether to return to L2 or L1 also happens during the injection phase (see the previous patches), so in the nested case we can only decide what to do about the idt_vectoring_info right after the injection, i.e., in the beginning of vmx_vcpu_run, which is the first time we know for sure if we're staying in L2. Therefore, when we exit L2 (is_guest_mode(vcpu)), we disable the regular vmx_complete_interrupts() code which queues the idt_vectoring_info for injection on next entry - because such injection would not be appropriate if we will decide to exit to L1. Rather, we just save the idt_vectoring_info and related fields in vmcs12 (which is a convenient place to save these fields). On the next entry in vmx_vcpu_run (*after* the injection phase, potentially exiting to L1 to inject an event requested by user space), if we find ourselves in L1 we don't need to do anything with those values we saved (as explained above). But if we find that we're in L2, or rather *still* at L2 (it's not nested_run_pending, meaning that this is the first round of L2 running after L1 having just launched it), we need to inject the event saved in those fields - by writing the appropriate VMCS fields. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9604af7..1e6bd69 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5797,6 +5797,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { + if (is_guest_mode(&vmx->vcpu)) + return; __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_ERROR_CODE); @@ -5804,6 +5806,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) static void vmx_cancel_injection(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu)) + return; __vmx_complete_interrupts(to_vmx(vcpu), vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), VM_ENTRY_INSTRUCTION_LEN, @@ -5824,6 +5828,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + if (vmcs12->idt_vectoring_info_field & + VECTORING_INFO_VALID_MASK) { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs12->idt_vectoring_info_field); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs12->vm_exit_instruction_len); + if (vmcs12->idt_vectoring_info_field & + VECTORING_INFO_DELIVER_CODE_MASK) + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs12->idt_vectoring_error_code); + } + } + /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) vmx->entry_time = ktime_get(); @@ -5956,6 +5975,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + if (is_guest_mode(vcpu)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info; + if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { + vmcs12->idt_vectoring_error_code = + vmcs_read32(IDT_VECTORING_ERROR_CODE); + vmcs12->vm_exit_instruction_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + } + } + asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->loaded_vmcs->launched = 1; -- cgit v1.1 From eeadf9e7558ce2c34c0d91985d26047a6e2245e7 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:14:38 +0300 Subject: KVM: nVMX: Handling of CR0 and CR4 modifying instructions When L2 tries to modify CR0 or CR4 (with mov or clts), and modifies a bit which L1 asked to shadow (via CR[04]_GUEST_HOST_MASK), we already do the right thing: we let L1 handle the trap (see nested_vmx_exit_handled_cr() in a previous patch). When L2 modifies bits that L1 doesn't care about, we let it think (via CR[04]_READ_SHADOW) that it did these modifications, while only changing (in GUEST_CR[04]) the bits that L0 doesn't shadow. This is needed for corect handling of CR0.TS for lazy FPU loading: L0 may want to leave TS on, while pretending to allow the guest to change it. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1e6bd69..6e9bebc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4164,6 +4164,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } +/* called to set cr0 as approriate for a mov-to-cr0 exit. */ +static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (to_vmx(vcpu)->nested.vmxon && + ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) + return 1; + + if (is_guest_mode(vcpu)) { + /* + * We get here when L2 changed cr0 in a way that did not change + * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), + * but did change L0 shadowed bits. This can currently happen + * with the TS bit: L0 may want to leave TS on (for lazy fpu + * loading) while pretending to allow the guest to change it. + */ + if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) | + (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits))) + return 1; + vmcs_writel(CR0_READ_SHADOW, val); + return 0; + } else + return kvm_set_cr0(vcpu, val); +} + +static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_guest_mode(vcpu)) { + if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) | + (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits))) + return 1; + vmcs_writel(CR4_READ_SHADOW, val); + return 0; + } else + return kvm_set_cr4(vcpu, val); +} + +/* called to set cr0 as approriate for clts instruction exit. */ +static void handle_clts(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) { + /* + * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS + * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, + * just pretend it's off (also in arch.cr0 for fpu_activate). + */ + vmcs_writel(CR0_READ_SHADOW, + vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); + vcpu->arch.cr0 &= ~X86_CR0_TS; + } else + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); +} + static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; @@ -4180,7 +4232,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) trace_kvm_cr_write(cr, val); switch (cr) { case 0: - err = kvm_set_cr0(vcpu, val); + err = handle_set_cr0(vcpu, val); kvm_complete_insn_gp(vcpu, err); return 1; case 3: @@ -4188,7 +4240,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) kvm_complete_insn_gp(vcpu, err); return 1; case 4: - err = kvm_set_cr4(vcpu, val); + err = handle_set_cr4(vcpu, val); kvm_complete_insn_gp(vcpu, err); return 1; case 8: { @@ -4206,7 +4258,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) }; break; case 2: /* clts */ - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + handle_clts(vcpu); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); vmx_fpu_activate(vcpu); -- cgit v1.1 From 36cf24e01e9eba8c9ea201202762081ced2f8cdf Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:15:08 +0300 Subject: KVM: nVMX: Further fixes for lazy FPU loading KVM's "Lazy FPU loading" means that sometimes L0 needs to set CR0.TS, even if a guest didn't set it. Moreover, L0 must also trap CR0.TS changes and NM exceptions, even if we have a guest hypervisor (L1) who didn't want these traps. And of course, conversely: If L1 wanted to trap these events, we must let it, even if L0 is not interested in them. This patch fixes some existing KVM code (in update_exception_bitmap(), vmx_fpu_activate(), vmx_fpu_deactivate()) to do the correct merging of L0's and L1's needs. Note that handle_cr() was already fixed in the above patch, and that new code in introduced in previous patches already handles CR0 correctly (see prepare_vmcs02(), prepare_vmcs12(), and nested_vmx_vmexit()). Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6e9bebc..10b2261 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1179,6 +1179,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ if (vcpu->fpu_active) eb &= ~(1u << NM_VECTOR); + + /* When we are running a nested L2 guest and L1 specified for it a + * certain exception bitmap, we must trap the same exceptions and pass + * them to L1. When running L2, we will only handle the exceptions + * specified above if L1 did not want them. + */ + if (is_guest_mode(vcpu)) + eb |= get_vmcs12(vcpu)->exception_bitmap; + vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -1473,6 +1482,9 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CR0, cr0); update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + if (is_guest_mode(vcpu)) + vcpu->arch.cr0_guest_owned_bits &= + ~get_vmcs12(vcpu)->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } @@ -1496,12 +1508,29 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields) static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { + /* Note that there is no vcpu->fpu_active = 0 here. The caller must + * set this *before* calling this function. + */ vmx_decache_cr0_guest_bits(vcpu); vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits = 0; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); + if (is_guest_mode(vcpu)) { + /* + * L1's specified read shadow might not contain the TS bit, + * so now that we turned on shadowing of this bit, we need to + * set this bit of the shadow. Like in nested_vmx_run we need + * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet + * up-to-date here because we just decached cr0.TS (and we'll + * only update vmcs12->guest_cr0 on nested exit). + */ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | + (vcpu->arch.cr0 & X86_CR0_TS); + vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); + } else + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) -- cgit v1.1 From 7991825b8558a719eb7cfb93c4458d767ae1f2eb Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:15:39 +0300 Subject: KVM: nVMX: Additional TSC-offset handling In the unlikely case that L1 does not capture MSR_IA32_TSC, L0 needs to emulate this MSR write by L2 by modifying vmcs02.tsc_offset. We also need to set vmcs12.tsc_offset, for this change to survive the next nested entry (see prepare_vmcs02()). Additionally, we also need to modify vmx_adjust_tsc_offset: The semantics of this function is that the TSC of all guests on this vcpu, L1 and possibly several L2s, need to be adjusted. To do this, we need to adjust vmcs01's tsc_offset (this offset will also apply to each L2s we enter). We can't set vmcs01 now, so we have to remember this adjustment and apply it when we later exit to L1. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 10b2261..afc1f06 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1766,12 +1766,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { vmcs_write64(TSC_OFFSET, offset); + if (is_guest_mode(vcpu)) + /* + * We're here if L1 chose not to trap the TSC MSR. Since + * prepare_vmcs12() does not copy tsc_offset, we need to also + * set the vmcs12 field here. + */ + get_vmcs12(vcpu)->tsc_offset = offset - + to_vmx(vcpu)->nested.vmcs01_tsc_offset; } static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) { u64 offset = vmcs_read64(TSC_OFFSET); vmcs_write64(TSC_OFFSET, offset + adjustment); + if (is_guest_mode(vcpu)) { + /* Even when running L2, the adjustment needs to apply to L1 */ + to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; + } } static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) -- cgit v1.1 From 7b8050f570a03718d21fc8662c54586192ea2dac Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:16:10 +0300 Subject: KVM: nVMX: Add VMX to list of supported cpuid features If the "nested" module option is enabled, add the "VMX" CPU feature to the list of CPU features KVM advertises with the KVM_GET_SUPPORTED_CPUID ioctl. Qemu uses this ioctl, and intersects KVM's list with its own list of desired cpu features (depending on the -cpu option given to qemu) to determine the final list of features presented to the guest. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index afc1f06..a600fd7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6299,6 +6299,8 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { + if (func == 1 && nested) + entry->ecx |= bit(X86_FEATURE_VMX); } /* -- cgit v1.1 From 2844d8490523c5768cc37f21e065c76c45232724 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:16:40 +0300 Subject: KVM: nVMX: Miscellenous small corrections Small corrections of KVM (spelling, etc.) not directly related to nested VMX. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a600fd7..54e732d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -959,7 +959,7 @@ static void vmcs_load(struct vmcs *vmcs) : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc", "memory"); if (error) - printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", vmcs, phys_addr); } -- cgit v1.1 From 9d74191ab1ea857d1cc27e439316eebf8ae46d19 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 21:53:48 +0900 Subject: KVM: x86 emulator: Use the pointers ctxt and c consistently We should use the local variables ctxt and c when the emulate_ctxt and decode appears many times. At least, we need to be consistent about how we use these in a function. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 ++--- arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++------------------------- 2 files changed, 32 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index bc916bd..6c054f8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3707,7 +3707,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) int saved_dst_type = c->dst.type; int irq; /* Used for int 3, int, and into */ - ctxt->decode.mem_read.pos = 0; + c->mem_read.pos = 0; if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { rc = emulate_ud(ctxt); @@ -4092,7 +4092,7 @@ writeback: &c->dst); if (c->rep_prefix && (c->d & String)) { - struct read_cache *r = &ctxt->decode.io_read; + struct read_cache *r = &c->io_read; register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); if (!string_insn_completed(ctxt)) { @@ -4107,7 +4107,7 @@ writeback: * decode, but since instruction is restarted * we have to do it here. */ - ctxt->decode.mem_read.end = 0; + c->mem_read.end = 0; return EMULATION_RESTART; } goto done; /* skip rip writeback */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index de262a0..39d8b04 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4552,24 +4552,24 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); - vcpu->arch.emulate_ctxt.decode.op_bytes = 2; - vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; - vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + - inc_eip; - ret = emulate_int_real(&vcpu->arch.emulate_ctxt, irq); + c->op_bytes = 2; + c->ad_bytes = 2; + c->eip = ctxt->eip + inc_eip; + ret = emulate_int_real(ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; - vcpu->arch.emulate_ctxt.eip = c->eip; + ctxt->eip = c->eip; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_rip_write(vcpu, ctxt->eip); + kvm_set_rflags(vcpu, ctxt->eflags); if (irq == NMI_VECTOR) vcpu->arch.nmi_pending = false; @@ -4630,21 +4630,22 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, int insn_len) { int r; - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; bool writeback = true; kvm_clear_exception_queue(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); - vcpu->arch.emulate_ctxt.interruptibility = 0; - vcpu->arch.emulate_ctxt.have_exception = false; - vcpu->arch.emulate_ctxt.perm_ok = false; + ctxt->interruptibility = 0; + ctxt->have_exception = false; + ctxt->perm_ok = false; - vcpu->arch.emulate_ctxt.only_vendor_specific_insn + ctxt->only_vendor_specific_insn = emulation_type & EMULTYPE_TRAP_UD; - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); + r = x86_decode_insn(ctxt, insn, insn_len); trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; @@ -4660,7 +4661,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + kvm_rip_write(vcpu, c->eip); return EMULATE_DONE; } @@ -4672,7 +4673,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } restart: - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); + r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) return EMULATE_DONE; @@ -4684,7 +4685,7 @@ restart: return handle_emulation_failure(vcpu); } - if (vcpu->arch.emulate_ctxt.have_exception) { + if (ctxt->have_exception) { inject_emulated_exception(vcpu); r = EMULATE_DONE; } else if (vcpu->arch.pio.count) { @@ -4703,13 +4704,12 @@ restart: r = EMULATE_DONE; if (writeback) { - toggle_interruptibility(vcpu, - vcpu->arch.emulate_ctxt.interruptibility); - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + toggle_interruptibility(vcpu, ctxt->interruptibility); + kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + kvm_rip_write(vcpu, ctxt->eip); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -5130,8 +5130,7 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(&vcpu->arch.emulate_ctxt, - rip, instruction, 3, NULL); + return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); } static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) @@ -5849,21 +5848,21 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); - ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, - tss_selector, reason, has_error_code, - error_code); + ret = emulator_task_switch(ctxt, tss_selector, reason, + has_error_code, error_code); if (ret) return EMULATE_FAIL; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_rip_write(vcpu, ctxt->eip); + kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } -- cgit v1.1 From e01991e71a179ddab494c8e02100ad73bc0010c4 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 21:55:10 +0900 Subject: KVM: x86 emulator: Rename emulate_xxx() to em_xxx() The next patch will change these to be called by opcode::execute. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6c054f8..4af9442 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1656,7 +1656,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) return rc; } -static int emulate_iret(struct x86_emulate_ctxt *ctxt) +static int em_iret(struct x86_emulate_ctxt *ctxt) { switch(ctxt->mode) { case X86EMUL_MODE_REAL: @@ -1816,7 +1816,7 @@ static int em_grp9(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int emulate_ret_far(struct x86_emulate_ctxt *ctxt) +static int em_ret_far(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; int rc; @@ -1880,7 +1880,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, ss->p = 1; } -static int emulate_syscall(struct x86_emulate_ctxt *ctxt) +static int em_syscall(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; struct x86_emulate_ops *ops = ctxt->ops; @@ -1933,7 +1933,7 @@ static int emulate_syscall(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int emulate_sysenter(struct x86_emulate_ctxt *ctxt) +static int em_sysenter(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; struct x86_emulate_ops *ops = ctxt->ops; @@ -1989,7 +1989,7 @@ static int emulate_sysenter(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int emulate_sysexit(struct x86_emulate_ctxt *ctxt) +static int em_sysexit(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; struct x86_emulate_ops *ops = ctxt->ops; @@ -3948,7 +3948,7 @@ special_insn: rc = emulate_load_segment(ctxt, VCPU_SREG_DS); break; case 0xcb: /* ret far */ - rc = emulate_ret_far(ctxt); + rc = em_ret_far(ctxt); break; case 0xcc: /* int3 */ irq = 3; @@ -3965,7 +3965,7 @@ special_insn: } break; case 0xcf: /* iret */ - rc = emulate_iret(ctxt); + rc = em_iret(ctxt); break; case 0xd0 ... 0xd1: /* Grp2 */ rc = em_grp2(ctxt); @@ -4127,7 +4127,7 @@ done: twobyte_insn: switch (c->b) { case 0x05: /* syscall */ - rc = emulate_syscall(ctxt); + rc = em_syscall(ctxt); break; case 0x06: rc = em_clts(ctxt); @@ -4189,10 +4189,10 @@ twobyte_insn: rc = X86EMUL_CONTINUE; break; case 0x34: /* sysenter */ - rc = emulate_sysenter(ctxt); + rc = em_sysenter(ctxt); break; case 0x35: /* sysexit */ - rc = emulate_sysexit(ctxt); + rc = em_sysexit(ctxt); break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; -- cgit v1.1 From db5b0762f3cab58398f16379ab37ef66ef9ba497 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 21:56:26 +0900 Subject: KVM: x86 emulator: Use opcode::execute for some instructions Move the following functions to the opcode tables: RET (Far return) : CB IRET : CF JMP (Jump far) : EA SYSCALL : 0F 05 CLTS : 0F 06 SYSENTER : 0F 34 SYSEXIT : 0F 35 Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 37 ++++++++----------------------------- 1 file changed, 8 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4af9442..136bc6c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3169,9 +3169,9 @@ static struct opcode opcode_table[256] = { D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ - N, N, N, D(ImplicitOps | Stack), + N, N, N, I(ImplicitOps | Stack, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), - D(ImplicitOps | No64), DI(ImplicitOps, iret), + D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), N, N, N, N, @@ -3183,7 +3183,7 @@ static struct opcode opcode_table[256] = { D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), /* 0xE8 - 0xEF */ D(SrcImm | Stack), D(SrcImm | ImplicitOps), - D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), + I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), D2bvIP(SrcDX | DstAcc, in, check_perm_in), D2bvIP(SrcAcc | DstDX, out, check_perm_out), /* 0xF0 - 0xF7 */ @@ -3198,7 +3198,8 @@ static struct opcode opcode_table[256] = { static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, - N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, + N, I(ImplicitOps | VendorSpecific, em_syscall), + II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ @@ -3215,7 +3216,8 @@ static struct opcode twobyte_table[256] = { IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), DI(ImplicitOps | Priv, rdmsr), DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), - D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), + I(ImplicitOps | VendorSpecific, em_sysenter), + I(ImplicitOps | Priv | VendorSpecific, em_sysexit), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ @@ -3947,9 +3949,6 @@ special_insn: case 0xc5: /* lds */ rc = emulate_load_segment(ctxt, VCPU_SREG_DS); break; - case 0xcb: /* ret far */ - rc = em_ret_far(ctxt); - break; case 0xcc: /* int3 */ irq = 3; goto do_interrupt; @@ -3964,9 +3963,6 @@ special_insn: goto do_interrupt; } break; - case 0xcf: /* iret */ - rc = em_iret(ctxt); - break; case 0xd0 ... 0xd1: /* Grp2 */ rc = em_grp2(ctxt); break; @@ -3998,12 +3994,7 @@ special_insn: break; } case 0xe9: /* jmp rel */ - goto jmp; - case 0xea: /* jmp far */ - rc = em_jmp_far(ctxt); - break; - case 0xeb: - jmp: /* jmp rel short */ + case 0xeb: /* jmp rel short */ jmp_rel(c, c->src.val); c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -4126,12 +4117,6 @@ done: twobyte_insn: switch (c->b) { - case 0x05: /* syscall */ - rc = em_syscall(ctxt); - break; - case 0x06: - rc = em_clts(ctxt); - break; case 0x09: /* wbinvd */ (ctxt->ops->wbinvd)(ctxt); break; @@ -4188,12 +4173,6 @@ twobyte_insn: } rc = X86EMUL_CONTINUE; break; - case 0x34: /* sysenter */ - rc = em_sysenter(ctxt); - break; - case 0x35: /* sysexit */ - rc = em_sysexit(ctxt); - break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; if (!test_cc(c->b, ctxt->eflags)) -- cgit v1.1 From 9f21ca599cd609502de8a56c1d4c4688d40abb2d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 21:57:53 +0900 Subject: KVM: x86 emulator: Use opcode::execute for TEST(84/85, A8/A9) Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 136bc6c..d25cfc2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2605,6 +2605,14 @@ static int em_cmp(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_test(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + static int em_imul(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -3135,7 +3143,8 @@ static struct opcode opcode_table[256] = { G(DstMem | SrcImm | ModRM | Group, group1), G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), G(DstMem | SrcImmByte | ModRM | Group, group1), - D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), + I2bv(DstMem | SrcReg | ModRM, em_test), + D2bv(DstMem | SrcReg | ModRM | Lock), /* 0x88 - 0x8F */ I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), @@ -3154,7 +3163,7 @@ static struct opcode opcode_table[256] = { I2bv(SrcSI | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstDI | String, em_cmp), /* 0xA8 - 0xAF */ - D2bv(DstAcc | SrcImm), + I2bv(DstAcc | SrcImm, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstAcc | Mov | String, em_mov), I2bv(SrcAcc | DstDI | String, em_cmp), @@ -3873,10 +3882,6 @@ special_insn: if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); break; - case 0x84 ... 0x85: - test: - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); - break; case 0x86 ... 0x87: /* xchg */ xchg: /* Write back the register source. */ @@ -3932,8 +3937,6 @@ special_insn: case 8: c->dst.val = (s32)c->dst.val; break; } break; - case 0xa8 ... 0xa9: /* test ax, imm */ - goto test; case 0xc0 ... 0xc1: rc = em_grp2(ctxt); break; -- cgit v1.1 From e4f973ae913028bac8c07187e0fd49c1dc08ce58 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 21:59:09 +0900 Subject: KVM: x86 emulator: Use opcode::execute for XCHG(86/87) In addition, replace one "goto xchg" with an em_xchg() call. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d25cfc2..c3d071d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2613,6 +2613,20 @@ static int em_test(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_xchg(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + /* Write back the register source. */ + c->src.val = c->dst.val; + write_register_operand(&c->src); + + /* Write back the memory destination with implicit LOCK prefix. */ + c->dst.val = c->src.orig_val; + c->lock_prefix = 1; + return X86EMUL_CONTINUE; +} + static int em_imul(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -3144,7 +3158,7 @@ static struct opcode opcode_table[256] = { G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), G(DstMem | SrcImmByte | ModRM | Group, group1), I2bv(DstMem | SrcReg | ModRM, em_test), - D2bv(DstMem | SrcReg | ModRM | Lock), + I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg), /* 0x88 - 0x8F */ I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), @@ -3882,18 +3896,6 @@ special_insn: if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); break; - case 0x86 ... 0x87: /* xchg */ - xchg: - /* Write back the register source. */ - c->src.val = c->dst.val; - write_register_operand(&c->src); - /* - * Write back the memory destination with implicit LOCK - * prefix. - */ - c->dst.val = c->src.orig_val; - c->lock_prefix = 1; - break; case 0x8c: /* mov r/m, sreg */ if (c->modrm_reg > VCPU_SREG_GS) { rc = emulate_ud(ctxt); @@ -3929,7 +3931,8 @@ special_insn: case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) break; - goto xchg; + rc = em_xchg(ctxt); + break; case 0x98: /* cbw/cwde/cdqe */ switch (c->op_bytes) { case 2: c->dst.val = (s8)c->dst.val; break; -- cgit v1.1 From ebda02c2a5a6001c787f311b4d5a0dc827ce2d92 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 22:00:22 +0900 Subject: KVM: x86 emulator: Use opcode::execute for RET(C3) Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c3d071d..2ebec69 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1816,6 +1816,16 @@ static int em_grp9(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_ret(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_REG; + c->dst.addr.reg = &c->eip; + c->dst.bytes = c->op_bytes; + return em_pop(ctxt); +} + static int em_ret_far(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -3188,7 +3198,7 @@ static struct opcode opcode_table[256] = { /* 0xC0 - 0xC7 */ D2bv(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), - D(ImplicitOps | Stack), + I(ImplicitOps | Stack, em_ret), D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ @@ -3943,12 +3953,6 @@ special_insn: case 0xc0 ... 0xc1: rc = em_grp2(ctxt); break; - case 0xc3: /* ret */ - c->dst.type = OP_REG; - c->dst.addr.reg = &c->eip; - c->dst.bytes = c->op_bytes; - rc = em_pop(ctxt); - break; case 0xc4: /* les */ rc = emulate_load_segment(ctxt, VCPU_SREG_ES); break; -- cgit v1.1 From 1bd5f469b2d54330ba41d9c4b857dc5051e8dcf7 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 22:01:33 +0900 Subject: KVM: x86 emulator: Use opcode::execute for MOV(8C/8E) Different functions for those which take segment register operands. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 59 ++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2ebec69..5561680 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2683,6 +2683,33 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + if (c->modrm_reg > VCPU_SREG_GS) + return emulate_ud(ctxt); + + c->dst.val = get_segment_selector(ctxt, c->modrm_reg); + return X86EMUL_CONTINUE; +} + +static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u16 sel = c->src.val; + + if (c->modrm_reg == VCPU_SREG_CS || c->modrm_reg > VCPU_SREG_GS) + return emulate_ud(ctxt); + + if (c->modrm_reg == VCPU_SREG_SS) + ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + + /* Disable writeback. */ + c->dst.type = OP_NONE; + return load_segment_descriptor(ctxt, sel, c->modrm_reg); +} + static int em_movdqu(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -3172,8 +3199,10 @@ static struct opcode opcode_table[256] = { /* 0x88 - 0x8F */ I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), - D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), - D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), + I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg), + D(ModRM | SrcMem | NoAccess | DstReg), + I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm), + G(0, group1A), /* 0x90 - 0x97 */ DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ @@ -3906,35 +3935,9 @@ special_insn: if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); break; - case 0x8c: /* mov r/m, sreg */ - if (c->modrm_reg > VCPU_SREG_GS) { - rc = emulate_ud(ctxt); - goto done; - } - c->dst.val = get_segment_selector(ctxt, c->modrm_reg); - break; case 0x8d: /* lea r16/r32, m */ c->dst.val = c->src.addr.mem.ea; break; - case 0x8e: { /* mov seg, r/m16 */ - uint16_t sel; - - sel = c->src.val; - - if (c->modrm_reg == VCPU_SREG_CS || - c->modrm_reg > VCPU_SREG_GS) { - rc = emulate_ud(ctxt); - goto done; - } - - if (c->modrm_reg == VCPU_SREG_SS) - ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; - - rc = load_segment_descriptor(ctxt, sel, c->modrm_reg); - - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - } case 0x8f: /* pop (sole member of Grp1a) */ rc = em_grp1a(ctxt); break; -- cgit v1.1 From 5c5df76b8b32055956ee4cca338d29046016b13e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 22:02:55 +0900 Subject: KVM: x86 emulator: Clean up INT n/INTO/INT 3(CC/CD/CE) Call emulate_int() directly to avoid spaghetti goto's. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5561680..d7df7ba 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3769,7 +3769,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int saved_dst_type = c->dst.type; - int irq; /* Used for int 3, int, and into */ c->mem_read.pos = 0; @@ -3963,18 +3962,14 @@ special_insn: rc = emulate_load_segment(ctxt, VCPU_SREG_DS); break; case 0xcc: /* int3 */ - irq = 3; - goto do_interrupt; + rc = emulate_int(ctxt, 3); + break; case 0xcd: /* int n */ - irq = c->src.val; - do_interrupt: - rc = emulate_int(ctxt, irq); + rc = emulate_int(ctxt, c->src.val); break; case 0xce: /* into */ - if (ctxt->eflags & EFLG_OF) { - irq = 4; - goto do_interrupt; - } + if (ctxt->eflags & EFLG_OF) + rc = emulate_int(ctxt, 4); break; case 0xd0 ... 0xd1: /* Grp2 */ rc = em_grp2(ctxt); -- cgit v1.1 From d06e03adcb30f9e9fff4df1d80a3087f54a62d9a Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 22:04:08 +0900 Subject: KVM: x86 emulator: Use opcode::execute for LOOP/JCXZ LOOP/LOOPcc : E0-E2 JCXZ/JECXZ/JRCXZ : E3 Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d7df7ba..e9dbbc9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2824,6 +2824,28 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_loop(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + if ((address_mask(c, c->regs[VCPU_REGS_RCX]) != 0) && + (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) + jmp_rel(c, c->src.val); + + return X86EMUL_CONTINUE; +} + +static int em_jcxz(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) + jmp_rel(c, c->src.val); + + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3240,7 +3262,8 @@ static struct opcode opcode_table[256] = { /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ - X4(D(SrcImmByte)), + X3(I(SrcImmByte, em_loop)), + I(SrcImmByte, em_jcxz), D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), /* 0xE8 - 0xEF */ @@ -3978,16 +4001,6 @@ special_insn: c->src.val = c->regs[VCPU_REGS_RCX]; rc = em_grp2(ctxt); break; - case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ - register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); - if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 && - (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) - jmp_rel(c, c->src.val); - break; - case 0xe3: /* jcxz/jecxz/jrcxz */ - if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) - jmp_rel(c, c->src.val); - break; case 0xe4: /* inb */ case 0xe5: /* in */ goto do_io_in; -- cgit v1.1 From f411e6cdc275e63ead2ffb427d0497daae6f6069 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 22:05:15 +0900 Subject: KVM: x86 emulator: Use opcode::execute for CLI/STI(FA/FB) Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e9dbbc9..663bdb3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2846,6 +2846,25 @@ static int em_jcxz(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_cli(struct x86_emulate_ctxt *ctxt) +{ + if (emulator_bad_iopl(ctxt)) + return emulate_gp(ctxt, 0); + + ctxt->eflags &= ~X86_EFLAGS_IF; + return X86EMUL_CONTINUE; +} + +static int em_sti(struct x86_emulate_ctxt *ctxt) +{ + if (emulator_bad_iopl(ctxt)) + return emulate_gp(ctxt, 0); + + ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; + ctxt->eflags |= X86_EFLAGS_IF; + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3276,7 +3295,8 @@ static struct opcode opcode_table[256] = { DI(ImplicitOps | Priv, hlt), D(ImplicitOps), G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ - D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), + D(ImplicitOps), D(ImplicitOps), + I(ImplicitOps, em_cli), I(ImplicitOps, em_sti), D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), }; @@ -4049,22 +4069,6 @@ special_insn: case 0xf9: /* stc */ ctxt->eflags |= EFLG_CF; break; - case 0xfa: /* cli */ - if (emulator_bad_iopl(ctxt)) { - rc = emulate_gp(ctxt, 0); - goto done; - } else - ctxt->eflags &= ~X86_EFLAGS_IF; - break; - case 0xfb: /* sti */ - if (emulator_bad_iopl(ctxt)) { - rc = emulate_gp(ctxt, 0); - goto done; - } else { - ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; - ctxt->eflags |= X86_EFLAGS_IF; - } - break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; break; -- cgit v1.1 From 2e4ce7f574369f374ad537a180b4870e2098cf0e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 1 Jun 2011 12:57:30 +0200 Subject: KVM: VMX: Silence warning on 32-bit hosts a is unused now on CONFIG_X86_32. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 54e732d..58badf1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3602,7 +3602,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) */ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { +#ifdef CONFIG_X86_64 unsigned long a; +#endif int i; /* I/O */ -- cgit v1.1 From 36dd9bb5ce32bc39e25a5fcc61415f13e3ed5d17 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Jun 2011 15:34:24 +0300 Subject: KVM: x86 emulator: rename decode_cache::eip to _eip The name eip conflicts with a field of the same name in x86_emulate_ctxt, which we plan to fold decode_cache into. The name _eip is unfortunate, but what's really needed is a refactoring here, not a better name. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/kvm/emulate.c | 120 ++++++++++++++++++------------------- arch/x86/kvm/trace.h | 2 +- arch/x86/kvm/x86.c | 6 +- 4 files changed, 65 insertions(+), 65 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c0f77e0..d0e100f 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -253,7 +253,7 @@ struct decode_cache { u8 modrm_rm; u8 modrm_seg; bool rip_relative; - unsigned long eip; + unsigned long _eip; /* Fields above regs are cleared together. */ unsigned long regs[NR_VCPU_REGS]; struct fetch_cache fetch; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 663bdb3..a1b9705 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -459,7 +459,7 @@ register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) static inline void jmp_rel(struct decode_cache *c, int rel) { - register_address_increment(c, &c->eip, rel); + register_address_increment(c, &c->_eip, rel); } static u32 desc_limit_scaled(struct desc_struct *desc) @@ -898,7 +898,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ } - c->modrm = insn_fetch(u8, 1, c->eip); + c->modrm = insn_fetch(u8, 1, c->_eip); c->modrm_mod |= (c->modrm & 0xc0) >> 6; c->modrm_reg |= (c->modrm & 0x38) >> 3; c->modrm_rm |= (c->modrm & 0x07); @@ -932,13 +932,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (c->modrm_mod) { case 0: if (c->modrm_rm == 6) - modrm_ea += insn_fetch(u16, 2, c->eip); + modrm_ea += insn_fetch(u16, 2, c->_eip); break; case 1: - modrm_ea += insn_fetch(s8, 1, c->eip); + modrm_ea += insn_fetch(s8, 1, c->_eip); break; case 2: - modrm_ea += insn_fetch(u16, 2, c->eip); + modrm_ea += insn_fetch(u16, 2, c->_eip); break; } switch (c->modrm_rm) { @@ -975,13 +975,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } else { /* 32/64-bit ModR/M decode. */ if ((c->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, 1, c->eip); + sib = insn_fetch(u8, 1, c->_eip); index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; if ((base_reg & 7) == 5 && c->modrm_mod == 0) - modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->_eip); else modrm_ea += c->regs[base_reg]; if (index_reg != 4) @@ -994,13 +994,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (c->modrm_mod) { case 0: if (c->modrm_rm == 5) - modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->_eip); break; case 1: - modrm_ea += insn_fetch(s8, 1, c->eip); + modrm_ea += insn_fetch(s8, 1, c->_eip); break; case 2: - modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->_eip); break; } } @@ -1018,13 +1018,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; switch (c->ad_bytes) { case 2: - op->addr.mem.ea = insn_fetch(u16, 2, c->eip); + op->addr.mem.ea = insn_fetch(u16, 2, c->_eip); break; case 4: - op->addr.mem.ea = insn_fetch(u32, 4, c->eip); + op->addr.mem.ea = insn_fetch(u32, 4, c->_eip); break; case 8: - op->addr.mem.ea = insn_fetch(u64, 8, c->eip); + op->addr.mem.ea = insn_fetch(u64, 8, c->_eip); break; } done: @@ -1561,7 +1561,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) if (rc != X86EMUL_CONTINUE) return rc; - c->src.val = c->eip; + c->src.val = c->_eip; rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; @@ -1583,7 +1583,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) if (rc != X86EMUL_CONTINUE) return rc; - c->eip = eip; + c->_eip = eip; return rc; } @@ -1640,7 +1640,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - c->eip = temp_eip; + c->_eip = temp_eip; if (c->op_bytes == 4) @@ -1683,8 +1683,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - c->eip = 0; - memcpy(&c->eip, c->src.valptr, c->op_bytes); + c->_eip = 0; + memcpy(&c->_eip, c->src.valptr, c->op_bytes); return X86EMUL_CONTINUE; } @@ -1778,14 +1778,14 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) break; case 2: /* call near abs */ { long int old_eip; - old_eip = c->eip; - c->eip = c->src.val; + old_eip = c->_eip; + c->_eip = c->src.val; c->src.val = old_eip; rc = em_push(ctxt); break; } case 4: /* jmp abs */ - c->eip = c->src.val; + c->_eip = c->src.val; break; case 5: /* jmp far */ rc = em_jmp_far(ctxt); @@ -1821,7 +1821,7 @@ static int em_ret(struct x86_emulate_ctxt *ctxt) struct decode_cache *c = &ctxt->decode; c->dst.type = OP_REG; - c->dst.addr.reg = &c->eip; + c->dst.addr.reg = &c->_eip; c->dst.bytes = c->op_bytes; return em_pop(ctxt); } @@ -1832,11 +1832,11 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) int rc; unsigned long cs; - rc = emulate_pop(ctxt, &c->eip, c->op_bytes); + rc = emulate_pop(ctxt, &c->_eip, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; if (c->op_bytes == 4) - c->eip = (u32)c->eip; + c->_eip = (u32)c->_eip; rc = emulate_pop(ctxt, &cs, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1919,7 +1919,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - c->regs[VCPU_REGS_RCX] = c->eip; + c->regs[VCPU_REGS_RCX] = c->_eip; if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; @@ -1927,7 +1927,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? MSR_LSTAR : MSR_CSTAR, &msr_data); - c->eip = msr_data; + c->_eip = msr_data; ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~(msr_data | EFLG_RF); @@ -1935,7 +1935,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) } else { /* legacy mode */ ops->get_msr(ctxt, MSR_STAR, &msr_data); - c->eip = (u32)msr_data; + c->_eip = (u32)msr_data; ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); } @@ -1991,7 +1991,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); - c->eip = msr_data; + c->_eip = msr_data; ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; @@ -2045,7 +2045,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - c->eip = c->regs[VCPU_REGS_RDX]; + c->_eip = c->regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; return X86EMUL_CONTINUE; @@ -2115,7 +2115,7 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, { struct decode_cache *c = &ctxt->decode; - tss->ip = c->eip; + tss->ip = c->_eip; tss->flag = ctxt->eflags; tss->ax = c->regs[VCPU_REGS_RAX]; tss->cx = c->regs[VCPU_REGS_RCX]; @@ -2139,7 +2139,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int ret; - c->eip = tss->ip; + c->_eip = tss->ip; ctxt->eflags = tss->flag | 2; c->regs[VCPU_REGS_RAX] = tss->ax; c->regs[VCPU_REGS_RCX] = tss->cx; @@ -2233,7 +2233,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; tss->cr3 = ctxt->ops->get_cr(ctxt, 3); - tss->eip = c->eip; + tss->eip = c->_eip; tss->eflags = ctxt->eflags; tss->eax = c->regs[VCPU_REGS_RAX]; tss->ecx = c->regs[VCPU_REGS_RCX]; @@ -2261,7 +2261,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); - c->eip = tss->eip; + c->_eip = tss->eip; ctxt->eflags = tss->eflags | 2; c->regs[VCPU_REGS_RAX] = tss->eax; c->regs[VCPU_REGS_RCX] = tss->ecx; @@ -2446,14 +2446,14 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - c->eip = ctxt->eip; + c->_eip = ctxt->eip; c->dst.type = OP_NONE; rc = emulator_do_task_switch(ctxt, tss_selector, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) - ctxt->eip = c->eip; + ctxt->eip = c->_eip; return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } @@ -2516,14 +2516,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) int rc; old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); - old_eip = c->eip; + old_eip = c->_eip; memcpy(&sel, c->src.valptr + c->op_bytes, 2); if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS)) return X86EMUL_CONTINUE; - c->eip = 0; - memcpy(&c->eip, c->src.valptr, c->op_bytes); + c->_eip = 0; + memcpy(&c->_eip, c->src.valptr, c->op_bytes); c->src.val = old_cs; rc = em_push(ctxt); @@ -2540,7 +2540,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) int rc; c->dst.type = OP_REG; - c->dst.addr.reg = &c->eip; + c->dst.addr.reg = &c->_eip; c->dst.bytes = c->op_bytes; rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2754,7 +2754,7 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt) return rc; /* Let the processor re-execute the fixed hypercall */ - c->eip = ctxt->eip; + c->_eip = ctxt->eip; /* Disable writeback. */ c->dst.type = OP_NONE; return X86EMUL_CONTINUE; @@ -3408,17 +3408,17 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_IMM; op->bytes = size; - op->addr.mem.ea = c->eip; + op->addr.mem.ea = c->_eip; /* NB. Immediates are sign-extended as necessary. */ switch (op->bytes) { case 1: - op->val = insn_fetch(s8, 1, c->eip); + op->val = insn_fetch(s8, 1, c->_eip); break; case 2: - op->val = insn_fetch(s16, 2, c->eip); + op->val = insn_fetch(s16, 2, c->_eip); break; case 4: - op->val = insn_fetch(s32, 4, c->eip); + op->val = insn_fetch(s32, 4, c->_eip); break; } if (!sign_extension) { @@ -3448,8 +3448,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) struct opcode opcode; struct operand memop = { .type = OP_NONE }, *memopp = NULL; - c->eip = ctxt->eip; - c->fetch.start = c->eip; + c->_eip = ctxt->eip; + c->fetch.start = c->_eip; c->fetch.end = c->fetch.start + insn_len; if (insn_len > 0) memcpy(c->fetch.data, insn, insn_len); @@ -3478,7 +3478,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) /* Legacy prefixes. */ for (;;) { - switch (c->b = insn_fetch(u8, 1, c->eip)) { + switch (c->b = insn_fetch(u8, 1, c->_eip)) { case 0x66: /* operand-size override */ op_prefix = true; /* switch between 2/4 bytes */ @@ -3534,7 +3534,7 @@ done_prefixes: /* Two-byte opcode? */ if (c->b == 0x0f) { c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); + c->b = insn_fetch(u8, 1, c->_eip); opcode = twobyte_table[c->b]; } c->d = opcode.flags; @@ -3542,14 +3542,14 @@ done_prefixes: while (c->d & GroupMask) { switch (c->d & GroupMask) { case Group: - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; + c->modrm = insn_fetch(u8, 1, c->_eip); + --c->_eip; goffset = (c->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; + c->modrm = insn_fetch(u8, 1, c->_eip); + --c->_eip; goffset = (c->modrm >> 3) & 7; if ((c->modrm >> 6) == 3) opcode = opcode.u.gdual->mod3[goffset]; @@ -3679,9 +3679,9 @@ done_prefixes: break; case SrcImmFAddr: c->src.type = OP_IMM; - c->src.addr.mem.ea = c->eip; + c->src.addr.mem.ea = c->_eip; c->src.bytes = c->op_bytes + 2; - insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + insn_fetch_arr(c->src.valptr, c->src.bytes, c->_eip); break; case SrcMemFAddr: memop.bytes = c->op_bytes + 2; @@ -3732,9 +3732,9 @@ done_prefixes: break; case DstImmUByte: c->dst.type = OP_IMM; - c->dst.addr.mem.ea = c->eip; + c->dst.addr.mem.ea = c->_eip; c->dst.bytes = 1; - c->dst.val = insn_fetch(u8, 1, c->eip); + c->dst.val = insn_fetch(u8, 1, c->_eip); break; case DstMem: case DstMem64: @@ -3778,7 +3778,7 @@ done_prefixes: done: if (memopp && memopp->type == OP_MEM && c->rip_relative) - memopp->addr.mem.ea += c->eip; + memopp->addr.mem.ea += c->_eip; return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } @@ -3879,7 +3879,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if (c->rep_prefix && (c->d & String)) { /* All REP prefixes have the same first termination condition */ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { - ctxt->eip = c->eip; + ctxt->eip = c->_eip; goto done; } } @@ -4029,7 +4029,7 @@ special_insn: goto do_io_out; case 0xe8: /* call (near) */ { long int rel = c->src.val; - c->src.val = (unsigned long) c->eip; + c->src.val = (unsigned long) c->_eip; jmp_rel(c, rel); rc = em_push(ctxt); break; @@ -4130,7 +4130,7 @@ writeback: } } - ctxt->eip = c->eip; + ctxt->eip = c->_eip; done: if (rc == X86EMUL_PROPAGATE_FAULT) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index db93276..d69e758 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -677,7 +677,7 @@ TRACE_EVENT(kvm_emulate_insn, TP_fast_assign( __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start; __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); - __entry->len = vcpu->arch.emulate_ctxt.decode.eip + __entry->len = vcpu->arch.emulate_ctxt.decode._eip - vcpu->arch.emulate_ctxt.decode.fetch.start; memcpy(__entry->insn, vcpu->arch.emulate_ctxt.decode.fetch.data, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39d8b04..7e452fe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4560,13 +4560,13 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) c->op_bytes = 2; c->ad_bytes = 2; - c->eip = ctxt->eip + inc_eip; + c->_eip = ctxt->eip + inc_eip; ret = emulate_int_real(ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; - ctxt->eip = c->eip; + ctxt->eip = c->_eip; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); @@ -4661,7 +4661,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, c->eip); + kvm_rip_write(vcpu, c->_eip); return EMULATE_DONE; } -- cgit v1.1 From 9dac77fa4011bdb4b541a8db087eac96a602faec Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Jun 2011 15:34:25 +0300 Subject: KVM: x86 emulator: fold decode_cache into x86_emulate_ctxt This saves a lot of pointless casts x86_emulate_ctxt and decode_cache. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 44 +- arch/x86/kvm/emulate.c | 1279 +++++++++++++++++------------------- arch/x86/kvm/trace.h | 8 +- arch/x86/kvm/x86.c | 47 +- 4 files changed, 632 insertions(+), 746 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index d0e100f..6040d11 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -229,7 +229,26 @@ struct read_cache { unsigned long end; }; -struct decode_cache { +struct x86_emulate_ctxt { + struct x86_emulate_ops *ops; + + /* Register state before/after emulation. */ + unsigned long eflags; + unsigned long eip; /* eip before instruction emulation */ + /* Emulated execution mode, represented by an X86EMUL_MODE value. */ + int mode; + + /* interruptibility state, as a result of execution of STI or MOV SS */ + int interruptibility; + + bool guest_mode; /* guest running a nested guest */ + bool perm_ok; /* do not check permissions if true */ + bool only_vendor_specific_insn; + + bool have_exception; + struct x86_exception exception; + + /* decode cache */ u8 twobyte; u8 b; u8 intercept; @@ -261,29 +280,6 @@ struct decode_cache { struct read_cache mem_read; }; -struct x86_emulate_ctxt { - struct x86_emulate_ops *ops; - - /* Register state before/after emulation. */ - unsigned long eflags; - unsigned long eip; /* eip before instruction emulation */ - /* Emulated execution mode, represented by an X86EMUL_MODE value. */ - int mode; - - /* interruptibility state, as a result of execution of STI or MOV SS */ - int interruptibility; - - bool guest_mode; /* guest running a nested guest */ - bool perm_ok; /* do not check permissions if true */ - bool only_vendor_specific_insn; - - bool have_exception; - struct x86_exception exception; - - /* decode cache */ - struct decode_cache decode; -}; - /* Repeat String Operation Prefix */ #define REPE_PREFIX 0xf3 #define REPNE_PREFIX 0xf2 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a1b9705..6f08bc9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -413,53 +413,53 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, { struct x86_instruction_info info = { .intercept = intercept, - .rep_prefix = ctxt->decode.rep_prefix, - .modrm_mod = ctxt->decode.modrm_mod, - .modrm_reg = ctxt->decode.modrm_reg, - .modrm_rm = ctxt->decode.modrm_rm, - .src_val = ctxt->decode.src.val64, - .src_bytes = ctxt->decode.src.bytes, - .dst_bytes = ctxt->decode.dst.bytes, - .ad_bytes = ctxt->decode.ad_bytes, + .rep_prefix = ctxt->rep_prefix, + .modrm_mod = ctxt->modrm_mod, + .modrm_reg = ctxt->modrm_reg, + .modrm_rm = ctxt->modrm_rm, + .src_val = ctxt->src.val64, + .src_bytes = ctxt->src.bytes, + .dst_bytes = ctxt->dst.bytes, + .ad_bytes = ctxt->ad_bytes, .next_rip = ctxt->eip, }; return ctxt->ops->intercept(ctxt, &info, stage); } -static inline unsigned long ad_mask(struct decode_cache *c) +static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) { - return (1UL << (c->ad_bytes << 3)) - 1; + return (1UL << (ctxt->ad_bytes << 3)) - 1; } /* Access/update address held in a register, based on addressing mode. */ static inline unsigned long -address_mask(struct decode_cache *c, unsigned long reg) +address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) { - if (c->ad_bytes == sizeof(unsigned long)) + if (ctxt->ad_bytes == sizeof(unsigned long)) return reg; else - return reg & ad_mask(c); + return reg & ad_mask(ctxt); } static inline unsigned long -register_address(struct decode_cache *c, unsigned long reg) +register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg) { - return address_mask(c, reg); + return address_mask(ctxt, reg); } static inline void -register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) +register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) { - if (c->ad_bytes == sizeof(unsigned long)) + if (ctxt->ad_bytes == sizeof(unsigned long)) *reg += inc; else - *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); + *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); } -static inline void jmp_rel(struct decode_cache *c, int rel) +static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) { - register_address_increment(c, &c->_eip, rel); + register_address_increment(ctxt, &ctxt->_eip, rel); } static u32 desc_limit_scaled(struct desc_struct *desc) @@ -469,10 +469,10 @@ static u32 desc_limit_scaled(struct desc_struct *desc) return desc->g ? (limit << 12) | 0xfff : limit; } -static void set_seg_override(struct decode_cache *c, int seg) +static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg) { - c->has_seg_override = true; - c->seg_override = seg; + ctxt->has_seg_override = true; + ctxt->seg_override = seg; } static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) @@ -483,13 +483,12 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) return ctxt->ops->get_cached_segment_base(ctxt, seg); } -static unsigned seg_override(struct x86_emulate_ctxt *ctxt, - struct decode_cache *c) +static unsigned seg_override(struct x86_emulate_ctxt *ctxt) { - if (!c->has_seg_override) + if (!ctxt->has_seg_override) return 0; - return c->seg_override; + return ctxt->seg_override; } static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, @@ -561,7 +560,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, unsigned size, bool write, bool fetch, ulong *linear) { - struct decode_cache *c = &ctxt->decode; struct desc_struct desc; bool usable; ulong la; @@ -619,7 +617,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, } break; } - if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8) + if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) la &= (u32)-1; *linear = la; return X86EMUL_CONTINUE; @@ -656,7 +654,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, unsigned long eip, u8 *dest) { - struct fetch_cache *fc = &ctxt->decode.fetch; + struct fetch_cache *fc = &ctxt->fetch; int rc; int size, cur_size; @@ -854,16 +852,15 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, static void decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, - struct decode_cache *c, int inhibit_bytereg) { - unsigned reg = c->modrm_reg; - int highbyte_regs = c->rex_prefix == 0; + unsigned reg = ctxt->modrm_reg; + int highbyte_regs = ctxt->rex_prefix == 0; - if (!(c->d & ModRM)) - reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); + if (!(ctxt->d & ModRM)) + reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); - if (c->d & Sse) { + if (ctxt->d & Sse) { op->type = OP_XMM; op->bytes = 16; op->addr.xmm = reg; @@ -872,12 +869,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, } op->type = OP_REG; - if ((c->d & ByteOp) && !inhibit_bytereg) { - op->addr.reg = decode_register(reg, c->regs, highbyte_regs); + if ((ctxt->d & ByteOp) && !inhibit_bytereg) { + op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); op->bytes = 1; } else { - op->addr.reg = decode_register(reg, c->regs, 0); - op->bytes = c->op_bytes; + op->addr.reg = decode_register(reg, ctxt->regs, 0); + op->bytes = ctxt->op_bytes; } fetch_register_operand(op); op->orig_val = op->val; @@ -886,34 +883,33 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct operand *op) { - struct decode_cache *c = &ctxt->decode; u8 sib; int index_reg = 0, base_reg = 0, scale; int rc = X86EMUL_CONTINUE; ulong modrm_ea = 0; - if (c->rex_prefix) { - c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ - index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ - c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ + if (ctxt->rex_prefix) { + ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */ + index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */ + ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ } - c->modrm = insn_fetch(u8, 1, c->_eip); - c->modrm_mod |= (c->modrm & 0xc0) >> 6; - c->modrm_reg |= (c->modrm & 0x38) >> 3; - c->modrm_rm |= (c->modrm & 0x07); - c->modrm_seg = VCPU_SREG_DS; + ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; + ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; + ctxt->modrm_rm |= (ctxt->modrm & 0x07); + ctxt->modrm_seg = VCPU_SREG_DS; - if (c->modrm_mod == 3) { + if (ctxt->modrm_mod == 3) { op->type = OP_REG; - op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - op->addr.reg = decode_register(c->modrm_rm, - c->regs, c->d & ByteOp); - if (c->d & Sse) { + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.reg = decode_register(ctxt->modrm_rm, + ctxt->regs, ctxt->d & ByteOp); + if (ctxt->d & Sse) { op->type = OP_XMM; op->bytes = 16; - op->addr.xmm = c->modrm_rm; - read_sse_reg(ctxt, &op->vec_val, c->modrm_rm); + op->addr.xmm = ctxt->modrm_rm; + read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); return rc; } fetch_register_operand(op); @@ -922,26 +918,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; - if (c->ad_bytes == 2) { - unsigned bx = c->regs[VCPU_REGS_RBX]; - unsigned bp = c->regs[VCPU_REGS_RBP]; - unsigned si = c->regs[VCPU_REGS_RSI]; - unsigned di = c->regs[VCPU_REGS_RDI]; + if (ctxt->ad_bytes == 2) { + unsigned bx = ctxt->regs[VCPU_REGS_RBX]; + unsigned bp = ctxt->regs[VCPU_REGS_RBP]; + unsigned si = ctxt->regs[VCPU_REGS_RSI]; + unsigned di = ctxt->regs[VCPU_REGS_RDI]; /* 16-bit ModR/M decode. */ - switch (c->modrm_mod) { + switch (ctxt->modrm_mod) { case 0: - if (c->modrm_rm == 6) - modrm_ea += insn_fetch(u16, 2, c->_eip); + if (ctxt->modrm_rm == 6) + modrm_ea += insn_fetch(u16, 2, ctxt->_eip); break; case 1: - modrm_ea += insn_fetch(s8, 1, c->_eip); + modrm_ea += insn_fetch(s8, 1, ctxt->_eip); break; case 2: - modrm_ea += insn_fetch(u16, 2, c->_eip); + modrm_ea += insn_fetch(u16, 2, ctxt->_eip); break; } - switch (c->modrm_rm) { + switch (ctxt->modrm_rm) { case 0: modrm_ea += bx + si; break; @@ -961,46 +957,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, modrm_ea += di; break; case 6: - if (c->modrm_mod != 0) + if (ctxt->modrm_mod != 0) modrm_ea += bp; break; case 7: modrm_ea += bx; break; } - if (c->modrm_rm == 2 || c->modrm_rm == 3 || - (c->modrm_rm == 6 && c->modrm_mod != 0)) - c->modrm_seg = VCPU_SREG_SS; + if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 || + (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0)) + ctxt->modrm_seg = VCPU_SREG_SS; modrm_ea = (u16)modrm_ea; } else { /* 32/64-bit ModR/M decode. */ - if ((c->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, 1, c->_eip); + if ((ctxt->modrm_rm & 7) == 4) { + sib = insn_fetch(u8, 1, ctxt->_eip); index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; - if ((base_reg & 7) == 5 && c->modrm_mod == 0) - modrm_ea += insn_fetch(s32, 4, c->_eip); + if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) + modrm_ea += insn_fetch(s32, 4, ctxt->_eip); else - modrm_ea += c->regs[base_reg]; + modrm_ea += ctxt->regs[base_reg]; if (index_reg != 4) - modrm_ea += c->regs[index_reg] << scale; - } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { + modrm_ea += ctxt->regs[index_reg] << scale; + } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) - c->rip_relative = 1; + ctxt->rip_relative = 1; } else - modrm_ea += c->regs[c->modrm_rm]; - switch (c->modrm_mod) { + modrm_ea += ctxt->regs[ctxt->modrm_rm]; + switch (ctxt->modrm_mod) { case 0: - if (c->modrm_rm == 5) - modrm_ea += insn_fetch(s32, 4, c->_eip); + if (ctxt->modrm_rm == 5) + modrm_ea += insn_fetch(s32, 4, ctxt->_eip); break; case 1: - modrm_ea += insn_fetch(s8, 1, c->_eip); + modrm_ea += insn_fetch(s8, 1, ctxt->_eip); break; case 2: - modrm_ea += insn_fetch(s32, 4, c->_eip); + modrm_ea += insn_fetch(s32, 4, ctxt->_eip); break; } } @@ -1012,49 +1008,48 @@ done: static int decode_abs(struct x86_emulate_ctxt *ctxt, struct operand *op) { - struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; op->type = OP_MEM; - switch (c->ad_bytes) { + switch (ctxt->ad_bytes) { case 2: - op->addr.mem.ea = insn_fetch(u16, 2, c->_eip); + op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip); break; case 4: - op->addr.mem.ea = insn_fetch(u32, 4, c->_eip); + op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip); break; case 8: - op->addr.mem.ea = insn_fetch(u64, 8, c->_eip); + op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip); break; } done: return rc; } -static void fetch_bit_operand(struct decode_cache *c) +static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) { long sv = 0, mask; - if (c->dst.type == OP_MEM && c->src.type == OP_REG) { - mask = ~(c->dst.bytes * 8 - 1); + if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) { + mask = ~(ctxt->dst.bytes * 8 - 1); - if (c->src.bytes == 2) - sv = (s16)c->src.val & (s16)mask; - else if (c->src.bytes == 4) - sv = (s32)c->src.val & (s32)mask; + if (ctxt->src.bytes == 2) + sv = (s16)ctxt->src.val & (s16)mask; + else if (ctxt->src.bytes == 4) + sv = (s32)ctxt->src.val & (s32)mask; - c->dst.addr.mem.ea += (sv >> 3); + ctxt->dst.addr.mem.ea += (sv >> 3); } /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; + ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; } static int read_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *dest, unsigned size) { int rc; - struct read_cache *mc = &ctxt->decode.mem_read; + struct read_cache *mc = &ctxt->mem_read; while (size) { int n = min(size, 8u); @@ -1125,16 +1120,15 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, unsigned int size, unsigned short port, void *dest) { - struct read_cache *rc = &ctxt->decode.io_read; + struct read_cache *rc = &ctxt->io_read; if (rc->pos == rc->end) { /* refill pio read ahead */ - struct decode_cache *c = &ctxt->decode; unsigned int in_page, n; - unsigned int count = c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; + unsigned int count = ctxt->rep_prefix ? + address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1; in_page = (ctxt->eflags & EFLG_DF) ? - offset_in_page(c->regs[VCPU_REGS_RDI]) : - PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); + offset_in_page(ctxt->regs[VCPU_REGS_RDI]) : + PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]); n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, count); if (n == 0) @@ -1344,29 +1338,28 @@ static void write_register_operand(struct operand *op) static int writeback(struct x86_emulate_ctxt *ctxt) { int rc; - struct decode_cache *c = &ctxt->decode; - switch (c->dst.type) { + switch (ctxt->dst.type) { case OP_REG: - write_register_operand(&c->dst); + write_register_operand(&ctxt->dst); break; case OP_MEM: - if (c->lock_prefix) + if (ctxt->lock_prefix) rc = segmented_cmpxchg(ctxt, - c->dst.addr.mem, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes); + ctxt->dst.addr.mem, + &ctxt->dst.orig_val, + &ctxt->dst.val, + ctxt->dst.bytes); else rc = segmented_write(ctxt, - c->dst.addr.mem, - &c->dst.val, - c->dst.bytes); + ctxt->dst.addr.mem, + &ctxt->dst.val, + ctxt->dst.bytes); if (rc != X86EMUL_CONTINUE) return rc; break; case OP_XMM: - write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm); + write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); break; case OP_NONE: /* no writeback */ @@ -1379,40 +1372,36 @@ static int writeback(struct x86_emulate_ctxt *ctxt) static int em_push(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; struct segmented_address addr; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); + addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); addr.seg = VCPU_SREG_SS; /* Disable writeback. */ - c->dst.type = OP_NONE; - return segmented_write(ctxt, addr, &c->src.val, c->op_bytes); + ctxt->dst.type = OP_NONE; + return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, void *dest, int len) { - struct decode_cache *c = &ctxt->decode; int rc; struct segmented_address addr; - addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); + addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); addr.seg = VCPU_SREG_SS; rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len); return rc; } static int em_pop(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - return emulate_pop(ctxt, &c->dst.val, c->op_bytes); + return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); } static int emulate_popf(struct x86_emulate_ctxt *ctxt, @@ -1457,30 +1446,25 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, static int em_popf(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - c->dst.type = OP_REG; - c->dst.addr.reg = &ctxt->eflags; - c->dst.bytes = c->op_bytes; - return emulate_popf(ctxt, &c->dst.val, c->op_bytes); + ctxt->dst.type = OP_REG; + ctxt->dst.addr.reg = &ctxt->eflags; + ctxt->dst.bytes = ctxt->op_bytes; + return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); } static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) { - struct decode_cache *c = &ctxt->decode; - - c->src.val = get_segment_selector(ctxt, seg); + ctxt->src.val = get_segment_selector(ctxt, seg); return em_push(ctxt); } static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg) { - struct decode_cache *c = &ctxt->decode; unsigned long selector; int rc; - rc = emulate_pop(ctxt, &selector, c->op_bytes); + rc = emulate_pop(ctxt, &selector, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1490,14 +1474,13 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg) static int em_pusha(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - unsigned long old_esp = c->regs[VCPU_REGS_RSP]; + unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP]; int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; while (reg <= VCPU_REGS_RDI) { (reg == VCPU_REGS_RSP) ? - (c->src.val = old_esp) : (c->src.val = c->regs[reg]); + (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]); rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) @@ -1511,26 +1494,23 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt) static int em_pushf(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - c->src.val = (unsigned long)ctxt->eflags; + ctxt->src.val = (unsigned long)ctxt->eflags; return em_push(ctxt); } static int em_popa(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RDI; while (reg >= VCPU_REGS_RAX) { if (reg == VCPU_REGS_RSP) { - register_address_increment(c, &c->regs[VCPU_REGS_RSP], - c->op_bytes); + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], + ctxt->op_bytes); --reg; } - rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes); + rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) break; --reg; @@ -1540,7 +1520,6 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) { - struct decode_cache *c = &ctxt->decode; struct x86_emulate_ops *ops = ctxt->ops; int rc; struct desc_ptr dt; @@ -1549,19 +1528,19 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) u16 cs, eip; /* TODO: Add limit checks */ - c->src.val = ctxt->eflags; + ctxt->src.val = ctxt->eflags; rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); - c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); + ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; - c->src.val = c->_eip; + ctxt->src.val = ctxt->_eip; rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; @@ -1583,7 +1562,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) if (rc != X86EMUL_CONTINUE) return rc; - c->_eip = eip; + ctxt->_eip = eip; return rc; } @@ -1605,7 +1584,6 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; unsigned long temp_eip = 0; unsigned long temp_eflags = 0; @@ -1617,7 +1595,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) /* TODO: Add stack limit check */ - rc = emulate_pop(ctxt, &temp_eip, c->op_bytes); + rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1625,12 +1603,12 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) if (temp_eip & ~0xffff) return emulate_gp(ctxt, 0); - rc = emulate_pop(ctxt, &cs, c->op_bytes); + rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes); + rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1640,12 +1618,12 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - c->_eip = temp_eip; + ctxt->_eip = temp_eip; - if (c->op_bytes == 4) + if (ctxt->op_bytes == 4) ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); - else if (c->op_bytes == 2) { + else if (ctxt->op_bytes == 2) { ctxt->eflags &= ~0xffff; ctxt->eflags |= temp_eflags; } @@ -1673,53 +1651,49 @@ static int em_iret(struct x86_emulate_ctxt *ctxt) static int em_jmp_far(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; int rc; unsigned short sel; - memcpy(&sel, c->src.valptr + c->op_bytes, 2); + memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS); if (rc != X86EMUL_CONTINUE) return rc; - c->_eip = 0; - memcpy(&c->_eip, c->src.valptr, c->op_bytes); + ctxt->_eip = 0; + memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); return X86EMUL_CONTINUE; } static int em_grp1a(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - return emulate_pop(ctxt, &c->dst.val, c->dst.bytes); + return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes); } static int em_grp2(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - switch (c->modrm_reg) { + switch (ctxt->modrm_reg) { case 0: /* rol */ - emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags); break; case 1: /* ror */ - emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags); break; case 2: /* rcl */ - emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags); break; case 3: /* rcr */ - emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags); break; case 4: /* sal/shl */ case 6: /* sal/shl */ - emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags); break; case 5: /* shr */ - emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags); break; case 7: /* sar */ - emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags); break; } return X86EMUL_CONTINUE; @@ -1727,33 +1701,32 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt) static int em_grp3(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - unsigned long *rax = &c->regs[VCPU_REGS_RAX]; - unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; + unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX]; + unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX]; u8 de = 0; - switch (c->modrm_reg) { + switch (ctxt->modrm_reg) { case 0 ... 1: /* test */ - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); break; case 2: /* not */ - c->dst.val = ~c->dst.val; + ctxt->dst.val = ~ctxt->dst.val; break; case 3: /* neg */ - emulate_1op("neg", c->dst, ctxt->eflags); + emulate_1op("neg", ctxt->dst, ctxt->eflags); break; case 4: /* mul */ - emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); + emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags); break; case 5: /* imul */ - emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); + emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags); break; case 6: /* div */ - emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, + emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx, ctxt->eflags, de); break; case 7: /* idiv */ - emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, + emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx, ctxt->eflags, de); break; default: @@ -1766,26 +1739,25 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) static int em_grp45(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; - switch (c->modrm_reg) { + switch (ctxt->modrm_reg) { case 0: /* inc */ - emulate_1op("inc", c->dst, ctxt->eflags); + emulate_1op("inc", ctxt->dst, ctxt->eflags); break; case 1: /* dec */ - emulate_1op("dec", c->dst, ctxt->eflags); + emulate_1op("dec", ctxt->dst, ctxt->eflags); break; case 2: /* call near abs */ { long int old_eip; - old_eip = c->_eip; - c->_eip = c->src.val; - c->src.val = old_eip; + old_eip = ctxt->_eip; + ctxt->_eip = ctxt->src.val; + ctxt->src.val = old_eip; rc = em_push(ctxt); break; } case 4: /* jmp abs */ - c->_eip = c->src.val; + ctxt->_eip = ctxt->src.val; break; case 5: /* jmp far */ rc = em_jmp_far(ctxt); @@ -1799,17 +1771,16 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) static int em_grp9(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - u64 old = c->dst.orig_val64; + u64 old = ctxt->dst.orig_val64; - if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || - ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { - c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); - c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); + if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) || + ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) { + ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0); + ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32); ctxt->eflags &= ~EFLG_ZF; } else { - c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) | - (u32) c->regs[VCPU_REGS_RBX]; + ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) | + (u32) ctxt->regs[VCPU_REGS_RBX]; ctxt->eflags |= EFLG_ZF; } @@ -1818,26 +1789,23 @@ static int em_grp9(struct x86_emulate_ctxt *ctxt) static int em_ret(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - c->dst.type = OP_REG; - c->dst.addr.reg = &c->_eip; - c->dst.bytes = c->op_bytes; + ctxt->dst.type = OP_REG; + ctxt->dst.addr.reg = &ctxt->_eip; + ctxt->dst.bytes = ctxt->op_bytes; return em_pop(ctxt); } static int em_ret_far(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; int rc; unsigned long cs; - rc = emulate_pop(ctxt, &c->_eip, c->op_bytes); + rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - if (c->op_bytes == 4) - c->_eip = (u32)c->_eip; - rc = emulate_pop(ctxt, &cs, c->op_bytes); + if (ctxt->op_bytes == 4) + ctxt->_eip = (u32)ctxt->_eip; + rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); @@ -1846,17 +1814,16 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg) { - struct decode_cache *c = &ctxt->decode; unsigned short sel; int rc; - memcpy(&sel, c->src.valptr + c->op_bytes, 2); + memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); rc = load_segment_descriptor(ctxt, sel, seg); if (rc != X86EMUL_CONTINUE) return rc; - c->dst.val = c->src.val; + ctxt->dst.val = ctxt->src.val; return rc; } @@ -1892,7 +1859,6 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, static int em_syscall(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; @@ -1919,15 +1885,15 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - c->regs[VCPU_REGS_RCX] = c->_eip; + ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip; if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 - c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; + ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? MSR_LSTAR : MSR_CSTAR, &msr_data); - c->_eip = msr_data; + ctxt->_eip = msr_data; ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~(msr_data | EFLG_RF); @@ -1935,7 +1901,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) } else { /* legacy mode */ ops->get_msr(ctxt, MSR_STAR, &msr_data); - c->_eip = (u32)msr_data; + ctxt->_eip = (u32)msr_data; ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); } @@ -1945,7 +1911,6 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) static int em_sysenter(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; @@ -1991,17 +1956,16 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); - c->_eip = msr_data; + ctxt->_eip = msr_data; ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); - c->regs[VCPU_REGS_RSP] = msr_data; + ctxt->regs[VCPU_REGS_RSP] = msr_data; return X86EMUL_CONTINUE; } static int em_sysexit(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; @@ -2015,7 +1979,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(ctxt, &cs, &ss); - if ((c->rex_prefix & 0x8) != 0x0) + if ((ctxt->rex_prefix & 0x8) != 0x0) usermode = X86EMUL_MODE_PROT64; else usermode = X86EMUL_MODE_PROT32; @@ -2045,8 +2009,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - c->_eip = c->regs[VCPU_REGS_RDX]; - c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; + ctxt->_eip = ctxt->regs[VCPU_REGS_RDX]; + ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX]; return X86EMUL_CONTINUE; } @@ -2113,18 +2077,16 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { - struct decode_cache *c = &ctxt->decode; - - tss->ip = c->_eip; + tss->ip = ctxt->_eip; tss->flag = ctxt->eflags; - tss->ax = c->regs[VCPU_REGS_RAX]; - tss->cx = c->regs[VCPU_REGS_RCX]; - tss->dx = c->regs[VCPU_REGS_RDX]; - tss->bx = c->regs[VCPU_REGS_RBX]; - tss->sp = c->regs[VCPU_REGS_RSP]; - tss->bp = c->regs[VCPU_REGS_RBP]; - tss->si = c->regs[VCPU_REGS_RSI]; - tss->di = c->regs[VCPU_REGS_RDI]; + tss->ax = ctxt->regs[VCPU_REGS_RAX]; + tss->cx = ctxt->regs[VCPU_REGS_RCX]; + tss->dx = ctxt->regs[VCPU_REGS_RDX]; + tss->bx = ctxt->regs[VCPU_REGS_RBX]; + tss->sp = ctxt->regs[VCPU_REGS_RSP]; + tss->bp = ctxt->regs[VCPU_REGS_RBP]; + tss->si = ctxt->regs[VCPU_REGS_RSI]; + tss->di = ctxt->regs[VCPU_REGS_RDI]; tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); @@ -2136,19 +2098,18 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { - struct decode_cache *c = &ctxt->decode; int ret; - c->_eip = tss->ip; + ctxt->_eip = tss->ip; ctxt->eflags = tss->flag | 2; - c->regs[VCPU_REGS_RAX] = tss->ax; - c->regs[VCPU_REGS_RCX] = tss->cx; - c->regs[VCPU_REGS_RDX] = tss->dx; - c->regs[VCPU_REGS_RBX] = tss->bx; - c->regs[VCPU_REGS_RSP] = tss->sp; - c->regs[VCPU_REGS_RBP] = tss->bp; - c->regs[VCPU_REGS_RSI] = tss->si; - c->regs[VCPU_REGS_RDI] = tss->di; + ctxt->regs[VCPU_REGS_RAX] = tss->ax; + ctxt->regs[VCPU_REGS_RCX] = tss->cx; + ctxt->regs[VCPU_REGS_RDX] = tss->dx; + ctxt->regs[VCPU_REGS_RBX] = tss->bx; + ctxt->regs[VCPU_REGS_RSP] = tss->sp; + ctxt->regs[VCPU_REGS_RBP] = tss->bp; + ctxt->regs[VCPU_REGS_RSI] = tss->si; + ctxt->regs[VCPU_REGS_RDI] = tss->di; /* * SDM says that segment selectors are loaded before segment @@ -2230,19 +2191,17 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 *tss) { - struct decode_cache *c = &ctxt->decode; - tss->cr3 = ctxt->ops->get_cr(ctxt, 3); - tss->eip = c->_eip; + tss->eip = ctxt->_eip; tss->eflags = ctxt->eflags; - tss->eax = c->regs[VCPU_REGS_RAX]; - tss->ecx = c->regs[VCPU_REGS_RCX]; - tss->edx = c->regs[VCPU_REGS_RDX]; - tss->ebx = c->regs[VCPU_REGS_RBX]; - tss->esp = c->regs[VCPU_REGS_RSP]; - tss->ebp = c->regs[VCPU_REGS_RBP]; - tss->esi = c->regs[VCPU_REGS_RSI]; - tss->edi = c->regs[VCPU_REGS_RDI]; + tss->eax = ctxt->regs[VCPU_REGS_RAX]; + tss->ecx = ctxt->regs[VCPU_REGS_RCX]; + tss->edx = ctxt->regs[VCPU_REGS_RDX]; + tss->ebx = ctxt->regs[VCPU_REGS_RBX]; + tss->esp = ctxt->regs[VCPU_REGS_RSP]; + tss->ebp = ctxt->regs[VCPU_REGS_RBP]; + tss->esi = ctxt->regs[VCPU_REGS_RSI]; + tss->edi = ctxt->regs[VCPU_REGS_RDI]; tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); @@ -2256,21 +2215,20 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct tss_segment_32 *tss) { - struct decode_cache *c = &ctxt->decode; int ret; if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); - c->_eip = tss->eip; + ctxt->_eip = tss->eip; ctxt->eflags = tss->eflags | 2; - c->regs[VCPU_REGS_RAX] = tss->eax; - c->regs[VCPU_REGS_RCX] = tss->ecx; - c->regs[VCPU_REGS_RDX] = tss->edx; - c->regs[VCPU_REGS_RBX] = tss->ebx; - c->regs[VCPU_REGS_RSP] = tss->esp; - c->regs[VCPU_REGS_RBP] = tss->ebp; - c->regs[VCPU_REGS_RSI] = tss->esi; - c->regs[VCPU_REGS_RDI] = tss->edi; + ctxt->regs[VCPU_REGS_RAX] = tss->eax; + ctxt->regs[VCPU_REGS_RCX] = tss->ecx; + ctxt->regs[VCPU_REGS_RDX] = tss->edx; + ctxt->regs[VCPU_REGS_RBX] = tss->ebx; + ctxt->regs[VCPU_REGS_RSP] = tss->esp; + ctxt->regs[VCPU_REGS_RBP] = tss->ebp; + ctxt->regs[VCPU_REGS_RSI] = tss->esi; + ctxt->regs[VCPU_REGS_RDI] = tss->edi; /* * SDM says that segment selectors are loaded before segment @@ -2428,11 +2386,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); if (has_error_code) { - struct decode_cache *c = &ctxt->decode; - - c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; - c->lock_prefix = 0; - c->src.val = (unsigned long) error_code; + ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; + ctxt->lock_prefix = 0; + ctxt->src.val = (unsigned long) error_code; ret = em_push(ctxt); } @@ -2443,17 +2399,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { - struct decode_cache *c = &ctxt->decode; int rc; - c->_eip = ctxt->eip; - c->dst.type = OP_NONE; + ctxt->_eip = ctxt->eip; + ctxt->dst.type = OP_NONE; rc = emulator_do_task_switch(ctxt, tss_selector, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) - ctxt->eip = c->_eip; + ctxt->eip = ctxt->_eip; return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } @@ -2461,22 +2416,20 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, int reg, struct operand *op) { - struct decode_cache *c = &ctxt->decode; int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; - register_address_increment(c, &c->regs[reg], df * op->bytes); - op->addr.mem.ea = register_address(c, c->regs[reg]); + register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes); + op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]); op->addr.mem.seg = seg; } static int em_das(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; u8 al, old_al; bool af, cf, old_cf; cf = ctxt->eflags & X86_EFLAGS_CF; - al = c->dst.val; + al = ctxt->dst.val; old_al = al; old_cf = cf; @@ -2494,12 +2447,12 @@ static int em_das(struct x86_emulate_ctxt *ctxt) cf = true; } - c->dst.val = al; + ctxt->dst.val = al; /* Set PF, ZF, SF */ - c->src.type = OP_IMM; - c->src.val = 0; - c->src.bytes = 1; - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + ctxt->src.type = OP_IMM; + ctxt->src.val = 0; + ctxt->src.bytes = 1; + emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); if (cf) ctxt->eflags |= X86_EFLAGS_CF; @@ -2510,224 +2463,189 @@ static int em_das(struct x86_emulate_ctxt *ctxt) static int em_call_far(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; u16 sel, old_cs; ulong old_eip; int rc; old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); - old_eip = c->_eip; + old_eip = ctxt->_eip; - memcpy(&sel, c->src.valptr + c->op_bytes, 2); + memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS)) return X86EMUL_CONTINUE; - c->_eip = 0; - memcpy(&c->_eip, c->src.valptr, c->op_bytes); + ctxt->_eip = 0; + memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); - c->src.val = old_cs; + ctxt->src.val = old_cs; rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; - c->src.val = old_eip; + ctxt->src.val = old_eip; return em_push(ctxt); } static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; int rc; - c->dst.type = OP_REG; - c->dst.addr.reg = &c->_eip; - c->dst.bytes = c->op_bytes; - rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); + ctxt->dst.type = OP_REG; + ctxt->dst.addr.reg = &ctxt->_eip; + ctxt->dst.bytes = ctxt->op_bytes; + rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); return X86EMUL_CONTINUE; } static int em_add(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); return X86EMUL_CONTINUE; } static int em_or(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); return X86EMUL_CONTINUE; } static int em_adc(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags); return X86EMUL_CONTINUE; } static int em_sbb(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags); return X86EMUL_CONTINUE; } static int em_and(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags); return X86EMUL_CONTINUE; } static int em_sub(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags); return X86EMUL_CONTINUE; } static int em_xor(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags); return X86EMUL_CONTINUE; } static int em_cmp(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); /* Disable writeback. */ - c->dst.type = OP_NONE; + ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } static int em_test(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); return X86EMUL_CONTINUE; } static int em_xchg(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - /* Write back the register source. */ - c->src.val = c->dst.val; - write_register_operand(&c->src); + ctxt->src.val = ctxt->dst.val; + write_register_operand(&ctxt->src); /* Write back the memory destination with implicit LOCK prefix. */ - c->dst.val = c->src.orig_val; - c->lock_prefix = 1; + ctxt->dst.val = ctxt->src.orig_val; + ctxt->lock_prefix = 1; return X86EMUL_CONTINUE; } static int em_imul(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags); return X86EMUL_CONTINUE; } static int em_imul_3op(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - c->dst.val = c->src2.val; + ctxt->dst.val = ctxt->src2.val; return em_imul(ctxt); } static int em_cwd(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - c->dst.type = OP_REG; - c->dst.bytes = c->src.bytes; - c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; - c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1); + ctxt->dst.type = OP_REG; + ctxt->dst.bytes = ctxt->src.bytes; + ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); return X86EMUL_CONTINUE; } static int em_rdtsc(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; u64 tsc = 0; ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); - c->regs[VCPU_REGS_RAX] = (u32)tsc; - c->regs[VCPU_REGS_RDX] = tsc >> 32; + ctxt->regs[VCPU_REGS_RAX] = (u32)tsc; + ctxt->regs[VCPU_REGS_RDX] = tsc >> 32; return X86EMUL_CONTINUE; } static int em_mov(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - c->dst.val = c->src.val; + ctxt->dst.val = ctxt->src.val; return X86EMUL_CONTINUE; } static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - if (c->modrm_reg > VCPU_SREG_GS) + if (ctxt->modrm_reg > VCPU_SREG_GS) return emulate_ud(ctxt); - c->dst.val = get_segment_selector(ctxt, c->modrm_reg); + ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg); return X86EMUL_CONTINUE; } static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - u16 sel = c->src.val; + u16 sel = ctxt->src.val; - if (c->modrm_reg == VCPU_SREG_CS || c->modrm_reg > VCPU_SREG_GS) + if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS) return emulate_ud(ctxt); - if (c->modrm_reg == VCPU_SREG_SS) + if (ctxt->modrm_reg == VCPU_SREG_SS) ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; /* Disable writeback. */ - c->dst.type = OP_NONE; - return load_segment_descriptor(ctxt, sel, c->modrm_reg); + ctxt->dst.type = OP_NONE; + return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); } static int em_movdqu(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes); + memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes); return X86EMUL_CONTINUE; } static int em_invlpg(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; int rc; ulong linear; - rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); + rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear); if (rc == X86EMUL_CONTINUE) ctxt->ops->invlpg(ctxt, linear); /* Disable writeback. */ - c->dst.type = OP_NONE; + ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } @@ -2743,10 +2661,9 @@ static int em_clts(struct x86_emulate_ctxt *ctxt) static int em_vmcall(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; int rc; - if (c->modrm_mod != 3 || c->modrm_rm != 1) + if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1) return X86EMUL_UNHANDLEABLE; rc = ctxt->ops->fix_hypercall(ctxt); @@ -2754,94 +2671,84 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt) return rc; /* Let the processor re-execute the fixed hypercall */ - c->_eip = ctxt->eip; + ctxt->_eip = ctxt->eip; /* Disable writeback. */ - c->dst.type = OP_NONE; + ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } static int em_lgdt(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; struct desc_ptr desc_ptr; int rc; - rc = read_descriptor(ctxt, c->src.addr.mem, + rc = read_descriptor(ctxt, ctxt->src.addr.mem, &desc_ptr.size, &desc_ptr.address, - c->op_bytes); + ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; ctxt->ops->set_gdt(ctxt, &desc_ptr); /* Disable writeback. */ - c->dst.type = OP_NONE; + ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } static int em_vmmcall(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; int rc; rc = ctxt->ops->fix_hypercall(ctxt); /* Disable writeback. */ - c->dst.type = OP_NONE; + ctxt->dst.type = OP_NONE; return rc; } static int em_lidt(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; struct desc_ptr desc_ptr; int rc; - rc = read_descriptor(ctxt, c->src.addr.mem, + rc = read_descriptor(ctxt, ctxt->src.addr.mem, &desc_ptr.size, &desc_ptr.address, - c->op_bytes); + ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; ctxt->ops->set_idt(ctxt, &desc_ptr); /* Disable writeback. */ - c->dst.type = OP_NONE; + ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } static int em_smsw(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - c->dst.bytes = 2; - c->dst.val = ctxt->ops->get_cr(ctxt, 0); + ctxt->dst.bytes = 2; + ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0); return X86EMUL_CONTINUE; } static int em_lmsw(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) - | (c->src.val & 0x0f)); - c->dst.type = OP_NONE; + | (ctxt->src.val & 0x0f)); + ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } static int em_loop(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); - if ((address_mask(c, c->regs[VCPU_REGS_RCX]) != 0) && - (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) - jmp_rel(c, c->src.val); + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); + if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) && + (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) + jmp_rel(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; } static int em_jcxz(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) - jmp_rel(c, c->src.val); + if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) + jmp_rel(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; } @@ -2879,9 +2786,7 @@ static bool valid_cr(int nr) static int check_cr_read(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - if (!valid_cr(c->modrm_reg)) + if (!valid_cr(ctxt->modrm_reg)) return emulate_ud(ctxt); return X86EMUL_CONTINUE; @@ -2889,9 +2794,8 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt) static int check_cr_write(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - u64 new_val = c->src.val64; - int cr = c->modrm_reg; + u64 new_val = ctxt->src.val64; + int cr = ctxt->modrm_reg; u64 efer = 0; static u64 cr_reserved_bits[] = { @@ -2968,8 +2872,7 @@ static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) static int check_dr_read(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - int dr = c->modrm_reg; + int dr = ctxt->modrm_reg; u64 cr4; if (dr > 7) @@ -2987,9 +2890,8 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) static int check_dr_write(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - u64 new_val = c->src.val64; - int dr = c->modrm_reg; + u64 new_val = ctxt->src.val64; + int dr = ctxt->modrm_reg; if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) return emulate_gp(ctxt, 0); @@ -3011,7 +2913,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt) static int check_svme_pa(struct x86_emulate_ctxt *ctxt) { - u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; + u64 rax = ctxt->regs[VCPU_REGS_RAX]; /* Valid physical address? */ if (rax & 0xffff000000000000ULL) @@ -3033,7 +2935,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) static int check_rdpmc(struct x86_emulate_ctxt *ctxt) { u64 cr4 = ctxt->ops->get_cr(ctxt, 4); - u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX]; + u64 rcx = ctxt->regs[VCPU_REGS_RCX]; if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || (rcx > 3)) @@ -3044,10 +2946,8 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) static int check_perm_in(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, c->src.val, c->dst.bytes)) + ctxt->dst.bytes = min(ctxt->dst.bytes, 4u); + if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -3055,10 +2955,8 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt) static int check_perm_out(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - - c->src.bytes = min(c->src.bytes, 4u); - if (!emulator_io_permited(ctxt, c->dst.val, c->src.bytes)) + ctxt->src.bytes = min(ctxt->src.bytes, 4u); + if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -3390,11 +3288,11 @@ static struct opcode twobyte_table[256] = { #undef I2bv #undef I6ALU -static unsigned imm_size(struct decode_cache *c) +static unsigned imm_size(struct x86_emulate_ctxt *ctxt) { unsigned size; - size = (c->d & ByteOp) ? 1 : c->op_bytes; + size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; if (size == 8) size = 4; return size; @@ -3403,22 +3301,21 @@ static unsigned imm_size(struct decode_cache *c) static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, unsigned size, bool sign_extension) { - struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; op->type = OP_IMM; op->bytes = size; - op->addr.mem.ea = c->_eip; + op->addr.mem.ea = ctxt->_eip; /* NB. Immediates are sign-extended as necessary. */ switch (op->bytes) { case 1: - op->val = insn_fetch(s8, 1, c->_eip); + op->val = insn_fetch(s8, 1, ctxt->_eip); break; case 2: - op->val = insn_fetch(s16, 2, c->_eip); + op->val = insn_fetch(s16, 2, ctxt->_eip); break; case 4: - op->val = insn_fetch(s32, 4, c->_eip); + op->val = insn_fetch(s32, 4, ctxt->_eip); break; } if (!sign_extension) { @@ -3440,7 +3337,6 @@ done: int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) { - struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, goffset, simd_prefix; @@ -3448,11 +3344,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) struct opcode opcode; struct operand memop = { .type = OP_NONE }, *memopp = NULL; - c->_eip = ctxt->eip; - c->fetch.start = c->_eip; - c->fetch.end = c->fetch.start + insn_len; + ctxt->_eip = ctxt->eip; + ctxt->fetch.start = ctxt->_eip; + ctxt->fetch.end = ctxt->fetch.start + insn_len; if (insn_len > 0) - memcpy(c->fetch.data, insn, insn_len); + memcpy(ctxt->fetch.data, insn, insn_len); switch (mode) { case X86EMUL_MODE_REAL: @@ -3473,46 +3369,46 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) return -1; } - c->op_bytes = def_op_bytes; - c->ad_bytes = def_ad_bytes; + ctxt->op_bytes = def_op_bytes; + ctxt->ad_bytes = def_ad_bytes; /* Legacy prefixes. */ for (;;) { - switch (c->b = insn_fetch(u8, 1, c->_eip)) { + switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) { case 0x66: /* operand-size override */ op_prefix = true; /* switch between 2/4 bytes */ - c->op_bytes = def_op_bytes ^ 6; + ctxt->op_bytes = def_op_bytes ^ 6; break; case 0x67: /* address-size override */ if (mode == X86EMUL_MODE_PROT64) /* switch between 4/8 bytes */ - c->ad_bytes = def_ad_bytes ^ 12; + ctxt->ad_bytes = def_ad_bytes ^ 12; else /* switch between 2/4 bytes */ - c->ad_bytes = def_ad_bytes ^ 6; + ctxt->ad_bytes = def_ad_bytes ^ 6; break; case 0x26: /* ES override */ case 0x2e: /* CS override */ case 0x36: /* SS override */ case 0x3e: /* DS override */ - set_seg_override(c, (c->b >> 3) & 3); + set_seg_override(ctxt, (ctxt->b >> 3) & 3); break; case 0x64: /* FS override */ case 0x65: /* GS override */ - set_seg_override(c, c->b & 7); + set_seg_override(ctxt, ctxt->b & 7); break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) goto done_prefixes; - c->rex_prefix = c->b; + ctxt->rex_prefix = ctxt->b; continue; case 0xf0: /* LOCK */ - c->lock_prefix = 1; + ctxt->lock_prefix = 1; break; case 0xf2: /* REPNE/REPNZ */ case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = c->b; + ctxt->rep_prefix = ctxt->b; break; default: goto done_prefixes; @@ -3520,50 +3416,50 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) /* Any legacy prefix after a REX prefix nullifies its effect. */ - c->rex_prefix = 0; + ctxt->rex_prefix = 0; } done_prefixes: /* REX prefix. */ - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ + if (ctxt->rex_prefix & 8) + ctxt->op_bytes = 8; /* REX.W */ /* Opcode byte(s). */ - opcode = opcode_table[c->b]; + opcode = opcode_table[ctxt->b]; /* Two-byte opcode? */ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->_eip); - opcode = twobyte_table[c->b]; + if (ctxt->b == 0x0f) { + ctxt->twobyte = 1; + ctxt->b = insn_fetch(u8, 1, ctxt->_eip); + opcode = twobyte_table[ctxt->b]; } - c->d = opcode.flags; + ctxt->d = opcode.flags; - while (c->d & GroupMask) { - switch (c->d & GroupMask) { + while (ctxt->d & GroupMask) { + switch (ctxt->d & GroupMask) { case Group: - c->modrm = insn_fetch(u8, 1, c->_eip); - --c->_eip; - goffset = (c->modrm >> 3) & 7; + ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + --ctxt->_eip; + goffset = (ctxt->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: - c->modrm = insn_fetch(u8, 1, c->_eip); - --c->_eip; - goffset = (c->modrm >> 3) & 7; - if ((c->modrm >> 6) == 3) + ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + --ctxt->_eip; + goffset = (ctxt->modrm >> 3) & 7; + if ((ctxt->modrm >> 6) == 3) opcode = opcode.u.gdual->mod3[goffset]; else opcode = opcode.u.gdual->mod012[goffset]; break; case RMExt: - goffset = c->modrm & 7; + goffset = ctxt->modrm & 7; opcode = opcode.u.group[goffset]; break; case Prefix: - if (c->rep_prefix && op_prefix) + if (ctxt->rep_prefix && op_prefix) return X86EMUL_UNHANDLEABLE; - simd_prefix = op_prefix ? 0x66 : c->rep_prefix; + simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; switch (simd_prefix) { case 0x00: opcode = opcode.u.gprefix->pfx_no; break; case 0x66: opcode = opcode.u.gprefix->pfx_66; break; @@ -3575,61 +3471,61 @@ done_prefixes: return X86EMUL_UNHANDLEABLE; } - c->d &= ~GroupMask; - c->d |= opcode.flags; + ctxt->d &= ~GroupMask; + ctxt->d |= opcode.flags; } - c->execute = opcode.u.execute; - c->check_perm = opcode.check_perm; - c->intercept = opcode.intercept; + ctxt->execute = opcode.u.execute; + ctxt->check_perm = opcode.check_perm; + ctxt->intercept = opcode.intercept; /* Unrecognised? */ - if (c->d == 0 || (c->d & Undefined)) + if (ctxt->d == 0 || (ctxt->d & Undefined)) return -1; - if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn) + if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) return -1; - if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) - c->op_bytes = 8; + if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) + ctxt->op_bytes = 8; - if (c->d & Op3264) { + if (ctxt->d & Op3264) { if (mode == X86EMUL_MODE_PROT64) - c->op_bytes = 8; + ctxt->op_bytes = 8; else - c->op_bytes = 4; + ctxt->op_bytes = 4; } - if (c->d & Sse) - c->op_bytes = 16; + if (ctxt->d & Sse) + ctxt->op_bytes = 16; /* ModRM and SIB bytes. */ - if (c->d & ModRM) { + if (ctxt->d & ModRM) { rc = decode_modrm(ctxt, &memop); - if (!c->has_seg_override) - set_seg_override(c, c->modrm_seg); - } else if (c->d & MemAbs) + if (!ctxt->has_seg_override) + set_seg_override(ctxt, ctxt->modrm_seg); + } else if (ctxt->d & MemAbs) rc = decode_abs(ctxt, &memop); if (rc != X86EMUL_CONTINUE) goto done; - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_DS); + if (!ctxt->has_seg_override) + set_seg_override(ctxt, VCPU_SREG_DS); - memop.addr.mem.seg = seg_override(ctxt, c); + memop.addr.mem.seg = seg_override(ctxt); - if (memop.type == OP_MEM && c->ad_bytes != 8) + if (memop.type == OP_MEM && ctxt->ad_bytes != 8) memop.addr.mem.ea = (u32)memop.addr.mem.ea; /* * Decode and fetch the source operand: register, memory * or immediate. */ - switch (c->d & SrcMask) { + switch (ctxt->d & SrcMask) { case SrcNone: break; case SrcReg: - decode_register_operand(ctxt, &c->src, c, 0); + decode_register_operand(ctxt, &ctxt->src, 0); break; case SrcMem16: memop.bytes = 2; @@ -3638,60 +3534,60 @@ done_prefixes: memop.bytes = 4; goto srcmem_common; case SrcMem: - memop.bytes = (c->d & ByteOp) ? 1 : - c->op_bytes; + memop.bytes = (ctxt->d & ByteOp) ? 1 : + ctxt->op_bytes; srcmem_common: - c->src = memop; - memopp = &c->src; + ctxt->src = memop; + memopp = &ctxt->src; break; case SrcImmU16: - rc = decode_imm(ctxt, &c->src, 2, false); + rc = decode_imm(ctxt, &ctxt->src, 2, false); break; case SrcImm: - rc = decode_imm(ctxt, &c->src, imm_size(c), true); + rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true); break; case SrcImmU: - rc = decode_imm(ctxt, &c->src, imm_size(c), false); + rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false); break; case SrcImmByte: - rc = decode_imm(ctxt, &c->src, 1, true); + rc = decode_imm(ctxt, &ctxt->src, 1, true); break; case SrcImmUByte: - rc = decode_imm(ctxt, &c->src, 1, false); + rc = decode_imm(ctxt, &ctxt->src, 1, false); break; case SrcAcc: - c->src.type = OP_REG; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; - fetch_register_operand(&c->src); + ctxt->src.type = OP_REG; + ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; + fetch_register_operand(&ctxt->src); break; case SrcOne: - c->src.bytes = 1; - c->src.val = 1; + ctxt->src.bytes = 1; + ctxt->src.val = 1; break; case SrcSI: - c->src.type = OP_MEM; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.addr.mem.ea = - register_address(c, c->regs[VCPU_REGS_RSI]); - c->src.addr.mem.seg = seg_override(ctxt, c); - c->src.val = 0; + ctxt->src.type = OP_MEM; + ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + ctxt->src.addr.mem.ea = + register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); + ctxt->src.addr.mem.seg = seg_override(ctxt); + ctxt->src.val = 0; break; case SrcImmFAddr: - c->src.type = OP_IMM; - c->src.addr.mem.ea = c->_eip; - c->src.bytes = c->op_bytes + 2; - insn_fetch_arr(c->src.valptr, c->src.bytes, c->_eip); + ctxt->src.type = OP_IMM; + ctxt->src.addr.mem.ea = ctxt->_eip; + ctxt->src.bytes = ctxt->op_bytes + 2; + insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip); break; case SrcMemFAddr: - memop.bytes = c->op_bytes + 2; + memop.bytes = ctxt->op_bytes + 2; goto srcmem_common; break; case SrcDX: - c->src.type = OP_REG; - c->src.bytes = 2; - c->src.addr.reg = &c->regs[VCPU_REGS_RDX]; - fetch_register_operand(&c->src); + ctxt->src.type = OP_REG; + ctxt->src.bytes = 2; + ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + fetch_register_operand(&ctxt->src); break; } @@ -3702,22 +3598,22 @@ done_prefixes: * Decode and fetch the second source operand: register, memory * or immediate. */ - switch (c->d & Src2Mask) { + switch (ctxt->d & Src2Mask) { case Src2None: break; case Src2CL: - c->src2.bytes = 1; - c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; + ctxt->src2.bytes = 1; + ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8; break; case Src2ImmByte: - rc = decode_imm(ctxt, &c->src2, 1, true); + rc = decode_imm(ctxt, &ctxt->src2, 1, true); break; case Src2One: - c->src2.bytes = 1; - c->src2.val = 1; + ctxt->src2.bytes = 1; + ctxt->src2.val = 1; break; case Src2Imm: - rc = decode_imm(ctxt, &c->src2, imm_size(c), true); + rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true); break; } @@ -3725,68 +3621,66 @@ done_prefixes: goto done; /* Decode and fetch the destination operand: register or memory. */ - switch (c->d & DstMask) { + switch (ctxt->d & DstMask) { case DstReg: - decode_register_operand(ctxt, &c->dst, c, - c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); + decode_register_operand(ctxt, &ctxt->dst, + ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); break; case DstImmUByte: - c->dst.type = OP_IMM; - c->dst.addr.mem.ea = c->_eip; - c->dst.bytes = 1; - c->dst.val = insn_fetch(u8, 1, c->_eip); + ctxt->dst.type = OP_IMM; + ctxt->dst.addr.mem.ea = ctxt->_eip; + ctxt->dst.bytes = 1; + ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip); break; case DstMem: case DstMem64: - c->dst = memop; - memopp = &c->dst; - if ((c->d & DstMask) == DstMem64) - c->dst.bytes = 8; + ctxt->dst = memop; + memopp = &ctxt->dst; + if ((ctxt->d & DstMask) == DstMem64) + ctxt->dst.bytes = 8; else - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->d & BitOp) - fetch_bit_operand(c); - c->dst.orig_val = c->dst.val; + ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + if (ctxt->d & BitOp) + fetch_bit_operand(ctxt); + ctxt->dst.orig_val = ctxt->dst.val; break; case DstAcc: - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; - fetch_register_operand(&c->dst); - c->dst.orig_val = c->dst.val; + ctxt->dst.type = OP_REG; + ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; + fetch_register_operand(&ctxt->dst); + ctxt->dst.orig_val = ctxt->dst.val; break; case DstDI: - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.addr.mem.ea = - register_address(c, c->regs[VCPU_REGS_RDI]); - c->dst.addr.mem.seg = VCPU_SREG_ES; - c->dst.val = 0; + ctxt->dst.type = OP_MEM; + ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + ctxt->dst.addr.mem.ea = + register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); + ctxt->dst.addr.mem.seg = VCPU_SREG_ES; + ctxt->dst.val = 0; break; case DstDX: - c->dst.type = OP_REG; - c->dst.bytes = 2; - c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; - fetch_register_operand(&c->dst); + ctxt->dst.type = OP_REG; + ctxt->dst.bytes = 2; + ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + fetch_register_operand(&ctxt->dst); break; case ImplicitOps: /* Special instructions do their own operand decoding. */ default: - c->dst.type = OP_NONE; /* Disable writeback. */ + ctxt->dst.type = OP_NONE; /* Disable writeback. */ break; } done: - if (memopp && memopp->type == OP_MEM && c->rip_relative) - memopp->addr.mem.ea += c->_eip; + if (memopp && memopp->type == OP_MEM && ctxt->rip_relative) + memopp->addr.mem.ea += ctxt->_eip; return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) { - struct decode_cache *c = &ctxt->decode; - /* The second termination condition only applies for REPE * and REPNE. Test if the repeat string operation prefix is * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the @@ -3794,11 +3688,11 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) * - if REPE/REPZ and ZF = 0 then done * - if REPNE/REPNZ and ZF = 1 then done */ - if (((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) - && (((c->rep_prefix == REPE_PREFIX) && + if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) || + (ctxt->b == 0xae) || (ctxt->b == 0xaf)) + && (((ctxt->rep_prefix == REPE_PREFIX) && ((ctxt->eflags & EFLG_ZF) == 0)) - || ((c->rep_prefix == REPNE_PREFIX) && + || ((ctxt->rep_prefix == REPNE_PREFIX) && ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) return true; @@ -3809,129 +3703,128 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; u64 msr_data; - struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; - int saved_dst_type = c->dst.type; + int saved_dst_type = ctxt->dst.type; - c->mem_read.pos = 0; + ctxt->mem_read.pos = 0; - if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { + if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) { rc = emulate_ud(ctxt); goto done; } /* LOCK prefix is allowed only with some instructions */ - if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { + if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) { rc = emulate_ud(ctxt); goto done; } - if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { + if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) { rc = emulate_ud(ctxt); goto done; } - if ((c->d & Sse) + if ((ctxt->d & Sse) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { rc = emulate_ud(ctxt); goto done; } - if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { rc = emulate_nm(ctxt); goto done; } - if (unlikely(ctxt->guest_mode) && c->intercept) { - rc = emulator_check_intercept(ctxt, c->intercept, + if (unlikely(ctxt->guest_mode) && ctxt->intercept) { + rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) goto done; } /* Privileged instruction can be executed only in CPL=0 */ - if ((c->d & Priv) && ops->cpl(ctxt)) { + if ((ctxt->d & Priv) && ops->cpl(ctxt)) { rc = emulate_gp(ctxt, 0); goto done; } /* Instruction can only be executed in protected mode */ - if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { + if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { rc = emulate_ud(ctxt); goto done; } /* Do instruction specific permission checks */ - if (c->check_perm) { - rc = c->check_perm(ctxt); + if (ctxt->check_perm) { + rc = ctxt->check_perm(ctxt); if (rc != X86EMUL_CONTINUE) goto done; } - if (unlikely(ctxt->guest_mode) && c->intercept) { - rc = emulator_check_intercept(ctxt, c->intercept, + if (unlikely(ctxt->guest_mode) && ctxt->intercept) { + rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) goto done; } - if (c->rep_prefix && (c->d & String)) { + if (ctxt->rep_prefix && (ctxt->d & String)) { /* All REP prefixes have the same first termination condition */ - if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { - ctxt->eip = c->_eip; + if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) { + ctxt->eip = ctxt->_eip; goto done; } } - if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { - rc = segmented_read(ctxt, c->src.addr.mem, - c->src.valptr, c->src.bytes); + if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) { + rc = segmented_read(ctxt, ctxt->src.addr.mem, + ctxt->src.valptr, ctxt->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; - c->src.orig_val64 = c->src.val64; + ctxt->src.orig_val64 = ctxt->src.val64; } - if (c->src2.type == OP_MEM) { - rc = segmented_read(ctxt, c->src2.addr.mem, - &c->src2.val, c->src2.bytes); + if (ctxt->src2.type == OP_MEM) { + rc = segmented_read(ctxt, ctxt->src2.addr.mem, + &ctxt->src2.val, ctxt->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; } - if ((c->d & DstMask) == ImplicitOps) + if ((ctxt->d & DstMask) == ImplicitOps) goto special_insn; - if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { + if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = segmented_read(ctxt, c->dst.addr.mem, - &c->dst.val, c->dst.bytes); + rc = segmented_read(ctxt, ctxt->dst.addr.mem, + &ctxt->dst.val, ctxt->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; } - c->dst.orig_val = c->dst.val; + ctxt->dst.orig_val = ctxt->dst.val; special_insn: - if (unlikely(ctxt->guest_mode) && c->intercept) { - rc = emulator_check_intercept(ctxt, c->intercept, + if (unlikely(ctxt->guest_mode) && ctxt->intercept) { + rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) goto done; } - if (c->execute) { - rc = c->execute(ctxt); + if (ctxt->execute) { + rc = ctxt->execute(ctxt); if (rc != X86EMUL_CONTINUE) goto done; goto writeback; } - if (c->twobyte) + if (ctxt->twobyte) goto twobyte_insn; - switch (c->b) { + switch (ctxt->b) { case 0x06: /* push es */ rc = emulate_push_sreg(ctxt, VCPU_SREG_ES); break; @@ -3954,45 +3847,45 @@ special_insn: rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS); break; case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op("inc", c->dst, ctxt->eflags); + emulate_1op("inc", ctxt->dst, ctxt->eflags); break; case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op("dec", c->dst, ctxt->eflags); + emulate_1op("dec", ctxt->dst, ctxt->eflags); break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; - c->dst.val = (s32) c->src.val; + ctxt->dst.val = (s32) ctxt->src.val; break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - c->src.val = c->regs[VCPU_REGS_RDX]; + ctxt->src.val = ctxt->regs[VCPU_REGS_RDX]; goto do_io_in; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ - c->dst.val = c->regs[VCPU_REGS_RDX]; + ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX]; goto do_io_out; break; case 0x70 ... 0x7f: /* jcc (short) */ - if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, c->src.val); + if (test_cc(ctxt->b, ctxt->eflags)) + jmp_rel(ctxt, ctxt->src.val); break; case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->src.addr.mem.ea; + ctxt->dst.val = ctxt->src.addr.mem.ea; break; case 0x8f: /* pop (sole member of Grp1a) */ rc = em_grp1a(ctxt); break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ - if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) + if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) break; rc = em_xchg(ctxt); break; case 0x98: /* cbw/cwde/cdqe */ - switch (c->op_bytes) { - case 2: c->dst.val = (s8)c->dst.val; break; - case 4: c->dst.val = (s16)c->dst.val; break; - case 8: c->dst.val = (s32)c->dst.val; break; + switch (ctxt->op_bytes) { + case 2: ctxt->dst.val = (s8)ctxt->dst.val; break; + case 4: ctxt->dst.val = (s16)ctxt->dst.val; break; + case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; } break; case 0xc0 ... 0xc1: @@ -4008,7 +3901,7 @@ special_insn: rc = emulate_int(ctxt, 3); break; case 0xcd: /* int n */ - rc = emulate_int(ctxt, c->src.val); + rc = emulate_int(ctxt, ctxt->src.val); break; case 0xce: /* into */ if (ctxt->eflags & EFLG_OF) @@ -4018,7 +3911,7 @@ special_insn: rc = em_grp2(ctxt); break; case 0xd2 ... 0xd3: /* Grp2 */ - c->src.val = c->regs[VCPU_REGS_RCX]; + ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; rc = em_grp2(ctxt); break; case 0xe4: /* inb */ @@ -4028,30 +3921,30 @@ special_insn: case 0xe7: /* out */ goto do_io_out; case 0xe8: /* call (near) */ { - long int rel = c->src.val; - c->src.val = (unsigned long) c->_eip; - jmp_rel(c, rel); + long int rel = ctxt->src.val; + ctxt->src.val = (unsigned long) ctxt->_eip; + jmp_rel(ctxt, rel); rc = em_push(ctxt); break; } case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ - jmp_rel(c, c->src.val); - c->dst.type = OP_NONE; /* Disable writeback. */ + jmp_rel(ctxt, ctxt->src.val); + ctxt->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xec: /* in al,dx */ case 0xed: /* in (e/r)ax,dx */ do_io_in: - if (!pio_in_emulated(ctxt, c->dst.bytes, c->src.val, - &c->dst.val)) + if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val, + &ctxt->dst.val)) goto done; /* IO is needed */ break; case 0xee: /* out dx,al */ case 0xef: /* out dx,(e/r)ax */ do_io_out: - ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val, - &c->src.val, 1); - c->dst.type = OP_NONE; /* Disable writeback. */ + ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val, + &ctxt->src.val, 1); + ctxt->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ ctxt->ops->halt(ctxt); @@ -4097,40 +3990,40 @@ writeback: * restore dst type in case the decoding will be reused * (happens for string instruction ) */ - c->dst.type = saved_dst_type; + ctxt->dst.type = saved_dst_type; - if ((c->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override(ctxt, c), - VCPU_REGS_RSI, &c->src); + if ((ctxt->d & SrcMask) == SrcSI) + string_addr_inc(ctxt, seg_override(ctxt), + VCPU_REGS_RSI, &ctxt->src); - if ((c->d & DstMask) == DstDI) + if ((ctxt->d & DstMask) == DstDI) string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, - &c->dst); + &ctxt->dst); - if (c->rep_prefix && (c->d & String)) { - struct read_cache *r = &c->io_read; - register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + if (ctxt->rep_prefix && (ctxt->d & String)) { + struct read_cache *r = &ctxt->io_read; + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); if (!string_insn_completed(ctxt)) { /* * Re-enter guest when pio read ahead buffer is empty * or, if it is not used, after each 1024 iteration. */ - if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && + if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) && (r->end == 0 || r->end != r->pos)) { /* * Reset read cache. Usually happens before * decode, but since instruction is restarted * we have to do it here. */ - c->mem_read.end = 0; + ctxt->mem_read.end = 0; return EMULATION_RESTART; } goto done; /* skip rip writeback */ } } - ctxt->eip = c->_eip; + ctxt->eip = ctxt->_eip; done: if (rc == X86EMUL_PROPAGATE_FAULT) @@ -4141,7 +4034,7 @@ done: return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; twobyte_insn: - switch (c->b) { + switch (ctxt->b) { case 0x09: /* wbinvd */ (ctxt->ops->wbinvd)(ctxt); break; @@ -4150,21 +4043,21 @@ twobyte_insn: case 0x18: /* Grp16 (prefetch/nop) */ break; case 0x20: /* mov cr, reg */ - c->dst.val = ops->get_cr(ctxt, c->modrm_reg); + ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg); break; case 0x21: /* mov from dr to reg */ - ops->get_dr(ctxt, c->modrm_reg, &c->dst.val); + ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); break; case 0x22: /* mov reg, cr */ - if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) { + if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; } - c->dst.type = OP_NONE; + ctxt->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ - if (ops->set_dr(ctxt, c->modrm_reg, c->src.val & + if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val & ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U)) < 0) { /* #UD condition is already handled by the code above */ @@ -4173,13 +4066,13 @@ twobyte_insn: goto done; } - c->dst.type = OP_NONE; /* no writeback */ + ctxt->dst.type = OP_NONE; /* no writeback */ break; case 0x30: /* wrmsr */ - msr_data = (u32)c->regs[VCPU_REGS_RAX] - | ((u64)c->regs[VCPU_REGS_RDX] << 32); - if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { + msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] + | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); + if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -4188,27 +4081,27 @@ twobyte_insn: break; case 0x32: /* rdmsr */ - if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) { + if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; } else { - c->regs[VCPU_REGS_RAX] = (u32)msr_data; - c->regs[VCPU_REGS_RDX] = msr_data >> 32; + ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; + ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; } rc = X86EMUL_CONTINUE; break; case 0x40 ... 0x4f: /* cmov */ - c->dst.val = c->dst.orig_val = c->src.val; - if (!test_cc(c->b, ctxt->eflags)) - c->dst.type = OP_NONE; /* no writeback */ + ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; + if (!test_cc(ctxt->b, ctxt->eflags)) + ctxt->dst.type = OP_NONE; /* no writeback */ break; case 0x80 ... 0x8f: /* jnz rel, etc*/ - if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, c->src.val); + if (test_cc(ctxt->b, ctxt->eflags)) + jmp_rel(ctxt, ctxt->src.val); break; case 0x90 ... 0x9f: /* setcc r/m8 */ - c->dst.val = test_cc(c->b, ctxt->eflags); + ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; case 0xa0: /* push fs */ rc = emulate_push_sreg(ctxt, VCPU_SREG_FS); @@ -4218,14 +4111,14 @@ twobyte_insn: break; case 0xa3: bt: /* bt */ - c->dst.type = OP_NONE; + ctxt->dst.type = OP_NONE; /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); + ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags); break; case 0xa4: /* shld imm8, r, r/m */ case 0xa5: /* shld cl, r, r/m */ - emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); + emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); break; case 0xa8: /* push gs */ rc = emulate_push_sreg(ctxt, VCPU_SREG_GS); @@ -4235,11 +4128,11 @@ twobyte_insn: break; case 0xab: bts: /* bts */ - emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags); break; case 0xac: /* shrd imm8, r, r/m */ case 0xad: /* shrd cl, r, r/m */ - emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); + emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); break; case 0xae: /* clflush */ break; @@ -4248,16 +4141,16 @@ twobyte_insn: * Save real source value, then compare EAX against * destination. */ - c->src.orig_val = c->src.val; - c->src.val = c->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + ctxt->src.orig_val = ctxt->src.val; + ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; + emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); if (ctxt->eflags & EFLG_ZF) { /* Success: write back to memory. */ - c->dst.val = c->src.orig_val; + ctxt->dst.val = ctxt->src.orig_val; } else { /* Failure: write the value we saw to EAX. */ - c->dst.type = OP_REG; - c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + ctxt->dst.type = OP_REG; + ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; } break; case 0xb2: /* lss */ @@ -4265,7 +4158,7 @@ twobyte_insn: break; case 0xb3: btr: /* btr */ - emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags); break; case 0xb4: /* lfs */ rc = emulate_load_segment(ctxt, VCPU_SREG_FS); @@ -4274,12 +4167,12 @@ twobyte_insn: rc = emulate_load_segment(ctxt, VCPU_SREG_GS); break; case 0xb6 ... 0xb7: /* movzx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (u8) c->src.val - : (u16) c->src.val; + ctxt->dst.bytes = ctxt->op_bytes; + ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val + : (u16) ctxt->src.val; break; case 0xba: /* Grp8 */ - switch (c->modrm_reg & 3) { + switch (ctxt->modrm_reg & 3) { case 0: goto bt; case 1: @@ -4292,47 +4185,47 @@ twobyte_insn: break; case 0xbb: btc: /* btc */ - emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags); break; case 0xbc: { /* bsf */ u8 zf; __asm__ ("bsf %2, %0; setz %1" - : "=r"(c->dst.val), "=q"(zf) - : "r"(c->src.val)); + : "=r"(ctxt->dst.val), "=q"(zf) + : "r"(ctxt->src.val)); ctxt->eflags &= ~X86_EFLAGS_ZF; if (zf) { ctxt->eflags |= X86_EFLAGS_ZF; - c->dst.type = OP_NONE; /* Disable writeback. */ + ctxt->dst.type = OP_NONE; /* Disable writeback. */ } break; } case 0xbd: { /* bsr */ u8 zf; __asm__ ("bsr %2, %0; setz %1" - : "=r"(c->dst.val), "=q"(zf) - : "r"(c->src.val)); + : "=r"(ctxt->dst.val), "=q"(zf) + : "r"(ctxt->src.val)); ctxt->eflags &= ~X86_EFLAGS_ZF; if (zf) { ctxt->eflags |= X86_EFLAGS_ZF; - c->dst.type = OP_NONE; /* Disable writeback. */ + ctxt->dst.type = OP_NONE; /* Disable writeback. */ } break; } case 0xbe ... 0xbf: /* movsx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : - (s16) c->src.val; + ctxt->dst.bytes = ctxt->op_bytes; + ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : + (s16) ctxt->src.val; break; case 0xc0 ... 0xc1: /* xadd */ - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); /* Write back the register source. */ - c->src.val = c->dst.orig_val; - write_register_operand(&c->src); + ctxt->src.val = ctxt->dst.orig_val; + write_register_operand(&ctxt->src); break; case 0xc3: /* movnti */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : - (u64) c->src.val; + ctxt->dst.bytes = ctxt->op_bytes; + ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : + (u64) ctxt->src.val; break; case 0xc7: /* Grp9 (cmpxchg8b) */ rc = em_grp9(ctxt); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index d69e758..624f8cb 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -675,12 +675,12 @@ TRACE_EVENT(kvm_emulate_insn, ), TP_fast_assign( - __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start; + __entry->rip = vcpu->arch.emulate_ctxt.fetch.start; __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); - __entry->len = vcpu->arch.emulate_ctxt.decode._eip - - vcpu->arch.emulate_ctxt.decode.fetch.start; + __entry->len = vcpu->arch.emulate_ctxt._eip + - vcpu->arch.emulate_ctxt.fetch.start; memcpy(__entry->insn, - vcpu->arch.emulate_ctxt.decode.fetch.data, + vcpu->arch.emulate_ctxt.fetch.data, 15); __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); __entry->failed = failed; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7e452fe..694538a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4507,24 +4507,24 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } -static void init_decode_cache(struct decode_cache *c, +static void init_decode_cache(struct x86_emulate_ctxt *ctxt, const unsigned long *regs) { - memset(c, 0, offsetof(struct decode_cache, regs)); - memcpy(c->regs, regs, sizeof(c->regs)); + memset(&ctxt->twobyte, 0, + (void *)&ctxt->regs - (void *)&ctxt->twobyte); + memcpy(ctxt->regs, regs, sizeof(ctxt->regs)); - c->fetch.start = 0; - c->fetch.end = 0; - c->io_read.pos = 0; - c->io_read.end = 0; - c->mem_read.pos = 0; - c->mem_read.end = 0; + ctxt->fetch.start = 0; + ctxt->fetch.end = 0; + ctxt->io_read.pos = 0; + ctxt->io_read.end = 0; + ctxt->mem_read.pos = 0; + ctxt->mem_read.end = 0; } static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; int cs_db, cs_l; /* @@ -4546,28 +4546,27 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) X86EMUL_MODE_PROT16; ctxt->guest_mode = is_guest_mode(vcpu); - init_decode_cache(c, vcpu->arch.regs); + init_decode_cache(ctxt, vcpu->arch.regs); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); - c->op_bytes = 2; - c->ad_bytes = 2; - c->_eip = ctxt->eip + inc_eip; + ctxt->op_bytes = 2; + ctxt->ad_bytes = 2; + ctxt->_eip = ctxt->eip + inc_eip; ret = emulate_int_real(ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; - ctxt->eip = c->_eip; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + ctxt->eip = ctxt->_eip; + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); @@ -4631,7 +4630,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, { int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; bool writeback = true; kvm_clear_exception_queue(vcpu); @@ -4661,7 +4659,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, c->_eip); + kvm_rip_write(vcpu, ctxt->_eip); return EMULATE_DONE; } @@ -4669,7 +4667,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, changes registers values during IO operation */ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { vcpu->arch.emulate_regs_need_sync_from_vcpu = false; - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); } restart: @@ -4707,7 +4705,7 @@ restart: toggle_interruptibility(vcpu, ctxt->interruptibility); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); } else @@ -5718,8 +5716,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); @@ -5849,7 +5847,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); @@ -5860,7 +5857,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, if (ret) return EMULATE_FAIL; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); -- cgit v1.1 From 509c75ea198fe524adaf90ca1021487b733447ce Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Thu, 2 Jun 2011 11:54:52 +0300 Subject: KVM: nVMX: Fix bug preventing more than two levels of nesting The nested VMX feature is supposed to fully emulate VMX for the guest. This (theoretically) not only allows it to run its own guests, but also also to further emulate VMX for its own guests, and allow arbitrarily deep nesting. This patch fixes a bug (discovered by Kevin Tian) in handling a VMLAUNCH by L2, which prevented deeper nesting. Deeper nesting now works (I only actually tested L3), but is currently *absurdly* slow, to the point of being unusable. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 58badf1..f5b49c7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5694,8 +5694,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (vmx->nested.nested_run_pending) kvm_make_request(KVM_REQ_EVENT, vcpu); - if (exit_reason == EXIT_REASON_VMLAUNCH || - exit_reason == EXIT_REASON_VMRESUME) + if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || + exit_reason == EXIT_REASON_VMRESUME)) vmx->nested.nested_run_pending = 1; else vmx->nested.nested_run_pending = 0; -- cgit v1.1 From 8d9c975fc5b825cb76953a1b45a84195ffc6f4ab Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Fri, 3 Jun 2011 11:13:35 +0800 Subject: KVM: Remove SMEP bit from CR4_RESERVED_BITS This patch removes SMEP bit from CR4_RESERVED_BITS. Signed-off-by: Yang, Wei Signed-off-by: Shan, Haitao Signed-off-by: Li, Xin Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d167039..fc38eca 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -48,7 +48,7 @@ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXSAVE \ + | X86_CR4_OSXSAVE | X86_CR4_SMEP \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) -- cgit v1.1 From c68b734fba402b9bfdd49e23b776c42dbeaf1f5b Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Fri, 3 Jun 2011 11:13:42 +0800 Subject: KVM: Add SMEP support when setting CR4 This patch adds SMEP handling when setting CR4. Signed-off-by: Yang, Wei Signed-off-by: Shan, Haitao Signed-off-by: Li, Xin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 694538a..ba5cd27 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -580,6 +580,14 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } +static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMEP)); +} + static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -599,14 +607,17 @@ static void update_cpuid(struct kvm_vcpu *vcpu) int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; - + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | + X86_CR4_PAE | X86_CR4_SMEP; if (cr4 & CR4_RESERVED_BITS) return 1; if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) return 1; + if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; -- cgit v1.1 From 611c120f7486a19e7df2225f875a52ef0b599ae8 Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Fri, 3 Jun 2011 11:14:03 +0800 Subject: KVM: Mask function7 ebx against host capability word9 This patch masks CPUID leaf 7 ebx against host capability word9. Signed-off-by: Yang, Wei Signed-off-by: Shan, Haitao Signed-off-by: Li, Xin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ba5cd27..ff4623b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2359,6 +2359,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN); + /* cpuid 7.0.ebx */ + const u32 kvm_supported_word9_x86_features = + F(SMEP); + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); do_cpuid_1_ent(entry, function, index); @@ -2393,7 +2397,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } - /* function 4 and 0xb have additional index. */ + /* function 4 has additional index. */ case 4: { int i, cache_type; @@ -2410,8 +2414,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 7: { + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* Mask ebx against host capbability word 9 */ + if (index == 0) { + entry->ebx &= kvm_supported_word9_x86_features; + cpuid_mask(&entry->ebx, 9); + } else + entry->ebx = 0; + entry->eax = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } case 9: break; + /* function 0xb has additional index. */ case 0xb: { int i, level_type; -- cgit v1.1 From e57d4a356ad3ac46881399c424cc6cf6dd16359d Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Fri, 3 Jun 2011 11:14:16 +0800 Subject: KVM: Add instruction fetch checking when walking guest page table This patch adds instruction fetch checking when walking guest page table, to implement SMEP when emulating instead of executing natively. Signed-off-by: Yang, Wei Signed-off-by: Shan, Haitao Signed-off-by: Li, Xin Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9d03ad4..1caeb4d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -246,6 +246,12 @@ walk: gfn_t gfn; u32 ac; + /* check if the kernel is fetching from user page */ + if (unlikely(pte_access & PT_USER_MASK) && + kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + if (fetch_fault && !user_fault) + eperm = true; + gfn = gpte_to_gfn_lvl(pte, lvl); gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; @@ -305,7 +311,8 @@ error: walker->fault.error_code |= write_fault | user_fault; - if (fetch_fault && mmu->nx) + if (fetch_fault && (mmu->nx || + kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))) walker->fault.error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->fault.error_code |= PFERR_RSVD_MASK; -- cgit v1.1 From 02668b061db1b9f7f18872e594ac68e237db0bed Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 10 Jun 2011 11:35:30 +0200 Subject: KVM: fix XSAVE bit scanning (now properly) commit 123108f1c1aafd51d6a5c79cc04d7999dd88a930 tried to fix KVMs XSAVE valid feature scanning, but it was wrong. It was not considering the sparse nature of this bitfield, instead reading values from uninitialized members of the entries array. This patch now separates subleaf indicies from KVM's array indicies and fills the entry before querying it's value. This fixes AVX support in KVM guests. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff4623b..84f4607 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2447,16 +2447,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; } case 0xd: { - int i; + int idx, i; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - for (i = 1; *nent < maxnent && i < 64; ++i) { - if (entry[i].eax == 0 || !supported_xcr0_bit(i)) + for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) { + do_cpuid_1_ent(&entry[i], function, idx); + if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) continue; - do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; + ++i; } break; } -- cgit v1.1 From 4a00efdf0c7c93dc6f6b3ce7e2d6bd4cd1ac1651 Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Mon, 13 Jun 2011 21:52:33 +0800 Subject: KVM: Enable DRNG feature support for KVM This patch exposes DRNG feature to KVM guests. The RDRAND instruction can provide software with sequences of random numbers generated from white noise. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84f4607..6f1e54d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2345,7 +2345,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C); + F(F16C) | F(RDRAND); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | -- cgit v1.1 From d9c3476d8a99455cd3af1bd773acd77aa947a934 Mon Sep 17 00:00:00 2001 From: "Yang, Wei" Date: Tue, 14 Jun 2011 20:10:17 +0800 Subject: KVM: Remove RDWRGSFS bit from CR4_RESERVED_BITS This patch removes RDWRGSFS bit from CR4_RESERVED_BITS. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/processor-flags.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fc38eca..554be45 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -48,7 +48,7 @@ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXSAVE | X86_CR4_SMEP \ + | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 59ab4df..2dddb31 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -59,6 +59,7 @@ #define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ +#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ -- cgit v1.1 From 74dc2b4ffe3af1eac367be0178f88487cb9b240a Mon Sep 17 00:00:00 2001 From: "Yang, Wei" Date: Tue, 14 Jun 2011 20:10:18 +0800 Subject: KVM: Add RDWRGSFS support when setting CR4 This patch adds RDWRGSFS support when setting CR4. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f1e54d..423e0cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -588,6 +588,14 @@ static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_SMEP)); } +static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); +} + static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -618,6 +626,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) return 1; + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; -- cgit v1.1 From 176f61da82435eae09cc96f70b530d1ba0746b8b Mon Sep 17 00:00:00 2001 From: "Yang, Wei" Date: Tue, 14 Jun 2011 20:10:19 +0800 Subject: KVM: Expose RDWRGSFS bit to KVM guests This patch exposes RDWRGSFS bit to KVM guests. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 423e0cd..76c16a5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2372,7 +2372,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = - F(SMEP); + F(SMEP) | F(FSGSBASE); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.1 From a01c8f9b4e266df1d7166d23216f2060648f862d Mon Sep 17 00:00:00 2001 From: "Yang, Wei" Date: Tue, 14 Jun 2011 15:19:06 +0800 Subject: KVM: Enable ERMS feature support for KVM This patch exposes ERMS feature to KVM guests. The REP MOVSB/STOSB instruction can enhance fast strings attempts to move as much of the data with larger size load/stores as possible. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 76c16a5..0b803f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2372,7 +2372,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = - F(SMEP) | F(FSGSBASE); + F(SMEP) | F(FSGSBASE) | F(ERMS); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.1 From 411c588dfb863feee78b721d5e7c86ac38921c49 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 6 Jun 2011 16:11:54 +0300 Subject: KVM: MMU: Adjust shadow paging to work when SMEP=1 and CR0.WP=0 When CR0.WP=0, we sometimes map user pages as kernel pages (to allow the kernel to write to them). Unfortunately this also allows the kernel to fetch from these pages, even if CR4.SMEP is set. Adjust for this by also setting NX on the spte in these circumstances. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 554be45..da6bbee 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -205,6 +205,7 @@ union kvm_mmu_page_role { unsigned invalid:1; unsigned nxe:1; unsigned cr0_wp:1; + unsigned smep_andnot_wp:1; }; }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 15afa1e..da0f3b0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1985,8 +1985,17 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_WRITABLE_MASK; if (!vcpu->arch.mmu.direct_map - && !(pte_access & ACC_WRITE_MASK)) + && !(pte_access & ACC_WRITE_MASK)) { spte &= ~PT_USER_MASK; + /* + * If we converted a user page to a kernel page, + * so that the kernel can write to it when cr0.wp=0, + * then we should prevent the kernel from executing it + * if SMEP is enabled. + */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + spte |= PT64_NX_MASK; + } /* * Optimization: for pte sync, if spte was writable the hash @@ -2955,6 +2964,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { int r; + bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -2969,6 +2979,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + vcpu->arch.mmu.base_role.smep_andnot_wp + = smep && !is_write_protection(vcpu); return r; } -- cgit v1.1 From 45bd07b9d5202c910b31c92bd15572b560198c26 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 12 Jun 2011 18:14:08 +0300 Subject: KVM: MMU: make kvm_mmu_reset_context() flush the guest TLB kvm_set_cr0() and kvm_set_cr4(), and possible other functions, assume that kvm_mmu_reset_context() flushes the guest TLB. However, it does not. Fix by flushing the tlb (and syncing the new root as well). Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index da0f3b0..9c629b5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3054,8 +3054,18 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { + int r; + destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); + r = init_kvm_mmu(vcpu); + + if (r) + goto err; + + kvm_mmu_sync_roots(vcpu); + kvm_mmu_flush_tlb(vcpu); +err: + return r; } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); -- cgit v1.1 From f8f7e5ee1037e347eafff8f526913b92cec54873 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 21 Jun 2011 14:00:10 -0300 Subject: Revert "KVM: MMU: make kvm_mmu_reset_context() flush the guest TLB" This reverts commit bee931d31e588b8eb86b7edee32fac2d16930cd7. TLB flush should be done lazily during guest entry, in kvm_mmu_load(). Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9c629b5..da0f3b0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3054,18 +3054,8 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { - int r; - destroy_kvm_mmu(vcpu); - r = init_kvm_mmu(vcpu); - - if (r) - goto err; - - kvm_mmu_sync_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); -err: - return r; + return init_kvm_mmu(vcpu); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); -- cgit v1.1 From 134291bf3cb434a9039298ba6b15ef33e65ba542 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 1 Jul 2011 01:34:56 +0900 Subject: KVM: MMU: Clean up the error handling of walk_addr_generic() Avoid two step jump to the error handling part. This eliminates the use of the variables present and rsvd_fault. We also use the const type qualifier to show that write/user/fetch_fault do not change in the function. Both of these were suggested by Ingo Molnar. Cc: Ingo Molnar Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 82 ++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1caeb4d..f0746d2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -125,18 +125,17 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; - bool eperm, present, rsvd_fault; - int offset, write_fault, user_fault, fetch_fault; - - write_fault = access & PFERR_WRITE_MASK; - user_fault = access & PFERR_USER_MASK; - fetch_fault = access & PFERR_FETCH_MASK; + bool eperm; + int offset; + const int write_fault = access & PFERR_WRITE_MASK; + const int user_fault = access & PFERR_USER_MASK; + const int fetch_fault = access & PFERR_FETCH_MASK; + u16 errcode = 0; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); walk: - present = true; - eperm = rsvd_fault = false; + eperm = false; walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); @@ -144,10 +143,8 @@ walk: if (walker->level == PT32E_ROOT_LEVEL) { pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) { - present = false; + if (!is_present_gpte(pte)) goto error; - } --walker->level; } #endif @@ -170,35 +167,27 @@ walk: real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), PFERR_USER_MASK|PFERR_WRITE_MASK); - if (unlikely(real_gfn == UNMAPPED_GVA)) { - present = false; - break; - } + if (unlikely(real_gfn == UNMAPPED_GVA)) + goto error; real_gfn = gpa_to_gfn(real_gfn); host_addr = gfn_to_hva(vcpu->kvm, real_gfn); - if (unlikely(kvm_is_error_hva(host_addr))) { - present = false; - break; - } + if (unlikely(kvm_is_error_hva(host_addr))) + goto error; ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) { - present = false; - break; - } + if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) + goto error; trace_kvm_mmu_paging_element(pte, walker->level); - if (unlikely(!is_present_gpte(pte))) { - present = false; - break; - } + if (unlikely(!is_present_gpte(pte))) + goto error; if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level))) { - rsvd_fault = true; - break; + errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; + goto error; } if (unlikely(write_fault && !is_writable_pte(pte) @@ -213,17 +202,15 @@ walk: eperm = true; #endif - if (!eperm && !rsvd_fault - && unlikely(!(pte & PT_ACCESSED_MASK))) { + if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { int ret; trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, pte, pte|PT_ACCESSED_MASK); - if (unlikely(ret < 0)) { - present = false; - break; - } else if (ret) + if (unlikely(ret < 0)) + goto error; + else if (ret) goto walk; mark_page_dirty(vcpu->kvm, table_gfn); @@ -276,8 +263,10 @@ walk: --walker->level; } - if (unlikely(!present || eperm || rsvd_fault)) + if (unlikely(eperm)) { + errcode |= PFERR_PRESENT_MASK; goto error; + } if (write_fault && unlikely(!is_dirty_gpte(pte))) { int ret; @@ -285,10 +274,9 @@ walk: trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, pte, pte|PT_DIRTY_MASK); - if (unlikely(ret < 0)) { - present = false; + if (unlikely(ret < 0)) goto error; - } else if (ret) + else if (ret) goto walk; mark_page_dirty(vcpu->kvm, table_gfn); @@ -303,20 +291,14 @@ walk: return 1; error: - walker->fault.vector = PF_VECTOR; - walker->fault.error_code_valid = true; - walker->fault.error_code = 0; - if (present) - walker->fault.error_code |= PFERR_PRESENT_MASK; - - walker->fault.error_code |= write_fault | user_fault; - + errcode |= write_fault | user_fault; if (fetch_fault && (mmu->nx || kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))) - walker->fault.error_code |= PFERR_FETCH_MASK; - if (rsvd_fault) - walker->fault.error_code |= PFERR_RSVD_MASK; + errcode |= PFERR_FETCH_MASK; + walker->fault.vector = PF_VECTOR; + walker->fault.error_code_valid = true; + walker->fault.error_code = errcode; walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; -- cgit v1.1 From 92c1c1e85bdd72b41f9fd39b9d43d4b3c146d02d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 1 Jul 2011 01:36:07 +0900 Subject: KVM: MMU: Rename the walk label in walk_addr_generic() The current name does not explain the meaning well. So give it a better name "retry_walk" to show that we are trying the walk again. This was suggested by Ingo Molnar. Cc: Ingo Molnar Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f0746d2..9c0afba 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -134,7 +134,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); -walk: +retry_walk: eperm = false; walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); @@ -211,7 +211,7 @@ walk: if (unlikely(ret < 0)) goto error; else if (ret) - goto walk; + goto retry_walk; mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_ACCESSED_MASK; @@ -277,7 +277,7 @@ walk: if (unlikely(ret < 0)) goto error; else if (ret) - goto walk; + goto retry_walk; mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_DIRTY_MASK; -- cgit v1.1 From 3c8c652ae4c984a950d0672597085db866509914 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 1 Jul 2011 01:37:24 +0900 Subject: KVM: MMU: Introduce is_last_gpte() to clean up walk_addr_generic() Suggested by Ingo and Avi. Cc: Ingo Molnar Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9c0afba..1e1c244 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -113,6 +113,24 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) return access; } +static bool FNAME(is_last_gpte)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + pt_element_t gpte) +{ + if (walker->level == PT_PAGE_TABLE_LEVEL) + return true; + + if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && + (PTTYPE == 64 || is_pse(vcpu))) + return true; + + if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && + (mmu->root_level == PT64_ROOT_LEVEL)) + return true; + + return false; +} + /* * Fetch a guest pte for a guest virtual address */ @@ -221,13 +239,7 @@ retry_walk: walker->ptes[walker->level - 1] = pte; - if ((walker->level == PT_PAGE_TABLE_LEVEL) || - ((walker->level == PT_DIRECTORY_LEVEL) && - is_large_pte(pte) && - (PTTYPE == 64 || is_pse(vcpu))) || - ((walker->level == PT_PDPE_LEVEL) && - is_large_pte(pte) && - mmu->root_level == PT64_ROOT_LEVEL)) { + if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) { int lvl = walker->level; gpa_t real_gpa; gfn_t gfn; -- cgit v1.1 From 4b6b35f55ca81d3bfdec63b0adb61798702ceb2e Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:12 -0400 Subject: KVM: Add constant to represent KVM MSRs enabled bit in guest/host interface This patch is simple, put in a different commit so it can be more easily shared between guest and hypervisor. It just defines a named constant to indicate the enable bit for KVM-specific MSRs. Signed-off-by: Glauber Costa Acked-by: Rik van Riel Tested-by: Eric B Munson CC: Jeremy Fitzhardinge CC: Peter Zijlstra CC: Anthony Liguori Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_para.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index a427bf7..d6cd79b 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -30,6 +30,7 @@ #define MSR_KVM_WALL_CLOCK 0x11 #define MSR_KVM_SYSTEM_TIME 0x12 +#define KVM_MSR_ENABLED 1 /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 -- cgit v1.1 From 9ddabbe72e41ca6794cb4947c70929c9410e6752 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:13 -0400 Subject: KVM: KVM Steal time guest/host interface To implement steal time, we need the hypervisor to pass the guest information about how much time was spent running other processes outside the VM. This is per-vcpu, and using the kvmclock structure for that is an abuse we decided not to make. In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that holds the memory area address containing information about steal time This patch contains the headers for it. I am keeping it separate to facilitate backports to people who wants to backport the kernel part but not the hypervisor, or the other way around. Signed-off-by: Glauber Costa Acked-by: Rik van Riel Tested-by: Eric B Munson CC: Jeremy Fitzhardinge CC: Peter Zijlstra CC: Anthony Liguori Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_para.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index d6cd79b..65f8bb9 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -21,6 +21,7 @@ */ #define KVM_FEATURE_CLOCKSOURCE2 3 #define KVM_FEATURE_ASYNC_PF 4 +#define KVM_FEATURE_STEAL_TIME 5 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. @@ -35,6 +36,14 @@ #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 +#define MSR_KVM_STEAL_TIME 0x4b564d03 + +struct kvm_steal_time { + __u64 steal; + __u32 version; + __u32 flags; + __u32 pad[12]; +}; #define KVM_MAX_MMU_OP_BATCH 32 -- cgit v1.1 From 688d3be815b1563b1484ce67702249a4a7a6314e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 12 Jul 2011 13:35:57 +0200 Subject: percpu: Fixup __this_cpu_xchg* operations Somehow we got into a situation where the __this_cpu_xchg() operations were not defined in the same way as this_cpu_xchg() and friends. I had some build failures under 32 bit that were addressed by these fixes. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/include/asm/percpu.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index a0a9779..3470c9d 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -388,12 +388,9 @@ do { \ #define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) #define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) #define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) -/* - * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much - * faster than an xchg with forced lock semantics. - */ -#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define __this_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) +#define __this_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) +#define __this_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) #define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -485,6 +482,8 @@ do { \ #define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) #define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) #define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) +#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -- cgit v1.1 From 25970852280c9d5fb2de899769880d3e97332baa Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Tue, 12 Jul 2011 05:59:07 +0000 Subject: x86, x2apic: Preserve high 32-bits of IA32_APIC_BASE MSR If there's no special reason to zero-out the "high" 32-bits of the IA32_APIC_BASE MSR, let's preserve it. The x2APIC Specification doesn't explicitly state any such requirement. (Sec 2.2 in: http://www.intel.com/Assets/PDF/manual/318148.pdf). Signed-off-by: Naga Chumbalkar Link: http://lkml.kernel.org/r/20110712055831.2498.78521.sendpatchset@nchumbalkar.americas.cpqcorp.net Reviewed-by: Cyrill Gorcunov Reviewed-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b9338b8..f7b0c7a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1429,7 +1429,7 @@ void enable_x2apic(void) rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { printk_once(KERN_INFO "Enabling x2apic\n"); - wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); + wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2); } } #endif /* CONFIG_X86_X2APIC */ -- cgit v1.1 From 3040db92ee1b6c5b6b6d73f8cdcad54c0da11563 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Tue, 12 Jul 2011 21:17:41 +0000 Subject: x86, ioapic: Print IRTE when IR is enabled When "apic=debug" is used as a boot parameter, Linux prints the IOAPIC routing entries in "dmesg". Below is output from IOAPIC whose apic_id is 8: # dmesg | grep "routing entry" IOAPIC[8]: Set routing entry (8-1 -> 0x31 -> IRQ 1 Mode:0 Active:0 Dest:0) IOAPIC[8]: Set routing entry (8-2 -> 0x30 -> IRQ 0 Mode:0 Active:0 Dest:0) IOAPIC[8]: Set routing entry (8-3 -> 0x33 -> IRQ 3 Mode:0 Active:0 Dest:0) ... Similarly, when IR (interrupt remapping) is enabled, and the IRTE (interrupt remapping table entry) is set up we should display it. After the fix: # dmesg | grep IRTE IOAPIC[8]: Set IRTE entry (P:1 FPD:0 Dst_Mode:0 Redir_hint:1 Trig_Mode:0 Dlvry_Mode:0 Avail:0 Vector:31 Dest:00000000 SID:00F1 SQ:0 SVT:1) IOAPIC[8]: Set IRTE entry (P:1 FPD:0 Dst_Mode:0 Redir_hint:1 Trig_Mode:0 Dlvry_Mode:0 Avail:0 Vector:30 Dest:00000000 SID:00F1 SQ:0 SVT:1) IOAPIC[8]: Set IRTE entry (P:1 FPD:0 Dst_Mode:0 Redir_hint:1 Trig_Mode:0 Dlvry_Mode:0 Avail:0 Vector:33 Dest:00000000 SID:00F1 SQ:0 SVT:1) ... The IRTE is defined in Sec 9.5 of the Intel VT-d Specification. Signed-off-by: Naga Chumbalkar Link: http://lkml.kernel.org/r/20110712211704.2939.71291.sendpatchset@nchumbalkar.americas.cpqcorp.net Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5cba200..1ed5cbb 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1295,6 +1295,16 @@ static int setup_ioapic_entry(int apic_id, int irq, * irq handler will do the explicit EOI to the io-apic. */ ir_entry->vector = pin; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " + "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " + "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " + "Avail:%X Vector:%02X Dest:%08X " + "SID:%04X SQ:%X SVT:%X)\n", + apic_id, irte.present, irte.fpd, irte.dst_mode, + irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, + irte.avail, irte.vector, irte.dest_id, + irte.sid, irte.sq, irte.svt); } else { entry->delivery_mode = apic->irq_delivery_mode; entry->dest_mode = apic->irq_dest_mode; -- cgit v1.1 From 42f0efc5aae2bd7e3bc420c0902c7024ef77391f Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Tue, 12 Jul 2011 21:17:35 +0000 Subject: x86, ioapic: Print IR_IO_APIC_route_entry when IR is enabled When IR (interrupt remapping) is enabled print_IO_APIC() displays output according to legacy RTE (redirection table entry) definitons: NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect: 00 00 1 0 0 0 0 0 0 00 01 00 0 0 0 0 0 0 0 01 02 00 0 0 0 0 0 0 0 02 03 00 1 0 0 0 0 0 0 03 04 00 1 0 0 0 0 0 0 04 05 00 1 0 0 0 0 0 0 05 06 00 1 0 0 0 0 0 0 06 ... The above output is as per Sec 3.2.4 of the IOAPIC datasheet: 82093AA I/O Advanced Programmable Interrupt Controller (IOAPIC): http://download.intel.com/design/chipsets/datashts/29056601.pdf Instead the output should display the fields as discussed in Sec 5.5.1 of the VT-d specification: (Intel Virtualization Technology for Directed I/O: http://download.intel.com/technology/computing/vptech/Intel(r)_VT_for_Direct_IO.pdf) After the fix: NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect: 00 0000 0 1 0 0 0 0 0 0 00 01 000F 1 0 0 0 0 0 0 0 01 02 0001 1 0 0 0 0 0 0 0 02 03 0002 1 1 0 0 0 0 0 0 03 04 0011 1 1 0 0 0 0 0 0 04 05 0004 1 1 0 0 0 0 0 0 05 06 0005 1 1 0 0 0 0 0 0 06 ... Signed-off-by: Naga Chumbalkar Link: http://lkml.kernel.org/r/20110712211658.2939.93123.sendpatchset@nchumbalkar.americas.cpqcorp.net Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 69 ++++++++++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1ed5cbb..8eb863e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1562,31 +1562,60 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG ".... IRQ redirection table:\n"); - printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect:\n"); + if (intr_remapping_enabled) { + printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR" + " Pol Stat Indx2 Zero Vect:\n"); + } else { + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect:\n"); + } for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - printk(KERN_DEBUG " %02x %02X ", - i, - entry.dest - ); + if (intr_remapping_enabled) { + struct IO_APIC_route_entry entry; + struct IR_IO_APIC_route_entry *ir_entry; + + entry = ioapic_read_entry(apic, i); + ir_entry = (struct IR_IO_APIC_route_entry *) &entry; + printk(KERN_DEBUG " %02x %04X ", + i, + ir_entry->index + ); + printk("%1d %1d %1d %1d %1d " + "%1d %1d %X %02X\n", + ir_entry->format, + ir_entry->mask, + ir_entry->trigger, + ir_entry->irr, + ir_entry->polarity, + ir_entry->delivery_status, + ir_entry->index2, + ir_entry->zero, + ir_entry->vector + ); + } else { + struct IO_APIC_route_entry entry; - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); + entry = ioapic_read_entry(apic, i); + printk(KERN_DEBUG " %02x %02X ", + i, + entry.dest + ); + printk("%1d %1d %1d %1d %1d " + "%1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } } } + printk(KERN_DEBUG "IRQ to pin mappings:\n"); for_each_active_irq(irq) { struct irq_pin_list *entry; -- cgit v1.1 From 3628c3f5c818cfc6e588d1ccb31f19aa12345c02 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 28 Jun 2011 15:57:31 +0200 Subject: x86. reboot: Make Dell Latitude E6320 use reboot=pci The Dell Latitude E6320 doesn't reboot unless reboot=pci is set. Force it thanks to DMI. Signed-off-by: Maxime Ripard Link: http://lkml.kernel.org/r/1309269451-4966-1-git-send-email-maxime.ripard@free-electrons.com Cc: Matthew Garrett Signed-off-by: H. Peter Anvin --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 4f0d46f..14eed21 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -419,6 +419,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), }, }, + { /* Handle problems with rebooting on the Latitude E6320. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6320", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), + }, + }, { } }; -- cgit v1.1 From c9712944b2a12373cb6ff8059afcfb7e826a6c54 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 13 Jul 2011 09:24:09 -0400 Subject: x86-64: Improve vsyscall emulation CS and RIP handling Three fixes here: - Send SIGSEGV if called from compat code or with a funny CS. - Don't BUG on impossible addresses. - Add a missing local_irq_disable. This patch also removes an unused variable. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/6fb2b13ab39b743d1e4f466eef13425854912f7f.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vsyscall.h | 12 -------- arch/x86/kernel/vsyscall_64.c | 61 +++++++++++++++++++++++++++-------------- 2 files changed, 41 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index bb710cb..d555973 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -31,18 +31,6 @@ extern struct timezone sys_tz; extern void map_vsyscall(void); -/* Emulation */ - -static inline bool is_vsyscall_entry(unsigned long addr) -{ - return (addr & ~0xC00UL) == VSYSCALL_START; -} - -static inline int vsyscall_entry_nr(unsigned long addr) -{ - return (addr & 0xC00UL) >> 10; -} - #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 10cd8ac..a262400 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -97,33 +98,63 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, tsk = current; - printk("%s%s[%d] %s ip:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", level, tsk->comm, task_pid_nr(tsk), - message, regs->ip - 2, regs->sp, regs->ax, regs->si, regs->di); + message, regs->ip - 2, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); +} + +static int addr_to_vsyscall_nr(unsigned long addr) +{ + int nr; + + if ((addr & ~0xC00UL) != VSYSCALL_START) + return -EINVAL; + + nr = (addr & 0xC00UL) >> 10; + if (nr >= 3) + return -EINVAL; + + return nr; } void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) { - const char *vsyscall_name; struct task_struct *tsk; unsigned long caller; int vsyscall_nr; long ret; - /* Kernel code must never get here. */ - BUG_ON(!user_mode(regs)); - local_irq_enable(); /* + * Real 64-bit user mode code has cs == __USER_CS. Anything else + * is bogus. + */ + if (regs->cs != __USER_CS) { + /* + * If we trapped from kernel mode, we might as well OOPS now + * instead of returning to some random address and OOPSing + * then. + */ + BUG_ON(!user_mode(regs)); + + /* Compat mode and non-compat 32-bit CS should both segfault. */ + warn_bad_vsyscall(KERN_WARNING, regs, + "illegal int 0xcc from 32-bit mode"); + goto sigsegv; + } + + /* * x86-ism here: regs->ip points to the instruction after the int 0xcc, * and int 0xcc is two bytes long. */ - if (!is_vsyscall_entry(regs->ip - 2)) { - warn_bad_vsyscall(KERN_WARNING, regs, "illegal int 0xcc (exploit attempt?)"); + vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); + if (vsyscall_nr < 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "illegal int 0xcc (exploit attempt?)"); goto sigsegv; } - vsyscall_nr = vsyscall_entry_nr(regs->ip - 2); if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); @@ -136,31 +167,20 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) switch (vsyscall_nr) { case 0: - vsyscall_name = "gettimeofday"; ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: - vsyscall_name = "time"; ret = sys_time((time_t __user *)regs->di); break; case 2: - vsyscall_name = "getcpu"; ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, 0); break; - - default: - /* - * If we get here, then vsyscall_nr indicates that int 0xcc - * happened at an address in the vsyscall page that doesn't - * contain int 0xcc. That can't happen. - */ - BUG(); } if (ret == -EFAULT) { @@ -188,6 +208,7 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) sigsegv: regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */ force_sig(SIGSEGV, current); + local_irq_disable(); } /* -- cgit v1.1 From 59e97e4d6fbcd5b74a94cb48bcbfc6f8478a5e93 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 13 Jul 2011 09:24:10 -0400 Subject: x86: Make alternative instruction pointers relative This save a few bytes on x86-64 and means that future patches can apply alternatives to unrelocated code. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/ff64a6b9a1a3860ca4a7b8b6dc7b4754f9491cd7.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative-asm.h | 4 ++-- arch/x86/include/asm/alternative.h | 8 ++++---- arch/x86/include/asm/cpufeature.h | 8 ++++---- arch/x86/kernel/alternative.c | 21 +++++++++++++-------- arch/x86/lib/copy_page_64.S | 9 +++------ arch/x86/lib/memmove_64.S | 11 +++++------ 6 files changed, 31 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 94d420b..4554cc6 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -17,8 +17,8 @@ .macro altinstruction_entry orig alt feature orig_len alt_len .align 8 - .quad \orig - .quad \alt + .long \orig - . + .long \alt - . .word \feature .byte \orig_len .byte \alt_len diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index bf535f9..23fb6d7 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -43,8 +43,8 @@ #endif struct alt_instr { - u8 *instr; /* original instruction */ - u8 *replacement; + s32 instr_offset; /* original instruction */ + s32 repl_offset; /* offset to replacement instruction */ u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction, <= instrlen */ @@ -84,8 +84,8 @@ static inline int alternatives_text_reserved(void *start, void *end) "661:\n\t" oldinstr "\n662:\n" \ ".section .altinstructions,\"a\"\n" \ _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ + " .long 661b - .\n" /* label */ \ + " .long 663f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ " .byte 662b-661b\n" /* sourcelen */ \ " .byte 664f-663f\n" /* replacementlen */ \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 71cc380..9929b35 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -331,8 +331,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) "2:\n" ".section .altinstructions,\"a\"\n" _ASM_ALIGN "\n" - _ASM_PTR "1b\n" - _ASM_PTR "0\n" /* no replacement */ + " .long 1b - .\n" + " .long 0\n" /* no replacement */ " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ @@ -349,8 +349,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) "2:\n" ".section .altinstructions,\"a\"\n" _ASM_ALIGN "\n" - _ASM_PTR "1b\n" - _ASM_PTR "3f\n" + " .long 1b - .\n" + " .long 3f - .\n" " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a81f2d5..ddb207b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -263,6 +263,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; + u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); @@ -276,25 +277,29 @@ void __init_or_module apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { - u8 *instr = a->instr; + instr = (u8 *)&a->instr_offset + a->instr_offset; + replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->cpuid >= NCAPINTS*32); if (!boot_cpu_has(a->cpuid)) continue; + + memcpy(insnbuf, replacement, a->replacementlen); + + /* 0xe8 is a relative jump; fix the offset. */ + if (*insnbuf == 0xe8 && a->replacementlen == 5) + *(s32 *)(insnbuf + 1) += replacement - instr; + + add_nops(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + #ifdef CONFIG_X86_64 /* vsyscall code is not mapped yet. resolve it manually. */ if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); - DPRINTK("%s: vsyscall fixup: %p => %p\n", - __func__, a->instr, instr); } #endif - memcpy(insnbuf, a->replacement, a->replacementlen); - if (*insnbuf == 0xe8 && a->replacementlen == 5) - *(s32 *)(insnbuf + 1) += a->replacement - a->instr; - add_nops(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); text_poke_early(instr, insnbuf, a->instrlen); } } diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 6fec2d1..01c805b 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -2,6 +2,7 @@ #include #include +#include ALIGN copy_page_c: @@ -110,10 +111,6 @@ ENDPROC(copy_page) 2: .previous .section .altinstructions,"a" - .align 8 - .quad copy_page - .quad 1b - .word X86_FEATURE_REP_GOOD - .byte .Lcopy_page_end - copy_page - .byte 2b - 1b + altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ + .Lcopy_page_end-copy_page, 2b-1b .previous diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index d0ec9c2..ee16461 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -9,6 +9,7 @@ #include #include #include +#include #undef memmove @@ -214,11 +215,9 @@ ENTRY(memmove) .previous .section .altinstructions,"a" - .align 8 - .quad .Lmemmove_begin_forward - .quad .Lmemmove_begin_forward_efs - .word X86_FEATURE_ERMS - .byte .Lmemmove_end_forward-.Lmemmove_begin_forward - .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs + altinstruction_entry .Lmemmove_begin_forward, \ + .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ + .Lmemmove_end_forward-.Lmemmove_begin_forward, \ + .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs .previous ENDPROC(memmove) -- cgit v1.1 From 1b3f2a72bbcfdf92e368a44448c45eb639b05b5e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 13 Jul 2011 09:24:11 -0400 Subject: x86-64: Allow alternative patching in the vDSO This code is short enough and different enough from the module loader that it's not worth trying to share anything. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/e73112e4381fff29e31b882c2d0856822edaea53.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vma.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 7abd2be..c39938d 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -23,11 +23,44 @@ extern unsigned short vdso_sync_cpuid; static struct page **vdso_pages; static unsigned vdso_size; +static void __init patch_vdso(void *vdso, size_t len) +{ + Elf64_Ehdr *hdr = vdso; + Elf64_Shdr *sechdrs, *alt_sec = 0; + char *secstrings; + void *alt_data; + int i; + + BUG_ON(len < sizeof(Elf64_Ehdr)); + BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0); + + sechdrs = (void *)hdr + hdr->e_shoff; + secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (i = 1; i < hdr->e_shnum; i++) { + Elf64_Shdr *shdr = &sechdrs[i]; + if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) { + alt_sec = shdr; + goto found; + } + } + + /* If we get here, it's probably a bug. */ + pr_warning("patch_vdso: .altinstructions not found\n"); + return; /* nothing to patch */ + +found: + alt_data = (void *)hdr + alt_sec->sh_offset; + apply_alternatives(alt_data, alt_data + alt_sec->sh_size); +} + static int __init init_vdso_vars(void) { int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; int i; + patch_vdso(vdso_start, vdso_end - vdso_start); + vdso_size = npages << PAGE_SHIFT; vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); if (!vdso_pages) -- cgit v1.1 From 7f79ad15f33cf4968cafb0e3d2beba427de01d3a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 13 Jul 2011 09:24:12 -0400 Subject: x86-64: Add --no-undefined to vDSO build This gives much nicer diagnostics when something goes wrong. It's supported at least as far back as binutils 2.15. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/de0b50920469ff6359c529526e7639fdd36fa83c.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/vdso/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index bef0bc9..5d17950 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -26,6 +26,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) export CPPFLAGS_vdso.lds += -P -C VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ + -Wl,--no-undefined \ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so -- cgit v1.1 From 433bd805e5fd2c731b3a9025b034f066272d336e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 13 Jul 2011 09:24:13 -0400 Subject: clocksource: Replace vread with generic arch data The vread field was bloating struct clocksource everywhere except x86_64, and I want to change the way this works on x86_64, so let's split it out into per-arch data. Cc: x86@kernel.org Cc: Clemens Ladisch Cc: linux-ia64@vger.kernel.org Cc: Tony Luck Cc: Fenghua Yu Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/3ae5ec76a168eaaae63f08a2a1060b91aa0b7759.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/clocksource.h | 16 ++++++++++++++++ arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/tsc.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- 4 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 arch/x86/include/asm/clocksource.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h new file mode 100644 index 0000000..a5df33f --- /dev/null +++ b/arch/x86/include/asm/clocksource.h @@ -0,0 +1,16 @@ +/* x86-specific clocksource additions */ + +#ifndef _ASM_X86_CLOCKSOURCE_H +#define _ASM_X86_CLOCKSOURCE_H + +#ifdef CONFIG_X86_64 + +#define __ARCH_HAS_CLOCKSOURCE_DATA + +struct arch_clocksource_data { + cycle_t (*vread)(void); +}; + +#endif /* CONFIG_X86_64 */ + +#endif /* _ASM_X86_CLOCKSOURCE_H */ diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index e9f5605..0e07257bb 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -753,7 +753,7 @@ static struct clocksource clocksource_hpet = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 - .vread = vread_hpet, + .archdata = { .vread = vread_hpet }, #endif }; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6cc6922..e7a74b8 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -777,7 +777,7 @@ static struct clocksource clocksource_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, #ifdef CONFIG_X86_64 - .vread = vread_tsc, + .archdata = { .vread = vread_tsc }, #endif }; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index a262400..12d488f 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -74,7 +74,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); /* copy vsyscall data */ - vsyscall_gtod_data.clock.vread = clock->vread; + vsyscall_gtod_data.clock.vread = clock->archdata.vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = mult; -- cgit v1.1 From c9aaa8957f203bd6df83b002fb40b98390bed078 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:14 -0400 Subject: KVM: Steal time implementation To implement steal time, we need the hypervisor to pass the guest information about how much time was spent running other processes outside the VM, while the vcpu had meaningful work to do - halt time does not count. This information is acquired through the run_delay field of delayacct/schedstats infrastructure, that counts time spent in a runqueue but not running. Steal time is a per-cpu information, so the traditional MSR-based infrastructure is used. A new msr, KVM_MSR_STEAL_TIME, holds the memory area address containing information about steal time This patch contains the hypervisor part of the steal time infrasructure, and can be backported independently of the guest portion. [avi, yongjie: export delayacct_on, to avoid build failures in some configs] Signed-off-by: Glauber Costa Tested-by: Eric B Munson CC: Rik van Riel CC: Jeremy Fitzhardinge CC: Peter Zijlstra CC: Anthony Liguori Signed-off-by: Yongjie Ren Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 9 +++++ arch/x86/include/asm/kvm_para.h | 4 +++ arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/x86.c | 74 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 86 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da6bbee..59086a7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -389,6 +389,15 @@ struct kvm_vcpu_arch { unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; + + struct { + u64 msr_val; + u64 last_steal; + u64 accum_steal; + struct gfn_to_hva_cache stime; + struct kvm_steal_time steal; + } st; + u64 last_guest_tsc; u64 last_kernel_ns; u64 last_tsc_nsec; diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 65f8bb9..c484ba8 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -45,6 +45,10 @@ struct kvm_steal_time { __u32 pad[12]; }; +#define KVM_STEAL_ALIGNMENT_BITS 5 +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) + #define KVM_MAX_MMU_OP_BATCH 32 #define KVM_ASYNC_PF_ENABLED (1 << 0) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 50f6364..99c3f05 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -31,6 +31,7 @@ config KVM select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO + select TASK_DELAY_ACCT ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b803f0..c96cdc0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -808,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 8 +#define KVM_SAVE_MSRS_BEGIN 9 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1488,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) } } +static void accumulate_steal_time(struct kvm_vcpu *vcpu) +{ + u64 delta; + + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; + vcpu->arch.st.last_steal = current->sched_info.run_delay; + vcpu->arch.st.accum_steal = delta; +} + +static void record_steal_time(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) + return; + + vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; + vcpu->arch.st.steal.version += 2; + vcpu->arch.st.accum_steal = 0; + + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1570,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; + case MSR_KVM_STEAL_TIME: + + if (unlikely(!sched_info_on())) + return 1; + + if (data & KVM_STEAL_RESERVED_MASK) + return 1; + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, + data & KVM_STEAL_VALID_BITS)) + return 1; + + vcpu->arch.st.msr_val = data; + + if (!(data & KVM_MSR_ENABLED)) + break; + + vcpu->arch.st.last_steal = current->sched_info.run_delay; + + preempt_disable(); + accumulate_steal_time(vcpu); + preempt_enable(); + + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + + break; + case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: @@ -1855,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_ASYNC_PF_EN: data = vcpu->arch.apf.msr_val; break; + case MSR_KVM_STEAL_TIME: + data = vcpu->arch.st.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -2166,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; } + + accumulate_steal_time(vcpu); + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -2487,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + + if (sched_info_on()) + entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); + entry->ebx = 0; entry->ecx = 0; entry->edx = 0; @@ -5470,6 +5536,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 1; goto out; } + if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) + record_steal_time(vcpu); + } r = kvm_mmu_reload(vcpu); @@ -6206,6 +6275,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; + vcpu->arch.st.msr_val = 0; kvmclock_reset(vcpu); -- cgit v1.1 From 3c404b578fab699c4708279938078d9404b255a4 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:15 -0400 Subject: KVM guest: Add a pv_ops stub for steal time This patch adds a function pointer in one of the many paravirt_ops structs, to allow guests to register a steal time function. Besides a steal time function, we also declare two jump_labels. They will be used to allow the steal time code to be easily bypassed when not in use. Signed-off-by: Glauber Costa Acked-by: Rik van Riel Tested-by: Eric B Munson CC: Jeremy Fitzhardinge CC: Peter Zijlstra CC: Anthony Liguori Signed-off-by: Avi Kivity --- arch/x86/include/asm/paravirt.h | 9 +++++++++ arch/x86/include/asm/paravirt_types.h | 1 + arch/x86/kernel/paravirt.c | 9 +++++++++ 3 files changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ebbc4d8..a7d2db9 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -230,6 +230,15 @@ static inline unsigned long long paravirt_sched_clock(void) return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } +struct jump_label_key; +extern struct jump_label_key paravirt_steal_enabled; +extern struct jump_label_key paravirt_steal_rq_enabled; + +static inline u64 paravirt_steal_clock(int cpu) +{ + return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu); +} + static inline unsigned long long paravirt_read_pmc(int counter) { return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8288509..2c76521 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -89,6 +89,7 @@ struct pv_lazy_ops { struct pv_time_ops { unsigned long long (*sched_clock)(void); + unsigned long long (*steal_clock)(int cpu); unsigned long (*get_tsc_khz)(void); }; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 869e1ae..613a793 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr) __native_flush_tlb_single(addr); } +struct jump_label_key paravirt_steal_enabled; +struct jump_label_key paravirt_steal_rq_enabled; + +static u64 native_steal_clock(int cpu) +{ + return 0; +} + /* These are in entry.S */ extern void native_iret(void); extern void native_irq_enable_sysexit(void); @@ -307,6 +315,7 @@ struct pv_init_ops pv_init_ops = { struct pv_time_ops pv_time_ops = { .sched_clock = native_sched_clock, + .steal_clock = native_steal_clock, }; struct pv_irq_ops pv_irq_ops = { -- cgit v1.1 From 095c0aa83e52d6c3dd7168610746703921f570af Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:18 -0400 Subject: sched: adjust scheduler cpu power for stolen time This patch makes update_rq_clock() aware of steal time. The mechanism of operation is not different from irq_time, and follows the same principles. This lives in a CONFIG option itself, and can be compiled out independently of the rest of steal time reporting. The effect of disabling it is that the scheduler will still report steal time (that cannot be disabled), but won't use this information for cpu power adjustments. Everytime update_rq_clock_task() is invoked, we query information about how much time was stolen since last call, and feed it into sched_rt_avg_update(). Although steal time reporting in account_process_tick() keeps track of the last time we read the steal clock, in prev_steal_time, this patch do it independently using another field, prev_steal_time_rq. This is because otherwise, information about time accounted in update_process_tick() would never reach us in update_rq_clock(). Signed-off-by: Glauber Costa Acked-by: Rik van Riel Acked-by: Peter Zijlstra Tested-by: Eric B Munson CC: Jeremy Fitzhardinge CC: Anthony Liguori Signed-off-by: Avi Kivity --- arch/x86/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da34972..1f03e22 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -512,6 +512,18 @@ menuconfig PARAVIRT_GUEST if PARAVIRT_GUEST +config PARAVIRT_TIME_ACCOUNTING + bool "Paravirtual steal time accounting" + select PARAVIRT + default n + ---help--- + Select this option to enable fine granularity task steal time + accounting. Time spent executing other tasks in parallel with + the current vCPU is discounted from the vCPU power. To account for + that, there can be a small performance impact. + + If in doubt, say N here. + source "arch/x86/xen/Kconfig" config KVM_CLOCK -- cgit v1.1 From abe48b108247e9b90b4c6739662a2e5c765ed114 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 14 Jul 2011 00:53:24 -0400 Subject: x86, intel, power: Initialize MSR_IA32_ENERGY_PERF_BIAS Since 2.6.36 (23016bf0d25), Linux prints the existence of "epb" in /proc/cpuinfo, Since 2.6.38 (d5532ee7b40), the x86_energy_perf_policy(8) utility has been available in-tree to update MSR_IA32_ENERGY_PERF_BIAS. However, the typical BIOS fails to initialize the MSR, presumably because this is handled by high-volume shrink-wrap operating systems... Linux distros, on the other hand, do not yet invoke x86_energy_perf_policy(8). As a result, WSM-EP, SNB, and later hardware from Intel will run in its default hardware power-on state (performance), which assumes that users care for performance at all costs and not for energy efficiency. While that is fine for performance benchmarks, the hardware's intended default operating point is "normal" mode... Initialize the MSR to the "normal" by default during kernel boot. x86_energy_perf_policy(8) is available to change the default after boot, should the user have a different preference. Signed-off-by: Len Brown Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1107140051020.18606@x980 Acked-by: Rafael J. Wysocki Signed-off-by: H. Peter Anvin Cc: --- arch/x86/include/asm/msr-index.h | 3 +++ arch/x86/kernel/cpu/intel.c | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 485b4f1..23a9d89 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -259,6 +259,9 @@ #define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 +#define ENERGY_PERF_BIAS_PERFORMANCE 0 +#define ENERGY_PERF_BIAS_NORMAL 6 +#define ENERGY_PERF_BIAS_POWERSWAVE 15 #define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1 diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1edf5ba..da0d779 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -456,6 +456,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); + + /* + * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not. + * x86_energy_perf_policy(8) is available to change it at run-time + */ + if (cpu_has(c, X86_FEATURE_EPB)) { + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if ((epb & 0xF) == 0) { + printk_once(KERN_WARNING, "x86: updated energy_perf_bias" + " to 'normal' from 'performance'\n" + "You can view and update epb via utility," + " such as x86_energy_perf_policy(8)\n"); + epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + } + } } #ifdef CONFIG_X86_32 -- cgit v1.1 From f91298709790b9a483752ca3c967845537df2af3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 9 Jul 2011 00:17:12 +0400 Subject: perf, x86: P4 PMU - Introduce event alias feature Instead of hw_nmi_watchdog_set_attr() weak function and appropriate x86_pmu::hw_watchdog_set_attr() call we introduce even alias mechanism which allow us to drop this routines completely and isolate quirks of Netburst architecture inside P4 PMU code only. The main idea remains the same though -- to allow nmi-watchdog and perf top run simultaneously. Note the aliasing mechanism applies to generic PERF_COUNT_HW_CPU_CYCLES event only because arbitrary event (say passed as RAW initially) might have some additional bits set inside ESCR register changing the behaviour of event and we can't guarantee anymore that alias event will give the same result. P.S. Thanks a huge to Don and Steven for for testing and early review. Acked-by: Don Zickus Tested-by: Steven Rostedt Signed-off-by: Cyrill Gorcunov CC: Ingo Molnar CC: Peter Zijlstra CC: Stephane Eranian CC: Lin Ming CC: Arnaldo Carvalho de Melo CC: Frederic Weisbecker Link: http://lkml.kernel.org/r/20110708201712.GS23657@sun Signed-off-by: Steven Rostedt --- arch/x86/include/asm/perf_event_p4.h | 33 +++++++++ arch/x86/kernel/cpu/perf_event.c | 7 -- arch/x86/kernel/cpu/perf_event_p4.c | 135 +++++++++++++++++++++++++++-------- 3 files changed, 139 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 56fd9e3..4d86c86 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -102,6 +102,14 @@ #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) /* + * If an event has alias it should be marked + * with a special bit. (Don't forget to check + * P4_PEBS_CONFIG_MASK and related bits on + * modification.) + */ +#define P4_CONFIG_ALIASABLE (1 << 9) + +/* * The bits we allow to pass for RAW events */ #define P4_CONFIG_MASK_ESCR \ @@ -123,6 +131,31 @@ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \ (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)) +/* + * In case of event aliasing we need to preserve some + * caller bits otherwise the mapping won't be complete. + */ +#define P4_CONFIG_EVENT_ALIAS_MASK \ + (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \ + p4_config_pack_cccr(P4_CCCR_EDGE | \ + P4_CCCR_THRESHOLD_MASK | \ + P4_CCCR_COMPLEMENT | \ + P4_CCCR_COMPARE)) + +#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \ + ((P4_CONFIG_HT) | \ + p4_config_pack_escr(P4_ESCR_T0_OS | \ + P4_ESCR_T0_USR | \ + P4_ESCR_T1_OS | \ + P4_ESCR_T1_USR) | \ + p4_config_pack_cccr(P4_CCCR_OVF | \ + P4_CCCR_CASCADE | \ + P4_CCCR_FORCE_OVF | \ + P4_CCCR_THREAD_ANY | \ + P4_CCCR_OVF_PMI_T0 | \ + P4_CCCR_OVF_PMI_T1 | \ + P4_CONFIG_ALIASABLE)) + static inline bool p4_is_event_cascaded(u64 config) { u32 cccr = p4_config_unpack_cccr(config); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c53d433..b7a010f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -274,7 +274,6 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); - void (*hw_watchdog_set_attr)(struct perf_event_attr *attr); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -360,12 +359,6 @@ static u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) -{ - if (x86_pmu.hw_watchdog_set_attr) - x86_pmu.hw_watchdog_set_attr(wd_attr); -} - /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index fb901c5..8b7a0c3 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -570,11 +570,92 @@ static __initconst const u64 p4_hw_cache_event_ids }, }; +/* + * Because of Netburst being quite restricted in now + * many same events can run simultaneously, we use + * event aliases, ie different events which have the + * same functionallity but use non-intersected resources + * (ESCR/CCCR/couter registers). This allow us to run + * two or more semi-same events together. It is done + * transparently to a user space. + * + * Never set any cusom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up-to-dated automatically either not appliable + * at all. + * + * And be really carefull choosing aliases! + */ +struct p4_event_alias { + u64 orig; + u64 alter; +} p4_event_aliases[] = { + { + /* + * Non-halted cycles can be substituted with + * non-sleeping cycles (see Intel SDM Vol3b for + * details). + */ + .orig = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + .alter = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE), + }, +}; + +static u64 p4_get_alias_event(u64 config) +{ + u64 config_match; + int i; + + /* + * Probably we're lucky and don't have to do + * matching over all config bits. + */ + if (!(config & P4_CONFIG_ALIASABLE)) + return 0; + + config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; + + /* + * If an event was previously swapped to the alter config + * we should swap it back otherwise contnention on registers + * will return back. + */ + for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { + if (config_match == p4_event_aliases[i].orig) { + config_match = p4_event_aliases[i].alter; + break; + } else if (config_match == p4_event_aliases[i].alter) { + config_match = p4_event_aliases[i].orig; + break; + } + } + + if (i >= ARRAY_SIZE(p4_event_aliases)) + return 0; + + return config_match | + (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); +} + static u64 p4_general_events[PERF_COUNT_HW_MAX] = { /* non-halted CPU clocks */ [PERF_COUNT_HW_CPU_CYCLES] = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | + P4_CONFIG_ALIASABLE, /* * retired instructions @@ -719,31 +800,6 @@ static int p4_validate_raw_event(struct perf_event *event) return 0; } -static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr) -{ - /* - * Watchdog ticks are special on Netburst, we use - * that named "non-sleeping" ticks as recommended - * by Intel SDM Vol3b. - */ - WARN_ON_ONCE(wd_attr->type != PERF_TYPE_HARDWARE || - wd_attr->config != PERF_COUNT_HW_CPU_CYCLES); - - wd_attr->type = PERF_TYPE_RAW; - wd_attr->config = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) | - p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | - P4_CCCR_COMPARE); -} - static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); @@ -1159,6 +1215,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign struct p4_event_bind *bind; unsigned int i, thread, num; int cntr_idx, escr_idx; + u64 config_alias; + int pass; bitmap_zero(used_mask, X86_PMC_IDX_MAX); bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); @@ -1167,6 +1225,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign hwc = &cpuc->event_list[i]->hw; thread = p4_ht_thread(cpu); + pass = 0; + +again: + /* + * Aliases are swappable so we may hit circular + * lock if both original config and alias need + * resources (MSR registers) which already busy. + */ + if (pass > 2) + goto done; + bind = p4_config_get_bind(hwc->config); escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); if (unlikely(escr_idx == -1)) @@ -1180,8 +1249,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign } cntr_idx = p4_next_cntr(thread, used_mask, bind); - if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) - goto done; + if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { + /* + * Probably an event alias is still available. + */ + config_alias = p4_get_alias_event(hwc->config); + if (!config_alias) + goto done; + hwc->config = config_alias; + pass++; + goto again; + } p4_pmu_swap_config_ts(hwc, cpu); if (assign) @@ -1218,7 +1296,6 @@ static __initconst const struct x86_pmu p4_pmu = { .cntval_bits = ARCH_P4_CNTRVAL_BITS, .cntval_mask = ARCH_P4_CNTRVAL_MASK, .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, - .hw_watchdog_set_attr = p4_hw_watchdog_set_attr, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* -- cgit v1.1 From 4bb82178f5cb074783aaeaa06f9f840c67af7707 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 14 Jul 2011 14:58:44 -0700 Subject: x86, msr: Fix typo in ENERGY_PERF_BIAS_POWERSAVE Fix a trivial typo in the name of the constant ENERGY_PERF_BIAS_POWERSAVE. This didn't cause trouble because this constant is not currently used for anything. Signed-off-by: H. Peter Anvin Cc: Len Brown Link: http://lkml.kernel.org/r/tip-abe48b108247e9b90b4c6739662a2e5c765ed114@git.kernel.org --- arch/x86/include/asm/msr-index.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 23a9d89..d96bdb2 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -261,7 +261,7 @@ #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 #define ENERGY_PERF_BIAS_PERFORMANCE 0 #define ENERGY_PERF_BIAS_NORMAL 6 -#define ENERGY_PERF_BIAS_POWERSWAVE 15 +#define ENERGY_PERF_BIAS_POWERSAVE 15 #define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1 -- cgit v1.1 From 98d0ac38ca7b1b7a552c9a2359174ff84decb600 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2011 06:47:22 -0400 Subject: x86-64: Move vread_tsc and vread_hpet into the vDSO The vsyscall page now consists entirely of trap instructions. Cc: John Stultz Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/637648f303f2ef93af93bae25186e9a1bea093f5.1310639973.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/clocksource.h | 6 ++++- arch/x86/include/asm/tsc.h | 4 --- arch/x86/include/asm/vgtod.h | 2 +- arch/x86/include/asm/vsyscall.h | 4 --- arch/x86/kernel/Makefile | 7 +---- arch/x86/kernel/alternative.c | 8 ------ arch/x86/kernel/hpet.c | 9 +------ arch/x86/kernel/tsc.c | 2 +- arch/x86/kernel/vmlinux.lds.S | 3 --- arch/x86/kernel/vread_tsc_64.c | 36 -------------------------- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/vdso/vclock_gettime.c | 53 +++++++++++++++++++++++++++++++++----- 12 files changed, 57 insertions(+), 79 deletions(-) delete mode 100644 arch/x86/kernel/vread_tsc_64.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index a5df33f..3882c65 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -7,8 +7,12 @@ #define __ARCH_HAS_CLOCKSOURCE_DATA +#define VCLOCK_NONE 0 /* No vDSO clock available. */ +#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ +#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ + struct arch_clocksource_data { - cycle_t (*vread)(void); + int vclock_mode; }; #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 9db5583..83e2efd 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -51,10 +51,6 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern unsigned long native_calibrate_tsc(void); -#ifdef CONFIG_X86_64 -extern cycles_t vread_tsc(void); -#endif - /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index aa5add8..815285b 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -13,7 +13,7 @@ struct vsyscall_gtod_data { struct timezone sys_tz; struct { /* extract of a clocksource struct */ - cycle_t (*vread)(void); + int vclock_mode; cycle_t cycle_last; cycle_t mask; u32 mult; diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d555973..6010707 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -16,10 +16,6 @@ enum vsyscall_num { #ifdef __KERNEL__ #include -/* Definitions for CONFIG_GENERIC_TIME definitions */ -#define __vsyscall_fn \ - __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace - #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cc0469a..2deef3d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -24,17 +24,12 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) -CFLAGS_vread_tsc_64.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) GCOV_PROFILE_vsyscall_64.o := n GCOV_PROFILE_hpet.o := n GCOV_PROFILE_tsc.o := n -GCOV_PROFILE_vread_tsc_64.o := n GCOV_PROFILE_paravirt.o := n -# vread_tsc_64 is hot and should be fully optimized: -CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls - obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o @@ -43,7 +38,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o +obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ddb207b..c638228 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -250,7 +249,6 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -extern char __vsyscall_0; void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. @@ -294,12 +292,6 @@ void __init_or_module apply_alternatives(struct alt_instr *start, add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen); -#ifdef CONFIG_X86_64 - /* vsyscall code is not mapped yet. resolve it manually. */ - if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { - instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); - } -#endif text_poke_early(instr, insnbuf, a->instrlen); } } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 0e07257bb..d10cc00 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -738,13 +738,6 @@ static cycle_t read_hpet(struct clocksource *cs) return (cycle_t)hpet_readl(HPET_COUNTER); } -#ifdef CONFIG_X86_64 -static cycle_t __vsyscall_fn vread_hpet(void) -{ - return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); -} -#endif - static struct clocksource clocksource_hpet = { .name = "hpet", .rating = 250, @@ -753,7 +746,7 @@ static struct clocksource clocksource_hpet = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 - .archdata = { .vread = vread_hpet }, + .archdata = { .vclock_mode = VCLOCK_HPET }, #endif }; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e7a74b8..56c633a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -777,7 +777,7 @@ static struct clocksource clocksource_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, #ifdef CONFIG_X86_64 - .archdata = { .vread = vread_tsc }, + .archdata = { .vclock_mode = VCLOCK_TSC }, #endif }; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8017471..4aa9c54 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -169,9 +169,6 @@ SECTIONS .vsyscall : AT(VLOAD(.vsyscall)) { *(.vsyscall_0) - . = ALIGN(L1_CACHE_BYTES); - *(.vsyscall_fn) - . = 1024; *(.vsyscall_1) diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c deleted file mode 100644 index a81aa9e..0000000 --- a/arch/x86/kernel/vread_tsc_64.c +++ /dev/null @@ -1,36 +0,0 @@ -/* This code runs in userspace. */ - -#define DISABLE_BRANCH_PROFILING -#include - -notrace cycle_t __vsyscall_fn vread_tsc(void) -{ - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. - */ - rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); - - last = VVAR(vsyscall_gtod_data).clock.cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. - */ - asm volatile (""); - return last; -} diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 12d488f..dda7dff 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -74,7 +74,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); /* copy vsyscall data */ - vsyscall_gtod_data.clock.vread = clock->archdata.vread; + vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = mult; diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index cf54813..8792d6e 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,43 @@ #define gtod (&VVAR(vsyscall_gtod_data)) +notrace static cycle_t vread_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} + +static notrace cycle_t vread_hpet(void) +{ + return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} + notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; @@ -36,9 +74,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) notrace static inline long vgetns(void) { long v; - cycles_t (*vread)(void); - vread = gtod->clock.vread; - v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask; + cycles_t cycles; + if (gtod->clock.vclock_mode == VCLOCK_TSC) + cycles = vread_tsc(); + else + cycles = vread_hpet(); + v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; return (v * gtod->clock.mult) >> gtod->clock.shift; } @@ -118,11 +159,11 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { switch (clock) { case CLOCK_REALTIME: - if (likely(gtod->clock.vread)) + if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) return do_realtime(ts); break; case CLOCK_MONOTONIC: - if (likely(gtod->clock.vread)) + if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) return do_monotonic(ts); break; case CLOCK_REALTIME_COARSE: @@ -139,7 +180,7 @@ int clock_gettime(clockid_t, struct timespec *) notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; - if (likely(gtod->clock.vread)) { + if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) { if (likely(tv != NULL)) { BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != offsetof(struct timespec, tv_nsec) || -- cgit v1.1 From 905f29e2aa004560907199cc96248fa11bafea8a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 10 Jul 2011 21:27:24 +0200 Subject: x86, signals: Convert the IA32_EMULATION code to use set_current_blocked() sys32_sigsuspend() and sys32_*sigreturn() change ->blocked directly. This is not correct, see the changelog in e6fa16ab "signal: sigprocmask() should do retarget_shared_pending()" Change them to use set_current_blocked(). Signed-off-by: Oleg Nesterov Link: http://lkml.kernel.org/r/20110710192724.GA31755@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/ia32/ia32_signal.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 588a7aa..6557769 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -127,15 +127,17 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask) { - mask &= _BLOCKABLE; - spin_lock_irq(¤t->sighand->siglock); + sigset_t blocked; + current->saved_sigmask = current->blocked; - siginitset(¤t->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + + mask &= _BLOCKABLE; + siginitset(&blocked, mask); + set_current_blocked(&blocked); current->state = TASK_INTERRUPTIBLE; schedule(); + set_restore_sigmask(); return -ERESTARTNOHAND; } @@ -279,10 +281,7 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs) goto badframe; sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&set); if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) goto badframe; @@ -308,10 +307,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) goto badframe; sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&set); if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; -- cgit v1.1 From 3982294b0342474ff91472b34c6afb701785f524 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 10 Jul 2011 21:27:27 +0200 Subject: x86, signals: Convert the X86_32 code to use set_current_blocked() sys_sigsuspend() and sys_sigreturn() change ->blocked directly. This is not correct, see the changelog in e6fa16ab "signal: sigprocmask() should do retarget_shared_pending()" Change them to use set_current_blocked(). Signed-off-by: Oleg Nesterov Link: http://lkml.kernel.org/r/20110710192727.GA31759@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/signal.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 40a2493..bf9345d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -485,17 +485,18 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, asmlinkage int sys_sigsuspend(int history0, int history1, old_sigset_t mask) { - mask &= _BLOCKABLE; - spin_lock_irq(¤t->sighand->siglock); + sigset_t blocked; + current->saved_sigmask = current->blocked; - siginitset(¤t->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + + mask &= _BLOCKABLE; + siginitset(&blocked, mask); + set_current_blocked(&blocked); current->state = TASK_INTERRUPTIBLE; schedule(); - set_restore_sigmask(); + set_restore_sigmask(); return -ERESTARTNOHAND; } @@ -572,10 +573,7 @@ unsigned long sys_sigreturn(struct pt_regs *regs) goto badframe; sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, &ax)) goto badframe; -- cgit v1.1 From 9b429620740945363b746414e8b9a84b8119914c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 10 Jul 2011 20:22:03 +0200 Subject: x86, do_signal: Simplify the TS_RESTORE_SIGMASK logic 1. do_signal() looks at TS_RESTORE_SIGMASK and calculates the mask which should be stored in the signal frame, then it passes "oldset" to the callees, down to setup_rt_frame(). This is ugly, setup_rt_frame() can do this itself and nobody else needs this sigset_t. Move this code into setup_rt_frame. 2. do_signal() also clears TS_RESTORE_SIGMASK if handle_signal() succeeds. We can move this to setup_rt_frame() as well, this avoids the unnecessary checks and makes the logic more clear. 3. use set_current_blocked() instead of sigprocmask(SIG_SETMASK), sigprocmask() should be avoided. Signed-off-by: Oleg Nesterov Link: http://lkml.kernel.org/r/20110710182203.GA27979@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/signal.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index bf9345d..8c55f97 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -651,11 +651,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) + struct pt_regs *regs) { int usig = signr_convert(sig); + sigset_t *set = ¤t->blocked; int ret; + if (current_thread_info()->status & TS_RESTORE_SIGMASK) + set = ¤t->saved_sigmask; + /* Set up the stack frame */ if (is_ia32) { if (ka->sa.sa_flags & SA_SIGINFO) @@ -670,12 +674,13 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return -EFAULT; } + current_thread_info()->status &= ~TS_RESTORE_SIGMASK; return ret; } static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) + struct pt_regs *regs) { sigset_t blocked; int ret; @@ -710,7 +715,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; - ret = setup_rt_frame(sig, ka, info, oldset, regs); + ret = setup_rt_frame(sig, ka, info, regs); if (ret) return ret; @@ -765,7 +770,6 @@ static void do_signal(struct pt_regs *regs) struct k_sigaction ka; siginfo_t info; int signr; - sigset_t *oldset; /* * We want the common case to go fast, which is why we may in certain @@ -777,23 +781,10 @@ static void do_signal(struct pt_regs *regs) if (!user_mode(regs)) return; - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - oldset = ¤t->saved_sigmask; - else - oldset = ¤t->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - } + handle_signal(signr, &info, &ka, regs); return; } @@ -821,7 +812,7 @@ static void do_signal(struct pt_regs *regs) */ if (current_thread_info()->status & TS_RESTORE_SIGMASK) { current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); + set_current_blocked(¤t->saved_sigmask); } } -- cgit v1.1 From 73d382deccac186d103496bf10388bc2432a8384 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 10 Jul 2011 18:44:24 +0200 Subject: x86: Kill handle_signal()->set_fs() handle_signal()->set_fs() has a nice comment which explains what set_fs() is, but it doesn't explain why it is needed and why it depends on CONFIG_X86_64. Afaics, the history of this confusion is: 1. I guess today nobody can explain why it was needed in arch/i386/kernel/signal.c, perhaps it was always wrong. This predates 2.4.0 kernel. 2. then it was copy-and-past'ed to the new x86_64 arch. 3. then it was removed from i386 (but not from x86_64) by b93b6ca3 "i386: remove unnecessary code". 4. then it was reintroduced under CONFIG_X86_64 when x86 unified i386 and x86_64, because the patch above didn't touch x86_64. Remove it. ->addr_limit should be correct. Even if it was possible that it is wrong, it is too late to fix it after setup_rt_frame(). Linus commented in: http://lkml.kernel.org/r/alpine.LFD.0.999.0707170902570.19166@woody.linux-foundation.org ... about the equivalent bit from i386: Heh. I think it's entirely historical. Please realize that the whole reason that function is called "set_fs()" is that it literally used to set the %fs segment register, not "->addr_limit". So I think the "set_fs(USER_DS)" is there _only_ to match the other regs->xds = __USER_DS; regs->xes = __USER_DS; regs->xss = __USER_DS; regs->xcs = __USER_CS; things, and never mattered. And now it matters even less, and has been copied to all other architectures where it is just totally insane. Signed-off-by: Oleg Nesterov Link: http://lkml.kernel.org/r/20110710164424.GA20261@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/signal.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 8c55f97..54ddaeb2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -720,15 +720,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, if (ret) return ret; -#ifdef CONFIG_X86_64 - /* - * This has nothing to do with segment registers, - * despite the name. This magic affects uaccess.h - * macros' behavior. Reset it to the normal setting. - */ - set_fs(USER_DS); -#endif - /* * Clear the direction flag as per the ABI for function entry. */ -- cgit v1.1 From 17edf2d79f1ea6dfdb4c444801d928953b9f98d6 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 15 Jul 2011 17:37:15 -0400 Subject: x86, intel, power: Correct the MSR_IA32_ENERGY_PERF_BIAS message Fix the printk_once() so that it actually prints (didn't print before due to a stray comma.) [ hpa: changed to an incremental patch and adjusted the description accordingly. ] Signed-off-by: Len Brown Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1107151732480.18606@x980 Cc: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index da0d779..ed6086e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -465,11 +465,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) u64 epb; rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - if ((epb & 0xF) == 0) { - printk_once(KERN_WARNING, "x86: updated energy_perf_bias" - " to 'normal' from 'performance'\n" - "You can view and update epb via utility," - " such as x86_energy_perf_policy(8)\n"); + if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) { + printk_once(KERN_WARNING "ENERGY_PERF_BIAS:" + " Set to 'normal', was 'performance'\n" + "ENERGY_PERF_BIAS: View and update with" + " x86_energy_perf_policy(8)\n"); epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); } -- cgit v1.1 From 8c400f6ce068366bc3517f1036bb99169cfec9cd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 18 Jul 2011 21:10:54 +0200 Subject: x86, vdso: Drop now wrong comment Now that 1b3f2a72bbcfdf92e368a44448c45eb639b05b5e is in, it is very important that the below lying comment be removed! :-) Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20110718191054.GA18359@liondog.tnic Acked-by: Andy Lutomirski Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vclock_gettime.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 8792d6e..6bc0e72 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -6,7 +6,6 @@ * * The code should have no internal unresolved relocations. * Check with readelf after changing. - * Also alternative() doesn't work. */ /* Disable profiling for userspace code: */ -- cgit v1.1 From 84cdee76b15f3669f012d5916287d124c805ef2f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 16 Dec 2010 13:42:55 -0800 Subject: xen/multicalls: remove debugfs stats Remove debugfs stats to make way for tracing. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/multicalls.c | 109 +--------------------------------------------- arch/x86/xen/multicalls.h | 3 ++ 2 files changed, 4 insertions(+), 108 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 1b2b73f..b9bf198 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -52,76 +52,6 @@ struct mc_buffer { static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); -/* flush reasons 0- slots, 1- args, 2- callbacks */ -enum flush_reasons -{ - FL_SLOTS, - FL_ARGS, - FL_CALLBACKS, - - FL_N_REASONS -}; - -#ifdef CONFIG_XEN_DEBUG_FS -#define NHYPERCALLS 40 /* not really */ - -static struct { - unsigned histo[MC_BATCH+1]; - - unsigned issued; - unsigned arg_total; - unsigned hypercalls; - unsigned histo_hypercalls[NHYPERCALLS]; - - unsigned flush[FL_N_REASONS]; -} mc_stats; - -static u8 zero_stats; - -static inline void check_zero(void) -{ - if (unlikely(zero_stats)) { - memset(&mc_stats, 0, sizeof(mc_stats)); - zero_stats = 0; - } -} - -static void mc_add_stats(const struct mc_buffer *mc) -{ - int i; - - check_zero(); - - mc_stats.issued++; - mc_stats.hypercalls += mc->mcidx; - mc_stats.arg_total += mc->argidx; - - mc_stats.histo[mc->mcidx]++; - for(i = 0; i < mc->mcidx; i++) { - unsigned op = mc->entries[i].op; - if (op < NHYPERCALLS) - mc_stats.histo_hypercalls[op]++; - } -} - -static void mc_stats_flush(enum flush_reasons idx) -{ - check_zero(); - - mc_stats.flush[idx]++; -} - -#else /* !CONFIG_XEN_DEBUG_FS */ - -static inline void mc_add_stats(const struct mc_buffer *mc) -{ -} - -static inline void mc_stats_flush(enum flush_reasons idx) -{ -} -#endif /* CONFIG_XEN_DEBUG_FS */ - void xen_mc_flush(void) { struct mc_buffer *b = &__get_cpu_var(mc_buffer); @@ -135,8 +65,6 @@ void xen_mc_flush(void) something in the middle */ local_irq_save(flags); - mc_add_stats(b); - if (b->mcidx) { #if MC_DEBUG memcpy(b->debug, b->entries, @@ -193,7 +121,6 @@ struct multicall_space __xen_mc_entry(size_t args) if (b->mcidx == MC_BATCH || (argidx + args) >= MC_ARGS) { - mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); xen_mc_flush(); argidx = roundup(b->argidx, sizeof(u64)); } @@ -240,44 +167,10 @@ void xen_mc_callback(void (*fn)(void *), void *data) struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct callback *cb; - if (b->cbidx == MC_BATCH) { - mc_stats_flush(FL_CALLBACKS); + if (b->cbidx == MC_BATCH) xen_mc_flush(); - } cb = &b->callbacks[b->cbidx++]; cb->fn = fn; cb->data = data; } - -#ifdef CONFIG_XEN_DEBUG_FS - -static struct dentry *d_mc_debug; - -static int __init xen_mc_debugfs(void) -{ - struct dentry *d_xen = xen_init_debugfs(); - - if (d_xen == NULL) - return -ENOMEM; - - d_mc_debug = debugfs_create_dir("multicalls", d_xen); - - debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats); - - debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued); - debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls); - debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total); - - xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug, - mc_stats.histo, MC_BATCH); - xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug, - mc_stats.histo_hypercalls, NHYPERCALLS); - xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug, - mc_stats.flush, FL_N_REASONS); - - return 0; -} -fs_initcall(xen_mc_debugfs); - -#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 4ec8035..fa7b8af 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -1,6 +1,8 @@ #ifndef _XEN_MULTICALLS_H #define _XEN_MULTICALLS_H +#include + #include "xen-ops.h" /* Multicalls */ @@ -20,6 +22,7 @@ DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); static inline void xen_mc_batch(void) { unsigned long flags; + /* need to disable interrupts until this entry is complete */ local_irq_save(flags); __this_cpu_write(xen_mc_irq_flags, flags); -- cgit v1.1 From f04e2ee41d3dbeb6eeb3685d1b4c208b898e278f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 16 Dec 2010 14:15:23 -0800 Subject: xen/trace: set up tracepoint skeleton Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/trace.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 arch/x86/xen/trace.c (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 17c565d..6fdf366 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o + p2m.o trace.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c new file mode 100644 index 0000000..8cca6164 --- /dev/null +++ b/arch/x86/xen/trace.c @@ -0,0 +1,4 @@ +#include + +#define CREATE_TRACE_POINTS +#include -- cgit v1.1 From c796f213a6934712ede728d9b53ef0e5066db23a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 16 Dec 2010 14:33:27 -0800 Subject: xen/trace: add multicall tracing Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/xen/hypercall.h | 22 +++++++++++++ arch/x86/include/asm/xen/trace_types.h | 18 +++++++++++ arch/x86/xen/multicalls.c | 31 +++++++++++++----- arch/x86/xen/multicalls.h | 3 ++ arch/x86/xen/trace.c | 57 ++++++++++++++++++++++++++++++++++ 5 files changed, 123 insertions(+), 8 deletions(-) create mode 100644 arch/x86/include/asm/xen/trace_types.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index d240ea9..417777d 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -39,6 +39,8 @@ #include #include +#include + #include #include @@ -459,6 +461,8 @@ MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { mcl->op = __HYPERVISOR_fpu_taskswitch; mcl->args[0] = set; + + trace_xen_mc_entry(mcl, 1); } static inline void @@ -475,6 +479,8 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, mcl->args[2] = new_val.pte >> 32; mcl->args[3] = flags; } + + trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 3 : 4); } static inline void @@ -485,6 +491,8 @@ MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd, mcl->args[0] = cmd; mcl->args[1] = (unsigned long)uop; mcl->args[2] = count; + + trace_xen_mc_entry(mcl, 3); } static inline void @@ -504,6 +512,8 @@ MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long v mcl->args[3] = flags; mcl->args[4] = domid; } + + trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 4 : 5); } static inline void @@ -520,6 +530,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, mcl->args[2] = desc.a; mcl->args[3] = desc.b; } + + trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4); } static inline void @@ -528,6 +540,8 @@ MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg) mcl->op = __HYPERVISOR_memory_op; mcl->args[0] = cmd; mcl->args[1] = (unsigned long)arg; + + trace_xen_mc_entry(mcl, 2); } static inline void @@ -539,6 +553,8 @@ MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, mcl->args[1] = count; mcl->args[2] = (unsigned long)success_count; mcl->args[3] = domid; + + trace_xen_mc_entry(mcl, 4); } static inline void @@ -550,6 +566,8 @@ MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count, mcl->args[1] = count; mcl->args[2] = (unsigned long)success_count; mcl->args[3] = domid; + + trace_xen_mc_entry(mcl, 4); } static inline void @@ -558,6 +576,8 @@ MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries) mcl->op = __HYPERVISOR_set_gdt; mcl->args[0] = (unsigned long)frames; mcl->args[1] = entries; + + trace_xen_mc_entry(mcl, 2); } static inline void @@ -567,6 +587,8 @@ MULTI_stack_switch(struct multicall_entry *mcl, mcl->op = __HYPERVISOR_stack_switch; mcl->args[0] = ss; mcl->args[1] = esp; + + trace_xen_mc_entry(mcl, 2); } #endif /* _ASM_X86_XEN_HYPERCALL_H */ diff --git a/arch/x86/include/asm/xen/trace_types.h b/arch/x86/include/asm/xen/trace_types.h new file mode 100644 index 0000000..21e1874 --- /dev/null +++ b/arch/x86/include/asm/xen/trace_types.h @@ -0,0 +1,18 @@ +#ifndef _ASM_XEN_TRACE_TYPES_H +#define _ASM_XEN_TRACE_TYPES_H + +enum xen_mc_flush_reason { + XEN_MC_FL_NONE, /* explicit flush */ + XEN_MC_FL_BATCH, /* out of hypercall space */ + XEN_MC_FL_ARGS, /* out of argument space */ + XEN_MC_FL_CALLBACK, /* out of callback space */ +}; + +enum xen_mc_extend_args { + XEN_MC_XE_OK, + XEN_MC_XE_BAD_OP, + XEN_MC_XE_NO_SPACE +}; +typedef void (*xen_mc_callback_fn_t)(void *); + +#endif /* _ASM_XEN_TRACE_TYPES_H */ diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index b9bf198..7074c7e 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -65,6 +65,8 @@ void xen_mc_flush(void) something in the middle */ local_irq_save(flags); + trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx); + if (b->mcidx) { #if MC_DEBUG memcpy(b->debug, b->entries, @@ -116,11 +118,15 @@ struct multicall_space __xen_mc_entry(size_t args) struct multicall_space ret; unsigned argidx = roundup(b->argidx, sizeof(u64)); + trace_xen_mc_entry_alloc(args); + BUG_ON(preemptible()); BUG_ON(b->argidx >= MC_ARGS); if (b->mcidx == MC_BATCH || (argidx + args) >= MC_ARGS) { + trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ? + XEN_MC_FL_BATCH : XEN_MC_FL_ARGS); xen_mc_flush(); argidx = roundup(b->argidx, sizeof(u64)); } @@ -145,20 +151,25 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size) BUG_ON(preemptible()); BUG_ON(b->argidx >= MC_ARGS); - if (b->mcidx == 0) - return ret; - - if (b->entries[b->mcidx - 1].op != op) - return ret; + if (unlikely(b->mcidx == 0 || + b->entries[b->mcidx - 1].op != op)) { + trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP); + goto out; + } - if ((b->argidx + size) >= MC_ARGS) - return ret; + if (unlikely((b->argidx + size) >= MC_ARGS)) { + trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE); + goto out; + } ret.mc = &b->entries[b->mcidx - 1]; ret.args = &b->args[b->argidx]; b->argidx += size; BUG_ON(b->argidx >= MC_ARGS); + + trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK); +out: return ret; } @@ -167,8 +178,12 @@ void xen_mc_callback(void (*fn)(void *), void *data) struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct callback *cb; - if (b->cbidx == MC_BATCH) + if (b->cbidx == MC_BATCH) { + trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK); xen_mc_flush(); + } + + trace_xen_mc_callback(fn, data); cb = &b->callbacks[b->cbidx++]; cb->fn = fn; diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index fa7b8af..dee79b7 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -25,6 +25,7 @@ static inline void xen_mc_batch(void) /* need to disable interrupts until this entry is complete */ local_irq_save(flags); + trace_xen_mc_batch(paravirt_get_lazy_mode()); __this_cpu_write(xen_mc_irq_flags, flags); } @@ -40,6 +41,8 @@ void xen_mc_flush(void); /* Issue a multicall if we're not in a lazy mode */ static inline void xen_mc_issue(unsigned mode) { + trace_xen_mc_issue(mode); + if ((paravirt_get_lazy_mode() & mode) == 0) xen_mc_flush(); diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c index 8cca6164..734beba 100644 --- a/arch/x86/xen/trace.c +++ b/arch/x86/xen/trace.c @@ -1,4 +1,61 @@ #include +#define N(x) [__HYPERVISOR_##x] = "("#x")" +static const char *xen_hypercall_names[] = { + N(set_trap_table), + N(mmu_update), + N(set_gdt), + N(stack_switch), + N(set_callbacks), + N(fpu_taskswitch), + N(sched_op_compat), + N(dom0_op), + N(set_debugreg), + N(get_debugreg), + N(update_descriptor), + N(memory_op), + N(multicall), + N(update_va_mapping), + N(set_timer_op), + N(event_channel_op_compat), + N(xen_version), + N(console_io), + N(physdev_op_compat), + N(grant_table_op), + N(vm_assist), + N(update_va_mapping_otherdomain), + N(iret), + N(vcpu_op), + N(set_segment_base), + N(mmuext_op), + N(acm_op), + N(nmi_op), + N(sched_op), + N(callback_op), + N(xenoprof_op), + N(event_channel_op), + N(physdev_op), + N(hvm_op), + +/* Architecture-specific hypercall definitions. */ + N(arch_0), + N(arch_1), + N(arch_2), + N(arch_3), + N(arch_4), + N(arch_5), + N(arch_6), + N(arch_7), +}; +#undef N + +static const char *xen_hypercall_name(unsigned op) +{ + if (op < ARRAY_SIZE(xen_hypercall_names) && xen_hypercall_names[op] != NULL) + return xen_hypercall_names[op]; + + return ""; +} + #define CREATE_TRACE_POINTS #include -- cgit v1.1 From 847088079162a5cf8ab0d1ad1ecf7fa60c057246 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 16 Dec 2010 17:02:35 -0800 Subject: xen/trace: add mmu tracepoints Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0ccccb6..43fa777 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -48,6 +48,8 @@ #include #include +#include + #include #include #include @@ -194,6 +196,8 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) struct multicall_space mcs; struct mmu_update *u; + trace_xen_mmu_set_domain_pte(ptep, pteval, domid); + mcs = xen_mc_entry(sizeof(*u)); u = mcs.args; @@ -245,6 +249,8 @@ static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) static void xen_set_pmd(pmd_t *ptr, pmd_t val) { + trace_xen_mmu_set_pmd(ptr, val); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -282,22 +288,30 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) return true; } -static void xen_set_pte(pte_t *ptep, pte_t pteval) +static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) { if (!xen_batched_set_pte(ptep, pteval)) native_set_pte(ptep, pteval); } +static void xen_set_pte(pte_t *ptep, pte_t pteval) +{ + trace_xen_mmu_set_pte(ptep, pteval); + __xen_set_pte(ptep, pteval); +} + static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - xen_set_pte(ptep, pteval); + trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); + __xen_set_pte(ptep, pteval); } pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. We preserve the bits on commit */ + trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); return *ptep; } @@ -306,6 +320,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, { struct mmu_update u; + trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); xen_mc_batch(); u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; @@ -530,6 +545,8 @@ static void xen_set_pud_hyper(pud_t *ptr, pud_t val) static void xen_set_pud(pud_t *ptr, pud_t val) { + trace_xen_mmu_set_pud(ptr, val); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -543,17 +560,20 @@ static void xen_set_pud(pud_t *ptr, pud_t val) #ifdef CONFIG_X86_PAE static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { + trace_xen_mmu_set_pte_atomic(ptep, pte); set_64bit((u64 *)ptep, native_pte_val(pte)); } static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { + trace_xen_mmu_pte_clear(mm, addr, ptep); if (!xen_batched_set_pte(ptep, native_make_pte(0))) native_pte_clear(mm, addr, ptep); } static void xen_pmd_clear(pmd_t *pmdp) { + trace_xen_mmu_pmd_clear(pmdp); set_pmd(pmdp, __pmd(0)); } #endif /* CONFIG_X86_PAE */ @@ -629,6 +649,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) { pgd_t *user_ptr = xen_get_user_pgd(ptr); + trace_xen_mmu_set_pgd(ptr, user_ptr, val); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { -- cgit v1.1 From c2ba050d2e5638774571ea0ad0375a1c17c7b04e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 17 Dec 2010 14:21:17 -0800 Subject: xen/trace: add ptpage alloc/release tracepoints Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 43fa777..f216099 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1478,8 +1478,11 @@ static void __init xen_release_pmd_init(unsigned long pfn) static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); + int pinned = PagePinned(virt_to_page(mm->pgd)); + + trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); - if (PagePinned(virt_to_page(mm->pgd))) { + if (pinned) { SetPagePinned(page); if (!PageHighMem(page)) { @@ -1508,8 +1511,11 @@ static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) static void xen_release_ptpage(unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); + bool pinned = PagePinned(page); - if (PagePinned(page)) { + trace_xen_mmu_release_ptpage(pfn, level, pinned); + + if (pinned) { if (!PageHighMem(page)) { if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); -- cgit v1.1 From 5f94fb5b8edf29bba06e2cd05f9d3a80a5dfb9bc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 17 Dec 2010 15:31:23 -0800 Subject: xen/trace: add xen_pgd_(un)pin tracepoints Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f216099..5dfa90a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -885,6 +885,8 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page, read-only, and can be pinned. */ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { + trace_xen_mmu_pgd_pin(mm, pgd); + xen_mc_batch(); if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { @@ -1010,6 +1012,8 @@ static int xen_unpin_page(struct mm_struct *mm, struct page *page, /* Release a pagetables pages back as normal RW */ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) { + trace_xen_mmu_pgd_unpin(mm, pgd); + xen_mc_batch(); xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); -- cgit v1.1 From ab78f7ad2c78fb1b724b278479adec998933be36 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 17 Dec 2010 15:54:28 -0800 Subject: xen/trace: add segment desc tracing Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5525163..520325c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -341,6 +341,8 @@ static void xen_set_ldt(const void *addr, unsigned entries) struct mmuext_op *op; struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + trace_xen_cpu_set_ldt(addr, entries); + op = mcs.args; op->cmd = MMUEXT_SET_LDT; op->arg1.linear_addr = (unsigned long)addr; @@ -496,6 +498,8 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); u64 entry = *(u64 *)ptr; + trace_xen_cpu_write_ldt_entry(dt, entrynum, entry); + preempt_disable(); xen_mc_flush(); @@ -565,6 +569,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) unsigned long p = (unsigned long)&dt[entrynum]; unsigned long start, end; + trace_xen_cpu_write_idt_entry(dt, entrynum, g); + preempt_disable(); start = __this_cpu_read(idt_desc.address); @@ -619,6 +625,8 @@ static void xen_load_idt(const struct desc_ptr *desc) static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; + trace_xen_cpu_load_idt(desc); + spin_lock(&lock); __get_cpu_var(idt_desc) = *desc; @@ -637,6 +645,8 @@ static void xen_load_idt(const struct desc_ptr *desc) static void xen_write_gdt_entry(struct desc_struct *dt, int entry, const void *desc, int type) { + trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); + preempt_disable(); switch (type) { @@ -665,6 +675,8 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, const void *desc, int type) { + trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); + switch (type) { case DESC_LDT: case DESC_TSS: @@ -684,7 +696,9 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, static void xen_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { - struct multicall_space mcs = xen_mc_entry(0); + struct multicall_space mcs; + + mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); } -- cgit v1.1 From c8eed1719afb337472c5ef31ec590d549770b173 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 20 Dec 2010 13:15:04 -0800 Subject: xen/trace: add tlb flush tracepoints Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5dfa90a..c3abba1 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1222,6 +1222,8 @@ static void xen_flush_tlb(void) struct mmuext_op *op; struct multicall_space mcs; + trace_xen_mmu_flush_tlb(0); + preempt_disable(); mcs = xen_mc_entry(sizeof(*op)); @@ -1240,6 +1242,8 @@ static void xen_flush_tlb_single(unsigned long addr) struct mmuext_op *op; struct multicall_space mcs; + trace_xen_mmu_flush_tlb_single(addr); + preempt_disable(); mcs = xen_mc_entry(sizeof(*op)); @@ -1266,6 +1270,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, } *args; struct multicall_space mcs; + trace_xen_mmu_flush_tlb_others(cpus, mm, va); + if (cpumask_empty(cpus)) return; /* nothing to do */ @@ -1305,6 +1311,8 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3) struct multicall_space mcs; unsigned long mfn; + trace_xen_mmu_write_cr3(kernel, cr3); + if (cr3) mfn = pfn_to_mfn(PFN_DOWN(cr3)); else -- cgit v1.1 From dcf7435cfe617c9d9f3dd501d3f5ed4d0b30c218 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 17 Dec 2010 09:17:32 -0800 Subject: xen/mmu: use extend_args for more mmuext updates Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c3abba1..eb91126 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -229,6 +229,24 @@ static void xen_extend_mmu_update(const struct mmu_update *update) *u = *update; } +static void xen_extend_mmuext_op(const struct mmuext_op *op) +{ + struct multicall_space mcs; + struct mmuext_op *u; + + mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); + + if (mcs.mc != NULL) { + mcs.mc->args[1]++; + } else { + mcs = __xen_mc_entry(sizeof(*u)); + MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + } + + u = mcs.args; + *u = *op; +} + static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) { struct mmu_update u; @@ -810,14 +828,12 @@ static void xen_pte_unlock(void *v) static void xen_do_pin(unsigned level, unsigned long pfn) { - struct mmuext_op *op; - struct multicall_space mcs; + struct mmuext_op op; - mcs = __xen_mc_entry(sizeof(*op)); - op = mcs.args; - op->cmd = level; - op->arg1.mfn = pfn_to_mfn(pfn); - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + op.cmd = level; + op.arg1.mfn = pfn_to_mfn(pfn); + + xen_extend_mmuext_op(&op); } static int xen_pin_page(struct mm_struct *mm, struct page *page, @@ -1307,8 +1323,7 @@ static void set_current_cr3(void *v) static void __xen_write_cr3(bool kernel, unsigned long cr3) { - struct mmuext_op *op; - struct multicall_space mcs; + struct mmuext_op op; unsigned long mfn; trace_xen_mmu_write_cr3(kernel, cr3); @@ -1320,13 +1335,10 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3) WARN_ON(mfn == 0 && kernel); - mcs = __xen_mc_entry(sizeof(*op)); + op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; + op.arg1.mfn = mfn; - op = mcs.args; - op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; - op->arg1.mfn = mfn; - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + xen_extend_mmuext_op(&op); if (kernel) { percpu_write(xen_cr3, cr3); -- cgit v1.1 From bc7fe1d977734efe6059d6d537b439135ad4a34c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 17 Dec 2010 14:58:43 -0800 Subject: xen/mmu: tune pgtable alloc/release Make sure the fastpath code is inlined. Batch the page permission change and the pin/unpin, and make sure that it can be batched with any adjacent set_pte/pmd/etc operations. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 53 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index eb91126..f987bde 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1497,22 +1497,52 @@ static void __init xen_release_pmd_init(unsigned long pfn) make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } +static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct multicall_space mcs; + struct mmuext_op *op; + + mcs = __xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = cmd; + op->arg1.mfn = pfn_to_mfn(pfn); + + MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); +} + +static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) +{ + struct multicall_space mcs; + unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); + + mcs = __xen_mc_entry(0); + MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, + pfn_pte(pfn, prot), 0); +} + /* This needs to make sure the new pte page is pinned iff its being attached to a pinned pagetable. */ -static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) +static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, + unsigned level) { - struct page *page = pfn_to_page(pfn); - int pinned = PagePinned(virt_to_page(mm->pgd)); - + bool pinned = PagePinned(virt_to_page(mm->pgd)); + trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); if (pinned) { + struct page *page = pfn_to_page(pfn); + SetPagePinned(page); if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); + xen_mc_batch(); + + __set_pfn_prot(pfn, PAGE_KERNEL_RO); + if (level == PT_PTE && USE_SPLIT_PTLOCKS) - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + + xen_mc_issue(PARAVIRT_LAZY_MMU); } else { /* make sure there are no stray mappings of this page */ @@ -1532,7 +1562,7 @@ static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) } /* This should never happen until we're OK to use struct page */ -static void xen_release_ptpage(unsigned long pfn, unsigned level) +static inline void xen_release_ptpage(unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); bool pinned = PagePinned(page); @@ -1541,9 +1571,14 @@ static void xen_release_ptpage(unsigned long pfn, unsigned level) if (pinned) { if (!PageHighMem(page)) { + xen_mc_batch(); + if (level == PT_PTE && USE_SPLIT_PTLOCKS) - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + + __set_pfn_prot(pfn, PAGE_KERNEL); + + xen_mc_issue(PARAVIRT_LAZY_MMU); } ClearPagePinned(page); } -- cgit v1.1 From ffc78767f2ac5b8007aeb366e748c8dd8b8bc3a3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 17 Dec 2010 17:19:42 -0800 Subject: xen/multicalls: disable MC_DEBUG It's useful - and probably should be a config - but its very heavyweight, especially with the tracing stuff to help sort out problems. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/multicalls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 7074c7e..3001162 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -30,7 +30,7 @@ #define MC_BATCH 32 -#define MC_DEBUG 1 +#define MC_DEBUG 0 #define MC_ARGS (MC_BATCH * 16) @@ -132,7 +132,7 @@ struct multicall_space __xen_mc_entry(size_t args) } ret.mc = &b->entries[b->mcidx]; -#ifdef MC_DEBUG +#if MC_DEBUG b->caller[b->mcidx] = __builtin_return_address(0); #endif b->mcidx++; -- cgit v1.1 From 4a7b005dbfa554e7cc7fbc08e0299a9b7a91ef3b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 17 Dec 2010 17:20:25 -0800 Subject: xen/multicalls: add unlikely around slowpath in __xen_mc_entry() Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/multicalls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 3001162..cc71f7c 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -123,8 +123,8 @@ struct multicall_space __xen_mc_entry(size_t args) BUG_ON(preemptible()); BUG_ON(b->argidx >= MC_ARGS); - if (b->mcidx == MC_BATCH || - (argidx + args) >= MC_ARGS) { + if (unlikely(b->mcidx == MC_BATCH || + (argidx + args) >= MC_ARGS)) { trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ? XEN_MC_FL_BATCH : XEN_MC_FL_ARGS); xen_mc_flush(); -- cgit v1.1 From eac303bf2ef63e7f7b6971badea0ff7bf08a2b22 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 17 Dec 2010 17:32:28 -0800 Subject: xen/multicall: special-case singleton hypercalls Singleton calls seem to end up being pretty common, so just directly call the hypercall rather than going via multicall. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/multicalls.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index cc71f7c..2474553 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -55,6 +55,7 @@ DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); void xen_mc_flush(void) { struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct multicall_entry *mc; int ret = 0; unsigned long flags; int i; @@ -67,7 +68,24 @@ void xen_mc_flush(void) trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx); - if (b->mcidx) { + switch (b->mcidx) { + case 0: + /* no-op */ + BUG_ON(b->argidx != 0); + break; + + case 1: + /* Singleton multicall - bypass multicall machinery + and just do the call directly. */ + mc = &b->entries[0]; + + mc->result = privcmd_call(mc->op, + mc->args[0], mc->args[1], mc->args[2], + mc->args[3], mc->args[4]); + ret = mc->result < 0; + break; + + default: #if MC_DEBUG memcpy(b->debug, b->entries, b->mcidx * sizeof(struct multicall_entry)); @@ -94,11 +112,10 @@ void xen_mc_flush(void) } } #endif + } - b->mcidx = 0; - b->argidx = 0; - } else - BUG_ON(b->argidx != 0); + b->mcidx = 0; + b->argidx = 0; for (i = 0; i < b->cbidx; i++) { struct callback *cb = &b->callbacks[i]; -- cgit v1.1 From 2a6f6d095509c7dc6e9ff8d9fd9fba0b730ecce3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 17 Dec 2010 17:33:11 -0800 Subject: xen/multicall: move *idx fields to start of mc_buffer The CPU would prefer small offsets. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/multicalls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 2474553..0d82003 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -36,6 +36,7 @@ struct mc_buffer { + unsigned mcidx, argidx, cbidx; struct multicall_entry entries[MC_BATCH]; #if MC_DEBUG struct multicall_entry debug[MC_BATCH]; @@ -46,7 +47,6 @@ struct mc_buffer { void (*fn)(void *); void *data; } callbacks[MC_BATCH]; - unsigned mcidx, argidx, cbidx; }; static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); -- cgit v1.1 From 4625cd637919edfb562e0d62abf94f52e9321335 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 19 Jul 2011 12:59:51 +0100 Subject: x86: Unify rwlock assembly implementation Rather than having two functionally identical implementations for 32- and 64-bit configurations, extend the existing assembly abstractions enough to fold the two rwlock implementations into a shared one. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4E258DD7020000780004E3EA@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 3 ++- arch/x86/include/asm/frame.h | 11 ++++++----- arch/x86/lib/Makefile | 6 ++++-- arch/x86/lib/rwlock.S | 44 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/lib/rwlock_64.S | 38 -------------------------------------- arch/x86/lib/semaphore_32.S | 44 -------------------------------------------- 6 files changed, 56 insertions(+), 90 deletions(-) create mode 100644 arch/x86/lib/rwlock.S delete mode 100644 arch/x86/lib/rwlock_64.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index b3ed1e1..5890beb 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -15,7 +15,8 @@ # define __ASM_SEL(a,b) __ASM_FORM(b) #endif -#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q) +#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \ + inst##q##__VA_ARGS__) #define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg) #define _ASM_PTR __ASM_SEL(.long, .quad) diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 2c6fc9e..3b629f4 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,5 +1,6 @@ #ifdef __ASSEMBLY__ +#include #include /* The annotation hides the frame from the unwinder and makes it look @@ -7,13 +8,13 @@ frame pointer later */ #ifdef CONFIG_FRAME_POINTER .macro FRAME - pushl_cfi %ebp - CFI_REL_OFFSET ebp,0 - movl %esp,%ebp + __ASM_SIZE(push,_cfi) %__ASM_REG(bp) + CFI_REL_OFFSET __ASM_REG(bp), 0 + __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) .endm .macro ENDFRAME - popl_cfi %ebp - CFI_RESTORE ebp + __ASM_SIZE(pop,_cfi) %__ASM_REG(bp) + CFI_RESTORE __ASM_REG(bp) .endm #else .macro FRAME diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f2479f1..d3ed120 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -20,6 +20,7 @@ lib-y := delay.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-$(CONFIG_SMP) += rwlock.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o obj-y += msr.o msr-reg.o msr-reg-export.o @@ -29,7 +30,8 @@ ifeq ($(CONFIG_X86_32),y) lib-y += atomic64_cx8_32.o lib-y += checksum_32.o lib-y += strstr_32.o - lib-y += semaphore_32.o string_32.o + lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += semaphore_32.o + lib-y += string_32.o lib-y += cmpxchg.o ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += cmpxchg8b_emu.o atomic64_386_32.o @@ -40,7 +42,7 @@ else lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += thunk_64.o clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o - lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o + lib-y += copy_user_64.o copy_user_nocache_64.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S new file mode 100644 index 0000000..fca1782 --- /dev/null +++ b/arch/x86/lib/rwlock.S @@ -0,0 +1,44 @@ +/* Slow paths of read/write spinlocks. */ + +#include +#include +#include +#include + +#ifdef CONFIG_X86_32 +# define __lock_ptr eax +#else +# define __lock_ptr rdi +#endif + +ENTRY(__write_lock_failed) + CFI_STARTPROC + FRAME +0: LOCK_PREFIX + addl $RW_LOCK_BIAS, (%__lock_ptr) +1: rep; nop + cmpl $RW_LOCK_BIAS, (%__lock_ptr) + jne 1b + LOCK_PREFIX + subl $RW_LOCK_BIAS, (%__lock_ptr) + jnz 0b + ENDFRAME + ret + CFI_ENDPROC +END(__write_lock_failed) + +ENTRY(__read_lock_failed) + CFI_STARTPROC + FRAME +0: LOCK_PREFIX + incl (%__lock_ptr) +1: rep; nop + cmpl $1, (%__lock_ptr) + js 1b + LOCK_PREFIX + decl (%__lock_ptr) + js 0b + ENDFRAME + ret + CFI_ENDPROC +END(__read_lock_failed) diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S deleted file mode 100644 index 05ea55f..0000000 --- a/arch/x86/lib/rwlock_64.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Slow paths of read/write spinlocks. */ - -#include -#include -#include -#include - -/* rdi: pointer to rwlock_t */ -ENTRY(__write_lock_failed) - CFI_STARTPROC - LOCK_PREFIX - addl $RW_LOCK_BIAS,(%rdi) -1: rep - nop - cmpl $RW_LOCK_BIAS,(%rdi) - jne 1b - LOCK_PREFIX - subl $RW_LOCK_BIAS,(%rdi) - jnz __write_lock_failed - ret - CFI_ENDPROC -END(__write_lock_failed) - -/* rdi: pointer to rwlock_t */ -ENTRY(__read_lock_failed) - CFI_STARTPROC - LOCK_PREFIX - incl (%rdi) -1: rep - nop - cmpl $1,(%rdi) - js 1b - LOCK_PREFIX - decl (%rdi) - js __read_lock_failed - ret - CFI_ENDPROC -END(__read_lock_failed) diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S index 06691da..65b591d 100644 --- a/arch/x86/lib/semaphore_32.S +++ b/arch/x86/lib/semaphore_32.S @@ -14,8 +14,6 @@ */ #include -#include -#include #include #include @@ -31,46 +29,6 @@ */ .section .sched.text, "ax" -/* - * rw spinlock fallbacks - */ -#ifdef CONFIG_SMP -ENTRY(__write_lock_failed) - CFI_STARTPROC - FRAME -2: LOCK_PREFIX - addl $ RW_LOCK_BIAS,(%eax) -1: rep; nop - cmpl $ RW_LOCK_BIAS,(%eax) - jne 1b - LOCK_PREFIX - subl $ RW_LOCK_BIAS,(%eax) - jnz 2b - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__write_lock_failed) - -ENTRY(__read_lock_failed) - CFI_STARTPROC - FRAME -2: LOCK_PREFIX - incl (%eax) -1: rep; nop - cmpl $1,(%eax) - js 1b - LOCK_PREFIX - decl (%eax) - js 2b - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__read_lock_failed) - -#endif - -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM - /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) CFI_STARTPROC @@ -120,5 +78,3 @@ ENTRY(call_rwsem_downgrade_wake) ret CFI_ENDPROC ENDPROC(call_rwsem_downgrade_wake) - -#endif -- cgit v1.1 From a738669464a1e0d8e7b20f631120192f9cf7cfbd Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 19 Jul 2011 13:00:19 +0100 Subject: x86: Unify rwsem assembly implementation Rather than having two functionally identical implementations for 32- and 64-bit configurations, use the previously extended assembly abstractions to fold the rwsem two implementations into a shared one. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4E258DF3020000780004E3ED@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/lib/Makefile | 3 +- arch/x86/lib/rwsem.S | 136 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/lib/rwsem_64.S | 93 ------------------------------ arch/x86/lib/semaphore_32.S | 80 -------------------------- 4 files changed, 137 insertions(+), 175 deletions(-) create mode 100644 arch/x86/lib/rwsem.S delete mode 100644 arch/x86/lib/rwsem_64.S delete mode 100644 arch/x86/lib/semaphore_32.S (limited to 'arch/x86') diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index d3ed120..5d46ddc 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -21,6 +21,7 @@ lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_SMP) += rwlock.o +lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o obj-y += msr.o msr-reg.o msr-reg-export.o @@ -30,7 +31,6 @@ ifeq ($(CONFIG_X86_32),y) lib-y += atomic64_cx8_32.o lib-y += checksum_32.o lib-y += strstr_32.o - lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += semaphore_32.o lib-y += string_32.o lib-y += cmpxchg.o ifneq ($(CONFIG_X86_CMPXCHG64),y) @@ -43,6 +43,5 @@ else lib-y += thunk_64.o clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o copy_user_nocache_64.o - lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S new file mode 100644 index 0000000..5dff5f0 --- /dev/null +++ b/arch/x86/lib/rwsem.S @@ -0,0 +1,136 @@ +/* + * x86 semaphore implementation. + * + * (C) Copyright 1999 Linus Torvalds + * + * Portions Copyright 1999 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * rw semaphores implemented November 1999 by Benjamin LaHaise + */ + +#include +#include +#include + +#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg) +#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l) + +#ifdef CONFIG_X86_32 + +/* + * The semaphore operations have a special calling sequence that + * allow us to do a simpler in-line version of them. These routines + * need to convert that sequence back into the C sequence when + * there is contention on the semaphore. + * + * %eax contains the semaphore pointer on entry. Save the C-clobbered + * registers (%eax, %edx and %ecx) except %eax whish is either a return + * value or just clobbered.. + */ + +#define save_common_regs \ + pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0 + +#define restore_common_regs \ + popl_cfi %ecx; CFI_RESTORE ecx + + /* Avoid uglifying the argument copying x86-64 needs to do. */ + .macro movq src, dst + .endm + +#else + +/* + * x86-64 rwsem wrappers + * + * This interfaces the inline asm code to the slow-path + * C routines. We need to save the call-clobbered regs + * that the asm does not mark as clobbered, and move the + * argument from %rax to %rdi. + * + * NOTE! We don't need to save %rax, because the functions + * will always return the semaphore pointer in %rax (which + * is also the input argument to these helpers) + * + * The following can clobber %rdx because the asm clobbers it: + * call_rwsem_down_write_failed + * call_rwsem_wake + * but %rdi, %rsi, %rcx, %r8-r11 always need saving. + */ + +#define save_common_regs \ + pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ + pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ + pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ + pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ + pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ + pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ + pushq_cfi %r11; CFI_REL_OFFSET r11, 0 + +#define restore_common_regs \ + popq_cfi %r11; CFI_RESTORE r11; \ + popq_cfi %r10; CFI_RESTORE r10; \ + popq_cfi %r9; CFI_RESTORE r9; \ + popq_cfi %r8; CFI_RESTORE r8; \ + popq_cfi %rcx; CFI_RESTORE rcx; \ + popq_cfi %rsi; CFI_RESTORE rsi; \ + popq_cfi %rdi; CFI_RESTORE rdi + +#endif + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_down_read_failed) + CFI_STARTPROC + save_common_regs + __ASM_SIZE(push,_cfi) %__ASM_REG(dx) + CFI_REL_OFFSET __ASM_REG(dx), 0 + movq %rax,%rdi + call rwsem_down_read_failed + __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) + CFI_RESTORE __ASM_REG(dx) + restore_common_regs + ret + CFI_ENDPROC +ENDPROC(call_rwsem_down_read_failed) + +ENTRY(call_rwsem_down_write_failed) + CFI_STARTPROC + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed + restore_common_regs + ret + CFI_ENDPROC +ENDPROC(call_rwsem_down_write_failed) + +ENTRY(call_rwsem_wake) + CFI_STARTPROC + /* do nothing if still outstanding active readers */ + __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx) + jnz 1f + save_common_regs + movq %rax,%rdi + call rwsem_wake + restore_common_regs +1: ret + CFI_ENDPROC +ENDPROC(call_rwsem_wake) + +ENTRY(call_rwsem_downgrade_wake) + CFI_STARTPROC + save_common_regs + __ASM_SIZE(push,_cfi) %__ASM_REG(dx) + CFI_REL_OFFSET __ASM_REG(dx), 0 + movq %rax,%rdi + call rwsem_downgrade_wake + __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) + CFI_RESTORE __ASM_REG(dx) + restore_common_regs + ret + CFI_ENDPROC +ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S deleted file mode 100644 index 6774397..0000000 --- a/arch/x86/lib/rwsem_64.S +++ /dev/null @@ -1,93 +0,0 @@ -/* - * x86-64 rwsem wrappers - * - * This interfaces the inline asm code to the slow-path - * C routines. We need to save the call-clobbered regs - * that the asm does not mark as clobbered, and move the - * argument from %rax to %rdi. - * - * NOTE! We don't need to save %rax, because the functions - * will always return the semaphore pointer in %rax (which - * is also the input argument to these helpers) - * - * The following can clobber %rdx because the asm clobbers it: - * call_rwsem_down_write_failed - * call_rwsem_wake - * but %rdi, %rsi, %rcx, %r8-r11 always need saving. - */ - -#include -#include -#include -#include -#include - -#define save_common_regs \ - pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ - pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ - pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ - pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ - pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ - pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ - pushq_cfi %r11; CFI_REL_OFFSET r11, 0 - -#define restore_common_regs \ - popq_cfi %r11; CFI_RESTORE r11; \ - popq_cfi %r10; CFI_RESTORE r10; \ - popq_cfi %r9; CFI_RESTORE r9; \ - popq_cfi %r8; CFI_RESTORE r8; \ - popq_cfi %rcx; CFI_RESTORE rcx; \ - popq_cfi %rsi; CFI_RESTORE rsi; \ - popq_cfi %rdi; CFI_RESTORE rdi - -/* Fix up special calling conventions */ -ENTRY(call_rwsem_down_read_failed) - CFI_STARTPROC - save_common_regs - pushq_cfi %rdx - CFI_REL_OFFSET rdx, 0 - movq %rax,%rdi - call rwsem_down_read_failed - popq_cfi %rdx - CFI_RESTORE rdx - restore_common_regs - ret - CFI_ENDPROC -ENDPROC(call_rwsem_down_read_failed) - -ENTRY(call_rwsem_down_write_failed) - CFI_STARTPROC - save_common_regs - movq %rax,%rdi - call rwsem_down_write_failed - restore_common_regs - ret - CFI_ENDPROC -ENDPROC(call_rwsem_down_write_failed) - -ENTRY(call_rwsem_wake) - CFI_STARTPROC - decl %edx /* do nothing if still outstanding active readers */ - jnz 1f - save_common_regs - movq %rax,%rdi - call rwsem_wake - restore_common_regs -1: ret - CFI_ENDPROC -ENDPROC(call_rwsem_wake) - -/* Fix up special calling conventions */ -ENTRY(call_rwsem_downgrade_wake) - CFI_STARTPROC - save_common_regs - pushq_cfi %rdx - CFI_REL_OFFSET rdx, 0 - movq %rax,%rdi - call rwsem_downgrade_wake - popq_cfi %rdx - CFI_RESTORE rdx - restore_common_regs - ret - CFI_ENDPROC -ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S deleted file mode 100644 index 65b591d..0000000 --- a/arch/x86/lib/semaphore_32.S +++ /dev/null @@ -1,80 +0,0 @@ -/* - * i386 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise - */ - -#include -#include -#include - -/* - * The semaphore operations have a special calling sequence that - * allow us to do a simpler in-line version of them. These routines - * need to convert that sequence back into the C sequence when - * there is contention on the semaphore. - * - * %eax contains the semaphore pointer on entry. Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax whish is either a return - * value or just clobbered.. - */ - .section .sched.text, "ax" - -/* Fix up special calling conventions */ -ENTRY(call_rwsem_down_read_failed) - CFI_STARTPROC - pushl_cfi %ecx - CFI_REL_OFFSET ecx,0 - pushl_cfi %edx - CFI_REL_OFFSET edx,0 - call rwsem_down_read_failed - popl_cfi %edx - popl_cfi %ecx - ret - CFI_ENDPROC - ENDPROC(call_rwsem_down_read_failed) - -ENTRY(call_rwsem_down_write_failed) - CFI_STARTPROC - pushl_cfi %ecx - CFI_REL_OFFSET ecx,0 - calll rwsem_down_write_failed - popl_cfi %ecx - ret - CFI_ENDPROC - ENDPROC(call_rwsem_down_write_failed) - -ENTRY(call_rwsem_wake) - CFI_STARTPROC - decw %dx /* do nothing if still outstanding active readers */ - jnz 1f - pushl_cfi %ecx - CFI_REL_OFFSET ecx,0 - call rwsem_wake - popl_cfi %ecx -1: ret - CFI_ENDPROC - ENDPROC(call_rwsem_wake) - -/* Fix up special calling conventions */ -ENTRY(call_rwsem_downgrade_wake) - CFI_STARTPROC - pushl_cfi %ecx - CFI_REL_OFFSET ecx,0 - pushl_cfi %edx - CFI_REL_OFFSET edx,0 - call rwsem_downgrade_wake - popl_cfi %edx - popl_cfi %ecx - ret - CFI_ENDPROC - ENDPROC(call_rwsem_downgrade_wake) -- cgit v1.1 From a750036f35cda160ef77408ec92c3dc41f8feebb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 19 Jul 2011 13:00:45 +0100 Subject: x86: Fix write lock scalability 64-bit issue With the write lock path simply subtracting RW_LOCK_BIAS there is, on large systems, the theoretical possibility of overflowing the 32-bit value that was used so far (namely if 128 or more CPUs manage to do the subtraction, but don't get to do the inverse addition in the failure path quickly enough). A first measure is to modify RW_LOCK_BIAS itself - with the new value chosen, it is good for up to 2048 CPUs each allowed to nest over 2048 times on the read path without causing an issue. Quite possibly it would even be sufficient to adjust the bias a little further, assuming that allowing for significantly less nesting would suffice. However, as the original value chosen allowed for even more nesting levels, to support more than 2048 CPUs (possible currently only for 64-bit kernels) the lock itself gets widened to 64 bits. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4E258E0D020000780004E3F0@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 2 ++ arch/x86/include/asm/rwlock.h | 43 ++++++++++++++++++++++++++++++++++- arch/x86/include/asm/spinlock.h | 37 ++++++++++++++++++------------ arch/x86/include/asm/spinlock_types.h | 6 +---- arch/x86/lib/rwlock.S | 12 +++++----- arch/x86/lib/thunk_64.S | 1 - 6 files changed, 73 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 5890beb..9412d65 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -3,9 +3,11 @@ #ifdef __ASSEMBLY__ # define __ASM_FORM(x) x +# define __ASM_FORM_COMMA(x) x, # define __ASM_EX_SEC .section __ex_table, "a" #else # define __ASM_FORM(x) " " #x " " +# define __ASM_FORM_COMMA(x) " " #x "," # define __ASM_EX_SEC " .section __ex_table,\"a\"\n" #endif diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h index 6a8c0d6..a5370a0 100644 --- a/arch/x86/include/asm/rwlock.h +++ b/arch/x86/include/asm/rwlock.h @@ -1,7 +1,48 @@ #ifndef _ASM_X86_RWLOCK_H #define _ASM_X86_RWLOCK_H -#define RW_LOCK_BIAS 0x01000000 +#include + +#if CONFIG_NR_CPUS <= 2048 + +#ifndef __ASSEMBLY__ +typedef union { + s32 lock; + s32 write; +} arch_rwlock_t; +#endif + +#define RW_LOCK_BIAS 0x00100000 +#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##l) +#define READ_LOCK_ATOMIC(n) atomic_##n +#define WRITE_LOCK_ADD(n) __ASM_FORM_COMMA(addl n) +#define WRITE_LOCK_SUB(n) __ASM_FORM_COMMA(subl n) +#define WRITE_LOCK_CMP RW_LOCK_BIAS + +#else /* CONFIG_NR_CPUS > 2048 */ + +#include + +#ifndef __ASSEMBLY__ +typedef union { + s64 lock; + struct { + u32 read; + s32 write; + }; +} arch_rwlock_t; +#endif + +#define RW_LOCK_BIAS (_AC(1,L) << 32) +#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##q) +#define READ_LOCK_ATOMIC(n) atomic64_##n +#define WRITE_LOCK_ADD(n) __ASM_FORM(incl) +#define WRITE_LOCK_SUB(n) __ASM_FORM(decl) +#define WRITE_LOCK_CMP 1 + +#endif /* CONFIG_NR_CPUS */ + +#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } /* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 3089f70..e9e51f7 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -2,7 +2,6 @@ #define _ASM_X86_SPINLOCK_H #include -#include #include #include #include @@ -234,7 +233,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) */ static inline int arch_read_can_lock(arch_rwlock_t *lock) { - return (int)(lock)->lock > 0; + return lock->lock > 0; } /** @@ -243,12 +242,12 @@ static inline int arch_read_can_lock(arch_rwlock_t *lock) */ static inline int arch_write_can_lock(arch_rwlock_t *lock) { - return (lock)->lock == RW_LOCK_BIAS; + return lock->write == WRITE_LOCK_CMP; } static inline void arch_read_lock(arch_rwlock_t *rw) { - asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" + asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t" "jns 1f\n" "call __read_lock_failed\n\t" "1:\n" @@ -257,47 +256,55 @@ static inline void arch_read_lock(arch_rwlock_t *rw) static inline void arch_write_lock(arch_rwlock_t *rw) { - asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" + asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t" "jz 1f\n" "call __write_lock_failed\n\t" "1:\n" - ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); + ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS) + : "memory"); } static inline int arch_read_trylock(arch_rwlock_t *lock) { - atomic_t *count = (atomic_t *)lock; + READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock; - if (atomic_dec_return(count) >= 0) + if (READ_LOCK_ATOMIC(dec_return)(count) >= 0) return 1; - atomic_inc(count); + READ_LOCK_ATOMIC(inc)(count); return 0; } static inline int arch_write_trylock(arch_rwlock_t *lock) { - atomic_t *count = (atomic_t *)lock; + atomic_t *count = (atomic_t *)&lock->write; - if (atomic_sub_and_test(RW_LOCK_BIAS, count)) + if (atomic_sub_and_test(WRITE_LOCK_CMP, count)) return 1; - atomic_add(RW_LOCK_BIAS, count); + atomic_add(WRITE_LOCK_CMP, count); return 0; } static inline void arch_read_unlock(arch_rwlock_t *rw) { - asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); + asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0" + :"+m" (rw->lock) : : "memory"); } static inline void arch_write_unlock(arch_rwlock_t *rw) { - asm volatile(LOCK_PREFIX "addl %1, %0" - : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); + asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0" + : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory"); } #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) +#undef READ_LOCK_SIZE +#undef READ_LOCK_ATOMIC +#undef WRITE_LOCK_ADD +#undef WRITE_LOCK_SUB +#undef WRITE_LOCK_CMP + #define arch_spin_relax(lock) cpu_relax() #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index dcb48b2..7c7a486 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -11,10 +11,6 @@ typedef struct arch_spinlock { #define __ARCH_SPIN_LOCK_UNLOCKED { 0 } -typedef struct { - unsigned int lock; -} arch_rwlock_t; - -#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } +#include #endif /* _ASM_X86_SPINLOCK_TYPES_H */ diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S index fca1782..1cad221 100644 --- a/arch/x86/lib/rwlock.S +++ b/arch/x86/lib/rwlock.S @@ -15,12 +15,12 @@ ENTRY(__write_lock_failed) CFI_STARTPROC FRAME 0: LOCK_PREFIX - addl $RW_LOCK_BIAS, (%__lock_ptr) + WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr) 1: rep; nop - cmpl $RW_LOCK_BIAS, (%__lock_ptr) + cmpl $WRITE_LOCK_CMP, (%__lock_ptr) jne 1b LOCK_PREFIX - subl $RW_LOCK_BIAS, (%__lock_ptr) + WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr) jnz 0b ENDFRAME ret @@ -31,12 +31,12 @@ ENTRY(__read_lock_failed) CFI_STARTPROC FRAME 0: LOCK_PREFIX - incl (%__lock_ptr) + READ_LOCK_SIZE(inc) (%__lock_ptr) 1: rep; nop - cmpl $1, (%__lock_ptr) + READ_LOCK_SIZE(cmp) $1, (%__lock_ptr) js 1b LOCK_PREFIX - decl (%__lock_ptr) + READ_LOCK_SIZE(dec) (%__lock_ptr) js 0b ENDFRAME ret diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index d5b088b..a63efd6 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -8,7 +8,6 @@ #include #include #include -#include /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func, put_ret_addr_in_rdi=0 -- cgit v1.1 From ac619f4eba45da10053fc991f8a5d47b3be79fa3 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 19 Jul 2011 11:39:03 +0100 Subject: x86: Serialize SMP bootup CMOS accesses on rtc_lock With CPU hotplug, there is a theoretical race between other CMOS (namely RTC) accesses and those done in the SMP secondary processor bringup path. I am unware of the problem having been noticed by anyone in practice, but it would very likely be rather spurious and very hard to reproduce. So to be on the safe side, acquire rtc_lock around those accesses. Signed-off-by: Jan Beulich Cc: John Stultz Link: http://lkml.kernel.org/r/4E257AE7020000780004E2FF@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/smpboot_hooks.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h index 725b778..49adfd7 100644 --- a/arch/x86/include/asm/smpboot_hooks.h +++ b/arch/x86/include/asm/smpboot_hooks.h @@ -10,7 +10,11 @@ static inline void smpboot_clear_io_apic_irqs(void) static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(0xa, 0xf); + spin_unlock_irqrestore(&rtc_lock, flags); local_flush_tlb(); pr_debug("1.\n"); *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) = @@ -23,6 +27,8 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) static inline void smpboot_restore_warm_reset_vector(void) { + unsigned long flags; + /* * Install writable page 0 entry to set BIOS data area. */ @@ -32,7 +38,9 @@ static inline void smpboot_restore_warm_reset_vector(void) * Paranoid: Set warm reset code and vector here back * to default values. */ + spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(0, 0xf); + spin_unlock_irqrestore(&rtc_lock, flags); *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0; } -- cgit v1.1 From ef68c8f87ed13f65df867dddf36c0e185b27b942 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 19 Jul 2011 11:53:07 +0100 Subject: x86: Serialize EFI time accesses on rtc_lock The EFI specification requires that callers of the time related runtime functions serialize with other CMOS accesses in the kernel, as the EFI time functions may choose to also use the legacy CMOS RTC. Besides fixing a latent bug, this is a prerequisite to safely enable the rtc-efi driver for x86, which ought to be preferred over rtc-cmos on all EFI platforms. Signed-off-by: Jan Beulich Acked-by: Matthew Garrett Cc: Link: http://lkml.kernel.org/r/4E257E33020000780004E319@nat28.tlf.novell.com Signed-off-by: Ingo Molnar Cc: Matthew Garrett --- arch/x86/platform/efi/efi.c | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 474356b..b8823d5 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -79,26 +79,50 @@ early_param("add_efi_memmap", setup_add_efi_memmap); static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) { - return efi_call_virt2(get_time, tm, tc); + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&rtc_lock, flags); + status = efi_call_virt2(get_time, tm, tc); + spin_unlock_irqrestore(&rtc_lock, flags); + return status; } static efi_status_t virt_efi_set_time(efi_time_t *tm) { - return efi_call_virt1(set_time, tm); + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&rtc_lock, flags); + status = efi_call_virt1(set_time, tm); + spin_unlock_irqrestore(&rtc_lock, flags); + return status; } static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, efi_time_t *tm) { - return efi_call_virt3(get_wakeup_time, - enabled, pending, tm); + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&rtc_lock, flags); + status = efi_call_virt3(get_wakeup_time, + enabled, pending, tm); + spin_unlock_irqrestore(&rtc_lock, flags); + return status; } static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) { - return efi_call_virt2(set_wakeup_time, - enabled, tm); + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&rtc_lock, flags); + status = efi_call_virt2(set_wakeup_time, + enabled, tm); + spin_unlock_irqrestore(&rtc_lock, flags); + return status; } static efi_status_t virt_efi_get_variable(efi_char16_t *name, @@ -164,11 +188,14 @@ static efi_status_t __init phys_efi_set_virtual_address_map( static efi_status_t __init phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) { + unsigned long flags; efi_status_t status; + spin_lock_irqsave(&rtc_lock, flags); efi_call_phys_prelog(); status = efi_call_phys2(efi_phys.get_time, tm, tc); efi_call_phys_epilog(); + spin_unlock_irqrestore(&rtc_lock, flags); return status; } -- cgit v1.1 From a6c23905ff0d6bbddf590ef0838489ee0f6c74ac Mon Sep 17 00:00:00 2001 From: Greg Dietsche Date: Thu, 30 Jun 2011 20:10:53 -0500 Subject: x86, smpboot: Mark the names[] array in __inquire_remote_apic() as const This array is read-only. Make it explicit by marking as const. Signed-off-by: Greg Dietsche Link: http://lkml.kernel.org/r/1309482653-23648-1-git-send-email-Gregory.Dietsche@cuw.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a3c430b..e02653a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -425,7 +425,7 @@ static void impress_friends(void) void __inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; - char *names[] = { "ID", "VERSION", "SPIV" }; + const char * const names[] = { "ID", "VERSION", "SPIV" }; int timeout; u32 status; -- cgit v1.1 From 38175051f8e79c5e9f65daab7200fd8d1fa4a912 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Mon, 11 Jul 2011 19:01:38 +0400 Subject: x86, quirks: Use pci_dev->revision This code uses PCI_CLASS_REVISION instead of PCI_REVISION_ID, so it wasn't converted by commit 44c10138fd4bbc ("PCI: Change all drivers to use pci_device->revision") before being moved to arch/x86/... Signed-off-by: Sergei Shtylyov Cc: Jesse Barnes Cc: Dave Jones Link: http://lkml.kernel.org/r/201107111901.39281.sshtylyov@ru.mvista.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 8bbe8c5..b78643d 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -10,7 +10,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { - u8 config, rev; + u8 config; u16 word; /* BIOS may enable hardware IRQ balancing for @@ -18,8 +18,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) * based platforms. * Disable SW irqbalance/affinity on those platforms. */ - pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); - if (rev > 0x9) + if (dev->revision > 0x9) return; /* enable access to config space*/ -- cgit v1.1 From 43605ef188cd39708ddc5e3adc47b337b6ebe40e Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 12 Jul 2011 17:49:29 +0100 Subject: x86, config: Introduce an INTEL_MID configuration We need to carve up the configuration between: - MID general - Moorestown specific - Medfield specific - Future devices As a base point create an INTEL_MID configuration property. We make the existing MRST configuration a sub-option. This means that the rest of the kernel config can still use X86_MRST checks without anything going backwards. After this is merged future patches will tidy up which devices are MID and which are X86_MRST, as well as add options for Medfield. Signed-off-by: Alan Cox Link: http://lkml.kernel.org/r/20110712164859.7642.84136.stgit@bob.linux.org.uk Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 37357a5..80ef419 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -384,12 +384,21 @@ config X86_INTEL_CE This option compiles in support for the CE4100 SOC for settop boxes and media devices. +config X86_INTEL_MID + bool "Intel MID platform support" + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + ---help--- + Select to build a kernel capable of supporting Intel MID platform + systems which do not have the PCI legacy interfaces (Moorestown, + Medfield). If you are building for a PC class system say N here. + +if X86_INTEL_MID + config X86_MRST bool "Moorestown MID platform" depends on PCI depends on PCI_GOANY - depends on X86_32 - depends on X86_EXTENDED_PLATFORM depends on X86_IO_APIC select APB_TIMER select I2C @@ -404,6 +413,8 @@ config X86_MRST nor standard legacy replacement devices/features. e.g. Moorestown does not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. +endif + config X86_RDC321X bool "RDC R-321x SoC" depends on X86_32 -- cgit v1.1 From 050438ed5a05b25cdf287f5691e56a58c2606997 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 14 Jul 2011 09:34:37 +0800 Subject: kexec, x86: Fix incorrect jump back address if not preserving context In kexec jump support, jump back address passed to the kexeced kernel via function calling ABI, that is, the function call return address is the jump back entry. Furthermore, jump back entry == 0 should be used to signal that the jump back or preserve context is not enabled in the original kernel. But in the current implementation the stack position used for function call return address is not cleared context preservation is disabled. The patch fixes this bug. Reported-and-tested-by: Yin Kangkai Signed-off-by: Huang Ying Cc: Eric W. Biederman Cc: Vivek Goyal Cc: Link: http://lkml.kernel.org/r/1310607277-25029-1-git-send-email-ying.huang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/relocate_kernel_32.S | 2 ++ arch/x86/kernel/relocate_kernel_64.S | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 4123553..36818f8 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -97,6 +97,8 @@ relocate_kernel: ret identity_mapped: + /* set return address to 0 if not preserving context */ + pushl $0 /* store the start address on the stack */ pushl %edx diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 4de8f5b..7a6f3b3 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -100,6 +100,8 @@ relocate_kernel: ret identity_mapped: + /* set return address to 0 if not preserving context */ + pushq $0 /* store the start address on the stack */ pushq %rdx -- cgit v1.1 From 497888cf69bf607ac1fe061a6437e0a670b0022f Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Thu, 14 Jul 2011 15:07:13 +0300 Subject: treewide: fix potentially dangerous trailing ';' in #defined values/expressions All these are instances of #define NAME value; or #define NAME(params_opt) value; These of course fail to build when used in contexts like if(foo $OP NAME) while(bar $OP NAME) and may silently generate the wrong code in contexts such as foo = NAME + 1; /* foo = value; + 1; */ bar = NAME - 1; /* bar = value; - 1; */ baz = NAME & quux; /* baz = value; & quux; */ Reported on comp.lang.c, Message-ID: Initial analysis of the dangers provided by Keith Thompson in that thread. There are many more instances of more complicated macros having unnecessary trailing semicolons, but this pile seems to be all of the cases of simple values suffering from the problem. (Thus things that are likely to be found in one of the contexts above, more complicated ones aren't.) Signed-off-by: Phil Carmody Signed-off-by: Jiri Kosina --- arch/x86/kernel/i387.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 12aff25..739d859 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -321,7 +321,7 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd) return tmp; } -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) #define FP_EXP_TAG_VALID 0 #define FP_EXP_TAG_ZERO 1 #define FP_EXP_TAG_SPECIAL 2 -- cgit v1.1 From f53173e47dee5f7514d264796bec58d43ed0f67f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 21 Jul 2011 20:06:25 +0400 Subject: x86, perf: P4 PMU - Fix typos in comments and style cleanup This patch: - fixes typos in comments and clarifies the text - renames obscure p4_event_alias::original and ::alter members to ::original and ::alternative as appropriate - drops parenthesis from the return of p4_get_alias_event() No functional changes. Reported-by: Ingo Molnar Signed-off-by: Cyrill Gorcunov Link: http://lkml.kernel.org/r/20110721160625.GX7492@sun Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event_p4.h | 2 +- arch/x86/kernel/cpu/perf_event_p4.c | 66 ++++++++++++++++-------------------- 2 files changed, 31 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 4d86c86..4f7e67e 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -133,7 +133,7 @@ /* * In case of event aliasing we need to preserve some - * caller bits otherwise the mapping won't be complete. + * caller bits, otherwise the mapping won't be complete. */ #define P4_CONFIG_EVENT_ALIAS_MASK \ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \ diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 8b7a0c3..7809d2b 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -571,35 +571,34 @@ static __initconst const u64 p4_hw_cache_event_ids }; /* - * Because of Netburst being quite restricted in now - * many same events can run simultaneously, we use - * event aliases, ie different events which have the - * same functionallity but use non-intersected resources - * (ESCR/CCCR/couter registers). This allow us to run - * two or more semi-same events together. It is done - * transparently to a user space. + * Because of Netburst being quite restricted in how many + * identical events may run simultaneously, we introduce event aliases, + * ie the different events which have the same functionality but + * utilize non-intersected resources (ESCR/CCCR/counter registers). * - * Never set any cusom internal bits such as P4_CONFIG_HT, - * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are - * either up-to-dated automatically either not appliable - * at all. + * This allow us to relax restrictions a bit and run two or more + * identical events together. * - * And be really carefull choosing aliases! + * Never set any custom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up to date automatically or not applicable at all. */ struct p4_event_alias { - u64 orig; - u64 alter; + u64 original; + u64 alternative; } p4_event_aliases[] = { { /* - * Non-halted cycles can be substituted with - * non-sleeping cycles (see Intel SDM Vol3b for - * details). + * Non-halted cycles can be substituted with non-sleeping cycles (see + * Intel SDM Vol3b for details). We need this alias to be able + * to run nmi-watchdog and 'perf top' (or any other user space tool + * which is interested in running PERF_COUNT_HW_CPU_CYCLES) + * simultaneously. */ - .orig = + .original = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), - .alter = + .alternative = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| @@ -620,25 +619,21 @@ static u64 p4_get_alias_event(u64 config) int i; /* - * Probably we're lucky and don't have to do - * matching over all config bits. + * Only event with special mark is allowed, + * we're to be sure it didn't come as malformed + * RAW event. */ if (!(config & P4_CONFIG_ALIASABLE)) return 0; config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; - /* - * If an event was previously swapped to the alter config - * we should swap it back otherwise contnention on registers - * will return back. - */ for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { - if (config_match == p4_event_aliases[i].orig) { - config_match = p4_event_aliases[i].alter; + if (config_match == p4_event_aliases[i].original) { + config_match = p4_event_aliases[i].alternative; break; - } else if (config_match == p4_event_aliases[i].alter) { - config_match = p4_event_aliases[i].orig; + } else if (config_match == p4_event_aliases[i].alternative) { + config_match = p4_event_aliases[i].original; break; } } @@ -646,8 +641,7 @@ static u64 p4_get_alias_event(u64 config) if (i >= ARRAY_SIZE(p4_event_aliases)) return 0; - return config_match | - (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); + return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); } static u64 p4_general_events[PERF_COUNT_HW_MAX] = { @@ -1229,9 +1223,9 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign again: /* - * Aliases are swappable so we may hit circular - * lock if both original config and alias need - * resources (MSR registers) which already busy. + * It's possible to hit a circular lock + * between original and alternative events + * if both are scheduled already. */ if (pass > 2) goto done; @@ -1251,7 +1245,7 @@ again: cntr_idx = p4_next_cntr(thread, used_mask, bind); if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { /* - * Probably an event alias is still available. + * Check whether an event alias is still available. */ config_alias = p4_get_alias_event(hwc->config); if (!config_alias) -- cgit v1.1 From 1ac2e6ca44e13a087eb7438d284f887097ee7e84 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 7 Jun 2011 11:49:55 +0200 Subject: x86, perf: Make copy_from_user_nmi() a library function copy_from_user_nmi() is used in oprofile and perf. Moving it to other library functions like copy_from_user(). As this is x86 code for 32 and 64 bits, create a new file usercopy.c for unified code. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110607172413.GJ20052@erda.amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 3 +++ arch/x86/kernel/cpu/perf_event.c | 35 -------------------------------- arch/x86/lib/Makefile | 2 +- arch/x86/lib/usercopy.c | 43 ++++++++++++++++++++++++++++++++++++++++ arch/x86/oprofile/backtrace.c | 39 +----------------------------------- 5 files changed, 48 insertions(+), 74 deletions(-) create mode 100644 arch/x86/lib/usercopy.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 99ddd14..36361bf 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -555,6 +555,9 @@ struct __large_struct { unsigned long buf[100]; }; #endif /* CONFIG_X86_WP_WORKS_OK */ +extern unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n); + /* * movsl can be slow when source and dest are not both 8-byte aligned */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b7a010f..4ee3abf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -67,40 +66,6 @@ enum extra_reg_type { EXTRA_REG_MAX /* number of entries needed */ }; -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) -{ - unsigned long offset, addr = (unsigned long)from; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page); - memcpy(to, map+offset, size); - kunmap_atomic(map); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; -} - struct event_constraint { union { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f2479f1..6ba4773 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o lib-y += thunk_$(BITS).o -lib-y += usercopy_$(BITS).o getuser.o putuser.o +lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c new file mode 100644 index 0000000..97be9cb --- /dev/null +++ b/arch/x86/lib/usercopy.c @@ -0,0 +1,43 @@ +/* + * User address space access functions. + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include + +/* + * best effort, GUP based copy_from_user() that is NMI-safe + */ +unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; + + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; + + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + + map = kmap_atomic(page); + memcpy(to, map+offset, size); + kunmap_atomic(map); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} +EXPORT_SYMBOL_GPL(copy_from_user_nmi); diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 32f78eb..bff89df 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -12,10 +12,9 @@ #include #include #include -#include +#include #include -#include #include static int backtrace_stack(void *data, char *name) @@ -38,42 +37,6 @@ static struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack, }; -/* from arch/x86/kernel/cpu/perf_event.c: */ - -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) -{ - unsigned long offset, addr = (unsigned long)from; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page); - memcpy(to, map+offset, size); - kunmap_atomic(map); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; -} - #ifdef CONFIG_COMPAT static struct stack_frame_ia32 * dump_user_backtrace_32(struct stack_frame_ia32 *head) -- cgit v1.1 From b7798d28ec15d20fd34b70fa57eb13f0cf6d1ecd Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 13 May 2011 09:04:59 +0800 Subject: x86: Make Dell Latitude E5420 use reboot=pci Rebooting on the Dell E5420 often hangs with the keyboard or ACPI methods, but is reliable via the PCI method. [ hpa: this was deferred because we believed for a long time that the recent reshuffling of the boot priorities in commit 660e34cebf0a11d54f2d5dd8838607452355f321 fixed this platform. Unfortunately that turned out to be incorrect. ] Signed-off-by: Daniel J Blueman Link: http://lkml.kernel.org/r/1305248699-2347-1-git-send-email-daniel.blueman@gmail.com Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 14eed21..3dfd38f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -427,6 +427,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), }, }, + { /* Handle problems with rebooting on the Latitude E5420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), + }, + }, { } }; -- cgit v1.1 From a536877e77f73ea22d12d94a019fedd9671b6acd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 21 Jul 2011 11:22:21 -0700 Subject: x86: Make Dell Latitude E6420 use reboot=pci Yet another variant of the Dell Latitude series which requires reboot=pci. From the E5420 bug report by Daniel J Blueman: > The E6420 is affected also (same platform, different casing and > features), which provides an external confirmation of the issue; I can > submit a patch for that later or include it if you prefer: > http://linux.koolsolutions.com/2009/08/04/howto-fix-linux-hangfreeze-during-reboots-and-restarts/ Reported-by: Daniel J Blueman Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 3dfd38f..9242436 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -435,6 +435,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), }, }, + { /* Handle problems with rebooting on the Latitude E6420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), + }, + }, { } }; -- cgit v1.1 From ae7bd11b471931752e5609094ca0a49386590524 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 21 Jul 2011 13:34:05 -0700 Subject: clocksource: Change __ARCH_HAS_CLOCKSOURCE_DATA to a CONFIG option The machinery for __ARCH_HAS_CLOCKSOURCE_DATA assumed a file in asm-generic would be the default for architectures without their own file in asm/, but that is not how it works. Replace it with a Kconfig option instead. Link: http://lkml.kernel.org/r/4E288AA6.7090804@zytor.com Signed-off-by: H. Peter Anvin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Tony Luck --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/clocksource.h | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da34972..c1e41bc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -93,6 +93,10 @@ config CLOCKSOURCE_WATCHDOG config GENERIC_CLOCKEVENTS def_bool y +config ARCH_CLOCKSOURCE_DATA + def_bool y + depends on X86_64 + config GENERIC_CLOCKEVENTS_BROADCAST def_bool y depends on X86_64 || (X86_32 && X86_LOCAL_APIC) diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index 3882c65..0bdbbb3 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -5,8 +5,6 @@ #ifdef CONFIG_X86_64 -#define __ARCH_HAS_CLOCKSOURCE_DATA - #define VCLOCK_NONE 0 /* No vDSO clock available. */ #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ #define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ -- cgit v1.1 From aafade242ff24fac3aabf61c7861dfa44a3c2445 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 21 Jul 2011 15:47:10 -0400 Subject: x86-64, vdso: Do not allocate memory for the vDSO We can map the vDSO straight from kernel data, saving a few page allocations. As an added bonus, the deleted code contained a memory leak. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/2c4ed5c2c2e93603790229e0c3403ae506ccc0cb.1311277573.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso.S | 15 +++++++++++++-- arch/x86/vdso/vma.c | 25 ++++++------------------- 2 files changed, 19 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S index 1d3aa6b..1b979c1 100644 --- a/arch/x86/vdso/vdso.S +++ b/arch/x86/vdso/vdso.S @@ -1,10 +1,21 @@ +#include +#include #include -__INITDATA +__PAGE_ALIGNED_DATA .globl vdso_start, vdso_end + .align PAGE_SIZE vdso_start: .incbin "arch/x86/vdso/vdso.so" vdso_end: -__FINIT +.previous + + .globl vdso_pages + .bss + .align 8 + .type vdso_pages, @object +vdso_pages: + .zero (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE * 8 + .size vdso_pages, .-vdso_pages diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index c39938d..316fbca 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -14,13 +14,14 @@ #include #include #include +#include unsigned int __read_mostly vdso_enabled = 1; extern char vdso_start[], vdso_end[]; extern unsigned short vdso_sync_cpuid; -static struct page **vdso_pages; +extern struct page *vdso_pages[]; static unsigned vdso_size; static void __init patch_vdso(void *vdso, size_t len) @@ -54,7 +55,7 @@ found: apply_alternatives(alt_data, alt_data + alt_sec->sh_size); } -static int __init init_vdso_vars(void) +static int __init init_vdso(void) { int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; int i; @@ -62,26 +63,12 @@ static int __init init_vdso_vars(void) patch_vdso(vdso_start, vdso_end - vdso_start); vdso_size = npages << PAGE_SHIFT; - vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); - if (!vdso_pages) - goto oom; - for (i = 0; i < npages; i++) { - struct page *p; - p = alloc_page(GFP_KERNEL); - if (!p) - goto oom; - vdso_pages[i] = p; - copy_page(page_address(p), vdso_start + i*PAGE_SIZE); - } + for (i = 0; i < npages; i++) + vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE); return 0; - - oom: - printk("Cannot allocate vdso\n"); - vdso_enabled = 0; - return -ENOMEM; } -subsys_initcall(init_vdso_vars); +subsys_initcall(init_vdso); struct linux_binprm; -- cgit v1.1 From 5dea1c88ed11a1221581c4b202f053c4fc138704 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 22 Jul 2011 14:39:48 +0930 Subject: lguest: use a special 1:1 linear pagetable mode until first switch. The Host used to create some page tables for the Guest to use at the top of Guest memory; it would then tell the Guest where this was. In particular, it created linear mappings for 0 and 0xC0000000 addresses because lguest used to switch to its real page tables quite late in boot. However, since d50d8fe19 Linux initialized boot page tables in head_32.S even before the "are we lguest?" boot jump. So, now we can simplify things: the Host pagetable code assumes 1:1 linear mapping until it first calls the LHCALL_NEW_PGTABLE hypercall, which we now do before we reach C code. This also means that the Host doesn't need to know anything about the Guest's PAGE_OFFSET. (Non-Linux guests might not even have such a thing). Signed-off-by: Rusty Russell --- arch/x86/kernel/asm-offsets_32.c | 1 - arch/x86/lguest/boot.c | 11 +++++------ arch/x86/lguest/i386_head.S | 9 +++++++-- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index c29d631..395a10e 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -63,7 +63,6 @@ void foo(void) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); - OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); BLANK(); OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index db832fd..719a32c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -520,17 +520,16 @@ static unsigned long lguest_read_cr2(void) /* See lguest_set_pte() below. */ static bool cr3_changed = false; +static unsigned long current_cr3; /* * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0. Keep a local copy, and tell the Host when it changes. The only - * difference is that our local copy is in lguest_data because the Host needs - * to set it upon our initial hypercall. + * cr0. Keep a local copy, and tell the Host when it changes. */ static void lguest_write_cr3(unsigned long cr3) { - lguest_data.pgdir = cr3; lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); + current_cr3 = cr3; /* These two page tables are simple, linear, and used during boot */ if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) @@ -539,7 +538,7 @@ static void lguest_write_cr3(unsigned long cr3) static unsigned long lguest_read_cr3(void) { - return lguest_data.pgdir; + return current_cr3; } /* cr4 is used to enable and disable PGE, but we don't care. */ @@ -758,7 +757,7 @@ static void lguest_pmd_clear(pmd_t *pmdp) static void lguest_flush_tlb_single(unsigned long addr) { /* Simply set it to zero: if it was not, it will fault back in. */ - lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); + lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); } /* diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 4f420c2f..863b03a 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -27,13 +27,18 @@ .section .init.text, "ax", @progbits ENTRY(lguest_entry) /* - * We make the "initialization" hypercall now to tell the Host about - * us, and also find out where it put our page tables. + * We make the "initialization" hypercall now to tell the Host where + * our lguest_data struct is. */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx int $LGUEST_TRAP_ENTRY + /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ + movl $LHCALL_NEW_PGTABLE, %eax + movl $(initial_page_table - __PAGE_OFFSET), %ebx + int $LGUEST_TRAP_ENTRY + /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp -- cgit v1.1 From 7e1941444f808d8001aa3b63588150c516321a3c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 22 Jul 2011 14:39:49 +0930 Subject: lguest: remove remaining vmcall We switch back from using vmcall in 091ebf07a2408f9a56634caa0f86d9360e9af23b because it was unreliable under kvm, but I missed one (rarely-used) place. Signed-off-by: Rusty Russell --- arch/x86/lguest/i386_head.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 863b03a..c8c95e5 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -101,12 +101,8 @@ send_interrupts: */ pushl %eax movl $LHCALL_SEND_INTERRUPTS, %eax - /* - * This is a vmcall instruction (same thing that KVM uses). Older - * assembler versions might not know the "vmcall" instruction, so we - * create one manually here. - */ - .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + /* This is the actual hypercall trap. */ + int $LGUEST_TRAP_ENTRY /* Put eax back the way we found it. */ popl %eax ret -- cgit v1.1 From 9f54288def3f92b7805eb6d4b1ddcd73ecf6e889 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 22 Jul 2011 14:39:50 +0930 Subject: lguest: update comments Also removes a long-unused #define and an extraneous semicolon. Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 1 + arch/x86/lguest/boot.c | 21 +++++++++++++++------ arch/x86/lguest/i386_head.S | 18 +++++++++++------- 3 files changed, 27 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index b60f292..879fd7d 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -61,6 +61,7 @@ hcall(unsigned long call, : "memory"); return call; } +/*:*/ /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 719a32c..7427990 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -71,7 +71,8 @@ #include #include /* for struct machine_ops */ -/*G:010 Welcome to the Guest! +/*G:010 + * Welcome to the Guest! * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the @@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call, #endif /*G:036 - * When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue the do-nothing hypercall to flush any stored calls. -:*/ + * When lazy mode is turned off, we issue the do-nothing hypercall to + * flush any stored calls, and call the generic helper to reset the + * per-cpu lazy mode variable. + */ static void lguest_leave_lazy_mmu_mode(void) { hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); paravirt_leave_lazy_mmu(); } +/* + * We also catch the end of context switch; we enter lazy mode for much of + * that too, so again we need to flush here. + * + * (Technically, this is lazy CPU mode, and normally we're in lazy MMU + * mode, but unlike Xen, lguest doesn't care about the difference). + */ static void lguest_end_context_switch(struct task_struct *next) { hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); @@ -640,7 +649,7 @@ static void lguest_write_cr4(unsigned long val) /* * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. Wetell the Host the toplevel and + * a page into a process' address space. We tell the Host the toplevel and * address this corresponds to. The Guest uses one pagetable per process, so * we need to tell the Host which one we're changing (mm->pgd). */ @@ -1139,7 +1148,7 @@ static struct notifier_block paniced = { static __init char *lguest_memory_setup(void) { /* - *The Linux bootloader header contains an "e820" memory map: the + * The Linux bootloader header contains an "e820" memory map: the * Launcher populated the first entry with our memory limit. */ e820_add_region(boot_params.e820_map[0].addr, diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index c8c95e5..cfa23e3 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -6,18 +6,22 @@ #include /*G:020 - * Our story starts with the kernel booting into startup_32 in - * arch/x86/kernel/head_32.S. It expects a boot header, which is created by - * the bootloader (the Launcher in our case). + + * Our story starts with the bzImage: booting starts at startup_32 in + * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real + * kernel in place and then jumps into it: startup_32 in + * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi + * register, which is created by the bootloader (the Launcher in our case). * * The startup_32 function does very little: it clears the uninitialized global * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe. Finally it checks the - * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: - * if it's set to '1' (lguest's assigned number), then it calls us here. + * header and kernel command line somewhere safe, and populates some initial + * page tables. Finally it checks the 'hardware_subarch' field. This was + * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's + * assigned number), then it calls us here. * * WARNING: be very careful here! We're running at addresses equal to physical - * addesses (around 0), not above PAGE_OFFSET as most code expectes + * addesses (around 0), not above PAGE_OFFSET as most code expects * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any * data without remembering to subtract __PAGE_OFFSET! * -- cgit v1.1 From 64be11583bac5ca65d017a5c4288f10b2bcf1928 Mon Sep 17 00:00:00 2001 From: Adrian Knoth Date: Mon, 11 Jul 2011 18:07:14 +0200 Subject: lguest: Fix three simple typos in comments This patch fixes three typos I've accidentally spotted. Signed-off-by: Adrian Knoth Signed-off-by: Rusty Russell (one was already fixed) --- arch/x86/lguest/boot.c | 2 +- arch/x86/lguest/i386_head.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 7427990..5c4f240 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -467,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, /* * PAE systems can mark pages as non-executable. Linux calls this the * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced - * Virus Protection). We just switch turn if off here, since we don't + * Virus Protection). We just switch it off here, since we don't * support it. */ case 0x80000001: diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index cfa23e3..6ddfe4f 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -21,7 +21,7 @@ * assigned number), then it calls us here. * * WARNING: be very careful here! We're running at addresses equal to physical - * addesses (around 0), not above PAGE_OFFSET as most code expects + * addresses (around 0), not above PAGE_OFFSET as most code expects * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any * data without remembering to subtract __PAGE_OFFSET! * -- cgit v1.1 From 8d431f41603acff8a20cf5df99bc8958c91879c1 Mon Sep 17 00:00:00 2001 From: Adrian Knoth Date: Mon, 11 Jul 2011 18:08:47 +0200 Subject: lguest: Fix translation count about wikipedia's cpuid page The comment is outdated, wikipedia now has six translations of the cpuid page. Signed-off-by: Adrian Knoth Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 5c4f240..13ee258 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -400,7 +400,7 @@ static void lguest_load_tr_desc(void) * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 5 languages. I am not making this up! + * has been translated into 6 languages. I am not making this up! * * We could get funky here and identify ourselves as "GenuineLguest", but * instead we just use the real "cpuid" instruction. Then I pretty much turned -- cgit v1.1 From 0aba496fc820d7c36775f2fd0ef81994e1af67a8 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 27 May 2011 14:59:39 +0800 Subject: x86/PCI: select direct access mode for mmconfig option Direct access is needed in mmconf mode too. There are two reasons: 1. we need it to access first 256 bytes. We have bug before that using mmconf to access pci config space hangs system (when resizing BARs), so we use type1 config for legacy config space. 2. when doing mmconfg bar checking, we need access ACPI _CRS, which might access PCI config space. Signed-off-by: Shaohua Li Signed-off-by: Jesse Barnes --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da34972..5b3e6ec 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1905,7 +1905,7 @@ config PCI_BIOS # x86-64 doesn't support PCI BIOS access from long mode so always go direct. config PCI_DIRECT def_bool y - depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC)) + depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC || PCI_GOMMCONFIG)) config PCI_MMCONFIG def_bool y -- cgit v1.1 From 43d786ed4df4c54cb8802a523748a7d78130a2cb Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Sat, 2 Jul 2011 10:47:12 -0600 Subject: x86/PCI: reduce severity of host bridge window conflict warnings Host bridge windows are top-level resources, so if we find a host bridge window conflict, it's probably with a hard-coded legacy reservation. Moving host bridge windows is theoretically possible, but we don't support it; we just ignore windows with conflicts, and it's not worth making this a user-visible error. Reported-and-tested-by: Jools Wills References: https://bugzilla.kernel.org/show_bug.cgi?id=38522 Reported-by: Das References: https://bugzilla.kernel.org/show_bug.cgi?id=16497 Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 68c3c13..ae3cb23 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -246,10 +246,9 @@ static void add_resources(struct pci_root_info *info) conflict = insert_resource_conflict(root, res); if (conflict) - dev_err(&info->bridge->dev, - "address space collision: host bridge window %pR " - "conflicts with %s %pR\n", - res, conflict->name, conflict); + dev_info(&info->bridge->dev, + "ignoring host bridge window %pR (conflicts with %s %pR)\n", + res, conflict->name, conflict); else pci_bus_add_resource(info->bus, res, 0); } -- cgit v1.1 From db34a363b992e0c8063f432607561520d79fbfb8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 22 Jul 2011 08:13:05 +0100 Subject: x86/PCI: config space accessor functions should not ignore the segment argument Without this change, the majority of the raw PCI config space access functions silently ignore a non-zero segment argument, which is certainly wrong. Apart from pci_direct_conf1, all other non-MMCFG access methods get used only for non-extended accesses (i.e. assigned to raw_pci_ops only). Consequently, with the way raw_pci_{read,write}() work, it would be a coding error to call these functions with a non-zero segment (with the current call flow this cannot happen afaict). The access method 1 accessor, as it can be used for extended accesses (on AMD systems) instead gets checks added for the passed in segment to be zero. This would be the case when on such a system having multiple PCI segments (don't know whether any exist in practice) MMCFG for some reason is not usable, and method 1 gets selected for doing extended accesses. Rather than accessing the wrong device's config space, the function will now error out. v2: Convert BUG_ON() to WARN_ON(), and extend description as per Ingo's request. Signed-off-by: Jan Beulich Reviewed-by: Ingo Molnar Signed-off-by: Jesse Barnes --- arch/x86/pci/ce4100.c | 2 ++ arch/x86/pci/direct.c | 6 ++++-- arch/x86/pci/numaq_32.c | 2 ++ arch/x86/pci/olpc.c | 4 ++++ arch/x86/pci/pcbios.c | 2 ++ 5 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 67858be..9917609 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -257,6 +257,7 @@ static int ce4100_conf_read(unsigned int seg, unsigned int bus, { int i; + WARN_ON(seg); if (bus == 1) { for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { if (bus1_fixups[i].dev_func == devfn && @@ -282,6 +283,7 @@ static int ce4100_conf_write(unsigned int seg, unsigned int bus, { int i; + WARN_ON(seg); if (bus == 1) { for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { if (bus1_fixups[i].dev_func == devfn && diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index e6fd847..4f2c704 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -22,7 +22,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus, { unsigned long flags; - if ((bus > 255) || (devfn > 255) || (reg > 4095)) { + if (seg || (bus > 255) || (devfn > 255) || (reg > 4095)) { *value = -1; return -EINVAL; } @@ -53,7 +53,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus, { unsigned long flags; - if ((bus > 255) || (devfn > 255) || (reg > 4095)) + if (seg || (bus > 255) || (devfn > 255) || (reg > 4095)) return -EINVAL; raw_spin_lock_irqsave(&pci_config_lock, flags); @@ -97,6 +97,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus, unsigned long flags; int dev, fn; + WARN_ON(seg); if ((bus > 255) || (devfn > 255) || (reg > 255)) { *value = -1; return -EINVAL; @@ -138,6 +139,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, unsigned long flags; int dev, fn; + WARN_ON(seg); if ((bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 5c9e245..512a88c 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -34,6 +34,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, unsigned long flags; void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); + WARN_ON(seg); if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; @@ -73,6 +74,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, unsigned long flags; void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); + WARN_ON(seg); if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index 13700ec..5262603 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -206,6 +206,8 @@ static int pci_olpc_read(unsigned int seg, unsigned int bus, { uint32_t *addr; + WARN_ON(seg); + /* Use the hardware mechanism for non-simulated devices */ if (!is_simulated(bus, devfn)) return pci_direct_conf1.read(seg, bus, devfn, reg, len, value); @@ -264,6 +266,8 @@ static int pci_olpc_read(unsigned int seg, unsigned int bus, static int pci_olpc_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, uint32_t value) { + WARN_ON(seg); + /* Use the hardware mechanism for non-simulated devices */ if (!is_simulated(bus, devfn)) return pci_direct_conf1.write(seg, bus, devfn, reg, len, value); diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index a5f7d0d..f685535 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -181,6 +181,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, unsigned long flags; unsigned long bx = (bus << 8) | devfn; + WARN_ON(seg); if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; @@ -247,6 +248,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, unsigned long flags; unsigned long bx = (bus << 8) | devfn; + WARN_ON(seg); if ((bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; -- cgit v1.1 From d5341942d784134f2997b3ff82cd63cf71d1f932 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Fri, 10 Jun 2011 15:30:21 +0100 Subject: PCI: Make the struct pci_dev * argument of pci_fixup_irqs const. Aside of the usual motivation for constification, this function has a history of being abused a hook for interrupt and other fixups so I turned this function const ages ago in the MIPS code but it should be done treewide. Due to function pointer passing in varous places a few other functions had to be constified as well. Signed-off-by: Ralf Baechle To: Anton Vorontsov To: Chris Metcalf To: Colin Cross Acked-by: "David S. Miller" To: Eric Miao To: Erik Gilling Acked-by: Guan Xuetao To: "H. Peter Anvin" To: Imre Kaloz To: Ingo Molnar To: Ivan Kokshaysky To: Jesse Barnes To: Krzysztof Halasa To: Lennert Buytenhek To: Matt Turner To: Nicolas Pitre To: Olof Johansson Acked-by: Paul Mundt To: Richard Henderson To: Russell King To: Thomas Gleixner Cc: Andrew Morton Cc: linux-alpha@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linux-pci@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-tegra@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: x86@kernel.org Signed-off-by: Jesse Barnes --- arch/x86/pci/visws.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index 03008f7..6f2f8ee 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -24,7 +24,7 @@ static void pci_visws_disable_irq(struct pci_dev *dev) { } unsigned int pci_bus0, pci_bus1; -static int __init visws_map_irq(struct pci_dev *dev, u8 slot, u8 pin) +static int __init visws_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq, bus = dev->bus->number; -- cgit v1.1 From a8c7ef3187a266d201af887fd48afbef3598825a Mon Sep 17 00:00:00 2001 From: tip-bot for Sergei Shtylyov Date: Thu, 21 Jul 2011 10:11:42 +0000 Subject: x86/PCI: quirks: Use pci_dev->revision This code uses PCI_CLASS_REVISION instead of PCI_REVISION_ID, so it wasn't converted by commit 44c10138fd4bbc ("PCI: Change all drivers to use pci_device->revision") before being moved to arch/x86/... Signed-off-by: Sergei Shtylyov Cc: Dave Jones Link: http://lkml.kernel.org/r/201107111901.39281.sshtylyov@ru.mvista.com Signed-off-by: Ingo Molnar Signed-off-by: Jesse Barnes --- arch/x86/kernel/quirks.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 8bbe8c5..b78643d 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -10,7 +10,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { - u8 config, rev; + u8 config; u16 word; /* BIOS may enable hardware IRQ balancing for @@ -18,8 +18,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) * based platforms. * Disable SW irqbalance/affinity on those platforms. */ - pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); - if (rev > 0x9) + if (dev->revision > 0x9) return; /* enable access to config space*/ -- cgit v1.1 From 9b373ed18f745bddf4288f1ec4a51fe822b8610a Mon Sep 17 00:00:00 2001 From: "Narendra_K@Dell.com" Date: Fri, 18 Mar 2011 10:22:14 -0700 Subject: x86/PCI: Preserve existing pci=bfsort whitelist for Dell systems Commit 6e8af08dfa40b747002207d3ce8e8b43a050d99f enables pci=bfsort on future Dell systems. But the identification string 'Dell System' matches on already existing whitelist, which do not have SMBIOS type 0xB1, causing pci=bfsort not being set on existing whitelist. This patch fixes the regression by moving the type 0xB1 check beyond the existing whitelist so that existing whitelist is walked before. Signed-off-by: Shyam Iyer Signed-off-by: Narendra K Signed-off-by: Jesse Barnes --- arch/x86/pci/common.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 5fe7502..92df322 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -247,13 +247,6 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = { }, #endif /* __i386__ */ { - .callback = find_sort_method, - .ident = "Dell System", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"), - }, - }, - { .callback = set_bf_sort, .ident = "Dell PowerEdge 1950", .matches = { @@ -294,6 +287,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = { }, }, { + .callback = find_sort_method, + .ident = "Dell System", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"), + }, + }, + { .callback = set_bf_sort, .ident = "HP ProLiant BL20p G3", .matches = { -- cgit v1.1 From a4e05276a10198a1540dd1a0001f759c10ce1cf1 Mon Sep 17 00:00:00 2001 From: Jonas Bonn Date: Sat, 2 Jul 2011 11:40:18 +0200 Subject: asm-generic: move archictures to common delay.h This patch moves the in-tree architectures that were using the 'generic' delay.h over to using the header file in asm-generic. This is not done using the generic-y mechanism as none of these arch's have started using that mechanism yet. This is a trivial change to make later when the arch begins using generic-y. Note the subtle change to the avr32 and SH architectures where the argument to __const_udelay was previously using the rounded down constant value instead of the rounded up value. Signed-off-by: Jonas Bonn Acked-by: Arnd Bergmann Acked-by: Hans-Christian Egtvedt --- arch/x86/include/asm/delay.h | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h index 409a649..9b3b4f2 100644 --- a/arch/x86/include/asm/delay.h +++ b/arch/x86/include/asm/delay.h @@ -1,30 +1,7 @@ #ifndef _ASM_X86_DELAY_H #define _ASM_X86_DELAY_H -/* - * Copyright (C) 1993 Linus Torvalds - * - * Delay routines calling functions in arch/x86/lib/delay.c - */ - -/* Undefined functions to get compile-time errors */ -extern void __bad_udelay(void); -extern void __bad_ndelay(void); - -extern void __udelay(unsigned long usecs); -extern void __ndelay(unsigned long nsecs); -extern void __const_udelay(unsigned long xloops); -extern void __delay(unsigned long loops); - -/* 0x10c7 is 2**32 / 1000000 (rounded up) */ -#define udelay(n) (__builtin_constant_p(n) ? \ - ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \ - __udelay(n)) - -/* 0x5 is 2**32 / 1000000000 (rounded up) */ -#define ndelay(n) (__builtin_constant_p(n) ? \ - ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \ - __ndelay(n)) +#include void use_tsc_delay(void); -- cgit v1.1 From e72542191cbba4cf7fda21cb22e26b42d7415daf Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Tue, 5 Jul 2011 17:06:14 +0300 Subject: virtio: expose for non-virtualization users too virtio has been so far used only in the context of virtualization, and the virtio Kconfig was sourced directly by the relevant arch Kconfigs when VIRTUALIZATION was selected. Now that we start using virtio for inter-processor communications, we need to source the virtio Kconfig outside of the virtualization scope too. Moreover, some architectures might use virtio for both virtualization and inter-processor communications, so directly sourcing virtio might yield unexpected results due to conflicting selections. The simple solution offered by this patch is to always source virtio's Kconfig in drivers/Kconfig, and remove it from the appropriate arch Kconfigs. Additionally, a virtio menu entry has been added so virtio drivers don't show up in the general drivers menu. This way anyone can use virtio, though it's arguably less accessible (and neat!) for virtualization users now. Note: some architectures (mips and sh) seem to have a VIRTUALIZATION menu merely for sourcing virtio's Kconfig, so that menu is removed too. Signed-off-by: Ohad Ben-Cohen Signed-off-by: Rusty Russell --- arch/x86/kvm/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 50f6364..65cf823 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -76,6 +76,5 @@ config KVM_MMU_AUDIT # the virtualization menu. source drivers/vhost/Kconfig source drivers/lguest/Kconfig -source drivers/virtio/Kconfig endif # VIRTUALIZATION -- cgit v1.1 From d910f5c1064d7ff09c31b0191564f9f99e210f91 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:19 -0400 Subject: KVM guest: KVM Steal time registration This patch implements the kvm bits of the steal time infrastructure. The most important part of it, is the steal time clock. It is an continuous clock that shows the accumulated amount of steal time since vcpu creation. It is supposed to survive cpu offlining/onlining. [marcelo: fix build with CONFIG_KVM_GUEST=n] Signed-off-by: Glauber Costa Acked-by: Rik van Riel Tested-by: Eric B Munson CC: Jeremy Fitzhardinge CC: Peter Zijlstra CC: Avi Kivity CC: Anthony Liguori Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_para.h | 6 ++++ arch/x86/kernel/kvm.c | 72 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/kvmclock.c | 2 ++ 3 files changed, 80 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c484ba8..734c376 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -192,6 +192,7 @@ void __init kvm_guest_init(void); void kvm_async_pf_task_wait(u32 token); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); +extern void kvm_disable_steal_time(void); #else #define kvm_guest_init() do { } while (0) #define kvm_async_pf_task_wait(T) do {} while(0) @@ -200,6 +201,11 @@ static inline u32 kvm_read_and_reset_pf_reason(void) { return 0; } + +static inline void kvm_disable_steal_time(void) +{ + return; +} #endif #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 33c07b0..a9c2116 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg) early_param("no-kvmapf", parse_no_kvmapf); +static int steal_acc = 1; +static int parse_no_stealacc(char *arg) +{ + steal_acc = 0; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); + struct kvm_para_state { u8 mmu_queue[MMU_QUEUE_SIZE]; int mmu_queue_len; @@ -58,6 +67,8 @@ struct kvm_para_state { static DEFINE_PER_CPU(struct kvm_para_state, para_state); static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); +static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +static int has_steal_clock = 0; static struct kvm_para_state *kvm_para_state(void) { @@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void) #endif } +static void kvm_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct kvm_steal_time *st = &per_cpu(steal_time, cpu); + + if (!has_steal_clock) + return; + + memset(st, 0, sizeof(*st)); + + wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); + printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", + cpu, __pa(st)); +} + void __cpuinit kvm_guest_cpu_init(void) { if (!kvm_para_available()) @@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void) printk(KERN_INFO"KVM setup async PF for cpu %d\n", smp_processor_id()); } + + if (has_steal_clock) + kvm_register_steal_time(); } static void kvm_pv_disable_apf(void *unused) @@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = { .notifier_call = kvm_pv_reboot_notify, }; +static u64 kvm_steal_clock(int cpu) +{ + u64 steal; + struct kvm_steal_time *src; + int version; + + src = &per_cpu(steal_time, cpu); + do { + version = src->version; + rmb(); + steal = src->steal; + rmb(); + } while ((version & 1) || (version != src->version)); + + return steal; +} + +void kvm_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); +} + #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { @@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) static void kvm_guest_cpu_offline(void *dummy) { + kvm_disable_steal_time(); kvm_pv_disable_apf(NULL); apf_task_wake_all(); } @@ -548,6 +603,11 @@ void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) x86_init.irqs.trap_init = kvm_apf_trap_init; + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + has_steal_clock = 1; + pv_time_ops.steal_clock = kvm_steal_clock; + } + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); @@ -555,3 +615,15 @@ void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif } + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + jump_label_inc(¶virt_steal_enabled); + if (steal_acc) + jump_label_inc(¶virt_steal_rq_enabled); + } + + return 0; +} +arch_initcall(activate_jump_labels); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 6389a6b..c1a0188 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -160,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void) static void kvm_crash_shutdown(struct pt_regs *regs) { native_write_msr(msr_kvm_system_time, 0, 0); + kvm_disable_steal_time(); native_machine_crash_shutdown(regs); } #endif @@ -167,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs) static void kvm_shutdown(void) { native_write_msr(msr_kvm_system_time, 0, 0); + kvm_disable_steal_time(); native_machine_shutdown(); } -- cgit v1.1 From 052331bea38dfc176322ec85642eb98d6803a762 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:21:17 +0800 Subject: KVM: MMU: fix walking shadow page table Properly check the last mapping, and do not walk to the next level if last spte is met Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index da0f3b0..03323dc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1517,10 +1517,6 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) if (iterator->level < PT_PAGE_TABLE_LEVEL) return false; - if (iterator->level == PT_PAGE_TABLE_LEVEL) - if (is_large_pte(*iterator->sptep)) - return false; - iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; @@ -1528,6 +1524,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) { + if (is_last_spte(*iterator->sptep, iterator->level)) { + iterator->level = 0; + return; + } + iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; --iterator->level; } -- cgit v1.1 From ffb61bb3bca33ff8e68d11d7cb6b27ac0f74a2c0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:22:01 +0800 Subject: KVM: MMU: do not update slot bitmap if spte is nonpresent Set slot bitmap only if the spte is present Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 03323dc..02c839f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -743,9 +743,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) struct kvm_mmu_page *sp; unsigned long *rmapp; - if (!is_rmap_spte(*spte)) - return 0; - sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); @@ -2087,11 +2084,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; - page_header_update_slot(vcpu->kvm, sptep, gfn); - if (!was_rmapped) { - rmap_count = rmap_add(vcpu, sptep, gfn); - if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, sptep, gfn); + if (is_shadow_present_pte(*sptep)) { + page_header_update_slot(vcpu->kvm, sptep, gfn); + if (!was_rmapped) { + rmap_count = rmap_add(vcpu, sptep, gfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, sptep, gfn); + } } kvm_release_pfn_clean(pfn); if (speculative) { -- cgit v1.1 From af7cc7d1ee422a612f6785e347a893d44cc892ea Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:22:46 +0800 Subject: KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code Introduce vcpu_mmio_gva_to_gpa to translate the gva to gpa, we can use it to cleanup the code between read emulation and write emulation Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c96cdc0..a1dbd04 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4010,6 +4010,27 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + gpa_t *gpa, struct x86_exception *exception, + bool write) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + + if (write) + access |= PFERR_WRITE_MASK; + + *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + + if (*gpa == UNMAPPED_GVA) + return -1; + + /* For APIC access vmexit */ + if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + return 1; + + return 0; +} + static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, @@ -4017,8 +4038,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - gpa_t gpa; - int handled; + gpa_t gpa; + int handled, ret; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -4028,13 +4049,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false); - if (gpa == UNMAPPED_GVA) + if (ret < 0) return X86EMUL_PROPAGATE_FAULT; - /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + if (ret) goto mmio; if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) @@ -4085,16 +4105,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct x86_exception *exception, struct kvm_vcpu *vcpu) { - gpa_t gpa; - int handled; + gpa_t gpa; + int handled, ret; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true); - if (gpa == UNMAPPED_GVA) + if (ret < 0) return X86EMUL_PROPAGATE_FAULT; /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + if (ret) goto mmio; if (emulator_write_phys(vcpu, gpa, val, bytes)) -- cgit v1.1 From bebb106a5afa32efdf5332ed4a40bf4d6d06b56e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:23:20 +0800 Subject: KVM: MMU: cache mmio info on page fault path If the page fault is caused by mmio, we can cache the mmio info, later, we do not need to walk guest page table and quickly know it is a mmio fault while we emulate the mmio instruction Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 5 +++++ arch/x86/kvm/mmu.c | 21 +++++++-------------- arch/x86/kvm/mmu.h | 23 +++++++++++++++++++++++ arch/x86/kvm/paging_tmpl.h | 21 ++++++++++++++------- arch/x86/kvm/x86.c | 11 +++++++++++ arch/x86/kvm/x86.h | 36 ++++++++++++++++++++++++++++++++++++ 6 files changed, 96 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 59086a7..8da1400 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -424,6 +424,11 @@ struct kvm_vcpu_arch { u64 mcg_ctl; u64 *mce_banks; + /* Cache MMIO info */ + u64 mmio_gva; + unsigned access; + gfn_t mmio_gfn; + /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 02c839f..d1986b7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -217,11 +217,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static bool is_write_protection(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr0_bits(vcpu, X86_CR0_WP); -} - static int is_cpuid_PSE36(void) { return 1; @@ -243,11 +238,6 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_writable_pte(unsigned long pte) -{ - return pte & PT_WRITABLE_MASK; -} - static int is_dirty_gpte(unsigned long pte) { return pte & PT_DIRTY_MASK; @@ -2247,15 +2237,17 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * send_sig_info(SIGBUS, &info, tsk); } -static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) +static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva, + unsigned access, gfn_t gfn, pfn_t pfn) { kvm_release_pfn_clean(pfn); if (is_hwpoison_pfn(pfn)) { - kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); + kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); return 0; } else if (is_fault_pfn(pfn)) return -EFAULT; + vcpu_cache_mmio_info(vcpu, gva, gfn, access); return 1; } @@ -2337,7 +2329,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, /* mmio */ if (is_error_pfn(pfn)) - return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); + return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -2564,6 +2556,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; + vcpu_clear_mmio_info(vcpu, ~0ul); trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -2710,7 +2703,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, /* mmio */ if (is_error_pfn(pfn)) - return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); + return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 7086ca8..05310b1 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -76,4 +76,27 @@ static inline int is_present_gpte(unsigned long pte) return pte & PT_PRESENT_MASK; } +static inline int is_writable_pte(unsigned long pte) +{ + return pte & PT_WRITABLE_MASK; +} + +static inline bool is_write_protection(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_WP); +} + +static inline bool check_write_user_access(struct kvm_vcpu *vcpu, + bool write_fault, bool user_fault, + unsigned long pte) +{ + if (unlikely(write_fault && !is_writable_pte(pte) + && (user_fault || is_write_protection(vcpu)))) + return false; + + if (unlikely(user_fault && !(pte & PT_USER_MASK))) + return false; + + return true; +} #endif diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1e1c244..f0fb1a4 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -208,11 +208,8 @@ retry_walk: goto error; } - if (unlikely(write_fault && !is_writable_pte(pte) - && (user_fault || is_write_protection(vcpu)))) - eperm = true; - - if (unlikely(user_fault && !(pte & PT_USER_MASK))) + if (!check_write_user_access(vcpu, write_fault, user_fault, + pte)) eperm = true; #if PTTYPE == 64 @@ -625,8 +622,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; /* mmio */ - if (is_error_pfn(pfn)) - return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); + if (is_error_pfn(pfn)) { + unsigned access = walker.pte_access; + bool dirty = is_dirty_gpte(walker.ptes[walker.level - 1]); + + if (!dirty) + access &= ~ACC_WRITE_MASK; + + return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 : + addr, access, walker.gfn, pfn); + } spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -666,6 +671,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) u64 *sptep; int need_flush = 0; + vcpu_clear_mmio_info(vcpu, gva); + spin_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry(vcpu, gva, iterator) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1dbd04..028a0f2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4016,6 +4016,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + if (vcpu_match_mmio_gva(vcpu, gva) && + check_write_user_access(vcpu, write, access, + vcpu->arch.access)) { + *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | + (gva & (PAGE_SIZE - 1)); + return 1; + } + if (write) access |= PFERR_WRITE_MASK; @@ -4028,6 +4036,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) return 1; + if (vcpu_match_mmio_gpa(vcpu, *gpa)) + return 1; + return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 256da82..d36fe23 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -75,6 +75,42 @@ static inline u32 bit(int bitno) return 1 << (bitno & 31); } +static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, + gva_t gva, gfn_t gfn, unsigned access) +{ + vcpu->arch.mmio_gva = gva & PAGE_MASK; + vcpu->arch.access = access; + vcpu->arch.mmio_gfn = gfn; +} + +/* + * Clear the mmio cache info for the given gva, + * specially, if gva is ~0ul, we clear all mmio cache info. + */ +static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) +{ + if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) + return; + + vcpu->arch.mmio_gva = 0; +} + +static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) +{ + if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK)) + return true; + + return false; +} + +static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) + return true; + + return false; +} + void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -- cgit v1.1 From 640d9b0dbe9f744ac8fd517a8f6afe238f8f525b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:24:39 +0800 Subject: KVM: MMU: optimize to handle dirty bit If dirty bit is not set, we can make the pte access read-only to avoid handing dirty bit everywhere Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 13 ++++++------- arch/x86/kvm/paging_tmpl.h | 46 +++++++++++++++++++--------------------------- 2 files changed, 25 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d1986b7..98812c2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1923,7 +1923,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, - int write_fault, int dirty, int level, + int write_fault, int level, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { @@ -1938,8 +1938,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte = PT_PRESENT_MASK; if (!speculative) spte |= shadow_accessed_mask; - if (!dirty) - pte_access &= ~ACC_WRITE_MASK; + if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else @@ -2023,7 +2022,7 @@ done: static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, int dirty, + int user_fault, int write_fault, int *ptwrite, int level, gfn_t gfn, pfn_t pfn, bool speculative, bool host_writable) @@ -2059,7 +2058,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, - dirty, level, gfn, pfn, speculative, true, + level, gfn, pfn, speculative, true, host_writable)) { if (write_fault) *ptwrite = 1; @@ -2129,7 +2128,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, for (i = 0; i < ret; i++, gfn++, start++) mmu_set_spte(vcpu, start, ACC_ALL, - access, 0, 0, 1, NULL, + access, 0, 0, NULL, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); @@ -2193,7 +2192,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, unsigned pte_access = ACC_ALL; mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, - 0, write, 1, &pt_write, + 0, write, &pt_write, level, gfn, pfn, prefault, map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f0fb1a4..c9fe97b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) +static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, + bool last) { unsigned access; access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + if (last && !is_dirty_gpte(gpte)) + access &= ~ACC_WRITE_MASK; + #if PTTYPE == 64 if (vcpu->arch.mmu.nx) access &= ~(gpte >> PT64_NX_SHIFT); @@ -232,8 +236,6 @@ retry_walk: pte |= PT_ACCESSED_MASK; } - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); - walker->ptes[walker->level - 1] = pte; if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) { @@ -268,7 +270,7 @@ retry_walk: break; } - pt_access = pte_access; + pt_access &= FNAME(gpte_access)(vcpu, pte, false); --walker->level; } @@ -293,6 +295,7 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true); walker->pt_access = pt_access; walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", @@ -367,7 +370,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); @@ -379,7 +382,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). */ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, + NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true, true); } @@ -430,7 +433,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, unsigned pte_access; gfn_t gfn; pfn_t pfn; - bool dirty; if (spte == sptep) continue; @@ -443,18 +445,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) continue; - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, + true); gfn = gpte_to_gfn(gpte); - dirty = is_dirty_gpte(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, - (pte_access & ACC_WRITE_MASK) && dirty); + pte_access & ACC_WRITE_MASK); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); break; } mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, + NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); } } @@ -470,7 +472,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, { unsigned access = gw->pt_access; struct kvm_mmu_page *sp = NULL; - bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); int top_level; unsigned direct_access; struct kvm_shadow_walk_iterator it; @@ -479,8 +480,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, return NULL; direct_access = gw->pt_access & gw->pte_access; - if (!dirty) - direct_access &= ~ACC_WRITE_MASK; top_level = vcpu->arch.mmu.root_level; if (top_level == PT32E_ROOT_LEVEL) @@ -539,7 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, - user_fault, write_fault, dirty, ptwrite, it.level, + user_fault, write_fault, ptwrite, it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); @@ -622,17 +621,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; /* mmio */ - if (is_error_pfn(pfn)) { - unsigned access = walker.pte_access; - bool dirty = is_dirty_gpte(walker.ptes[walker.level - 1]); - - if (!dirty) - access &= ~ACC_WRITE_MASK; - + if (is_error_pfn(pfn)) return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 : - addr, access, walker.gfn, pfn); - } - + addr, walker.pte_access, walker.gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; @@ -849,11 +840,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) } nr_present++; - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, + true); host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, + PT_PAGE_TABLE_LEVEL, gfn, spte_to_pfn(sp->spt[i]), true, false, host_writable); } -- cgit v1.1 From b36c7a7c10bf845b623ce187501b561d1d843a18 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:25:19 +0800 Subject: KVM: MMU: cleanup for FNAME(fetch) gw->pte_access is the final access permission, since it is unified with gw->pt_access when we walked guest page table: FNAME(walk_addr_generic): pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true); Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index c9fe97b..5c2aa40 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -479,7 +479,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; - direct_access = gw->pt_access & gw->pte_access; + direct_access = gw->pte_access; top_level = vcpu->arch.mmu.root_level; if (top_level == PT32E_ROOT_LEVEL) @@ -537,7 +537,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(it.sptep, sp); } - mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, + mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, user_fault, write_fault, ptwrite, it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); -- cgit v1.1 From b90a0e6c81d7b1fef0b7dea007015e1a56ab14c7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:25:56 +0800 Subject: KVM: MMU: rename 'pt_write' to 'emulate' If 'pt_write' is true, we need to emulate the fault. And in later patch, we need to emulate the fault even though it is not a pt_write event, so rename it to better fit the meaning Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++++----- arch/x86/kvm/paging_tmpl.h | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 98812c2..a62ba46 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2023,7 +2023,7 @@ done: static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, - int *ptwrite, int level, gfn_t gfn, + int *emulate, int level, gfn_t gfn, pfn_t pfn, bool speculative, bool host_writable) { @@ -2061,7 +2061,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, level, gfn, pfn, speculative, true, host_writable)) { if (write_fault) - *ptwrite = 1; + *emulate = 1; kvm_mmu_flush_tlb(vcpu); } @@ -2184,7 +2184,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - int pt_write = 0; + int emulate = 0; gfn_t pseudo_gfn; for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { @@ -2192,7 +2192,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, unsigned pte_access = ACC_ALL; mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, - 0, write, &pt_write, + 0, write, &emulate, level, gfn, pfn, prefault, map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; @@ -2220,7 +2220,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | shadow_accessed_mask); } } - return pt_write; + return emulate; } static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 5c2aa40..fa3b54b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -467,7 +467,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int user_fault, int write_fault, int hlevel, - int *ptwrite, pfn_t pfn, bool map_writable, + int *emulate, pfn_t pfn, bool map_writable, bool prefault) { unsigned access = gw->pt_access; @@ -538,7 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, - user_fault, write_fault, ptwrite, it.level, + user_fault, write_fault, emulate, it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); @@ -572,7 +572,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; u64 *sptep; - int write_pt = 0; + int emulate = 0; int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; @@ -633,19 +633,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - level, &write_pt, pfn, map_writable, prefault); + level, &emulate, pfn, map_writable, prefault); (void)sptep; - pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, - sptep, *sptep, write_pt); + pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, + sptep, *sptep, emulate); - if (!write_pt) + if (!emulate) vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ ++vcpu->stat.pf_fixed; trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); - return write_pt; + return emulate; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); -- cgit v1.1 From aa6bd187af013319c3f18be7b0970d9a3d1be696 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:26:40 +0800 Subject: KVM: MMU: count used shadow pages on prepareing path Move counting used shadow pages from commiting path to preparing path to reduce tlb flush on some paths Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a62ba46..91d3069 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1039,7 +1039,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } -static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); @@ -1048,7 +1048,6 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (!sp->role.direct) free_page((unsigned long)sp->gfns); kmem_cache_free(mmu_page_header_cache, sp); - kvm_mod_used_mmu_pages(kvm, -1); } static unsigned kvm_page_table_hashfn(gfn_t gfn) @@ -1655,6 +1654,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, /* Count self */ ret++; list_move(&sp->link, invalid_list); + kvm_mod_used_mmu_pages(kvm, -1); } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); @@ -1678,7 +1678,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, do { sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_free_page(kvm, sp); + kvm_mmu_free_page(sp); } while (!list_empty(invalid_list)); } @@ -1704,8 +1704,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); - kvm_mmu_commit_zap_page(kvm, &invalid_list); } + kvm_mmu_commit_zap_page(kvm, &invalid_list); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } @@ -3302,9 +3302,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, -- cgit v1.1 From bd4c86eaa6ff10abc4e00d0f45d2a28b10b09df4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:27:14 +0800 Subject: KVM: MMU: split kvm_mmu_free_page Split kvm_mmu_free_page to kvm_mmu_isolate_page and kvm_mmu_free_page One is used to remove the page from cache under mmu lock and the other is used to free page table out of mmu lock Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 91d3069..2f8543c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1039,14 +1039,28 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) +/* + * Remove the sp from shadow page cache, after call it, + * we can not find this sp from the cache, and the shadow + * page table is still valid. + * It should be under the protection of mmu lock. + */ +static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); - list_del(&sp->link); - free_page((unsigned long)sp->spt); if (!sp->role.direct) free_page((unsigned long)sp->gfns); +} + +/* + * Free the shadow page table and the sp, we can do it + * out of the protection of mmu lock. + */ +static void kvm_mmu_free_page(struct kvm_mmu_page *sp) +{ + list_del(&sp->link); + free_page((unsigned long)sp->spt); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1678,6 +1692,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, do { sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); WARN_ON(!sp->role.invalid || sp->root_count); + kvm_mmu_isolate_page(sp); kvm_mmu_free_page(sp); } while (!list_empty(invalid_list)); -- cgit v1.1 From c37079586f317d7e7f1a70d36f0e5177691c89c2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:28:04 +0800 Subject: KVM: MMU: remove bypass_guest_pf The idea is from Avi: | Maybe it's time to kill off bypass_guest_pf=1. It's not as effective as | it used to be, since unsync pages always use shadow_trap_nonpresent_pte, | and since we convert between the two nonpresent_ptes during sync and unsync. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 -- arch/x86/kvm/mmu.c | 83 ++++++++++++----------------------------- arch/x86/kvm/mmu_audit.c | 12 ------ arch/x86/kvm/paging_tmpl.h | 51 ++++--------------------- arch/x86/kvm/vmx.c | 11 +----- arch/x86/kvm/x86.c | 1 - 6 files changed, 33 insertions(+), 128 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8da1400..a198a5b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -266,8 +266,6 @@ struct kvm_mmu { gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, struct x86_exception *exception); gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); - void (*prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); @@ -647,7 +645,6 @@ void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2f8543c..5334b4e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -186,8 +186,6 @@ static struct kmem_cache *pte_list_desc_cache; static struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; -static u64 __read_mostly shadow_trap_nonpresent_pte; -static u64 __read_mostly shadow_notrap_nonpresent_pte; static u64 __read_mostly shadow_nx_mask; static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; @@ -199,13 +197,6 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) -{ - shadow_trap_nonpresent_pte = trap_pte; - shadow_notrap_nonpresent_pte = notrap_pte; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); - void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask) { @@ -229,8 +220,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return pte != shadow_trap_nonpresent_pte - && pte != shadow_notrap_nonpresent_pte; + return pte & PT_PRESENT_MASK; } static int is_large_pte(u64 pte) @@ -777,9 +767,9 @@ static int set_spte_track_bits(u64 *sptep, u64 new_spte) return 1; } -static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +static void drop_spte(struct kvm *kvm, u64 *sptep) { - if (set_spte_track_bits(sptep, new_spte)) + if (set_spte_track_bits(sptep, 0ull)) rmap_remove(kvm, sptep); } @@ -814,8 +804,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); if (is_writable_pte(*spte)) { - drop_spte(kvm, spte, - shadow_trap_nonpresent_pte); + drop_spte(kvm, spte); --kvm->stat.lpages; spte = NULL; write_protected = 1; @@ -836,7 +825,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, while ((spte = rmap_next(kvm, rmapp, NULL))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - drop_spte(kvm, spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte); need_tlb_flush = 1; } return need_tlb_flush; @@ -858,7 +847,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); need_flush = 1; if (pte_write(*ptep)) { - drop_spte(kvm, spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte); spte = rmap_next(kvm, rmapp, NULL); } else { new_spte = *spte &~ (PT64_BASE_ADDR_MASK); @@ -1088,7 +1077,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { mmu_page_remove_parent_pte(sp, parent_pte); - __set_spte(parent_pte, shadow_trap_nonpresent_pte); + __set_spte(parent_pte, 0ull); } static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, @@ -1130,15 +1119,6 @@ static void mark_unsync(u64 *spte) kvm_mmu_mark_parents_unsync(sp); } -static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - sp->spt[i] = shadow_trap_nonpresent_pte; -} - static int nonpaging_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1420,6 +1400,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, } } +static void init_shadow_page_table(struct kvm_mmu_page *sp) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + sp->spt[i] = 0ull; +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -1482,10 +1470,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, account_shadowed(vcpu->kvm, gfn); } - if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) - vcpu->arch.mmu.prefetch_page(vcpu, sp); - else - nonpaging_prefetch_page(vcpu, sp); + init_shadow_page_table(sp); trace_kvm_mmu_get_page(sp, true); return sp; } @@ -1546,7 +1531,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) { if (is_large_pte(*sptep)) { - drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep); kvm_flush_remote_tlbs(vcpu->kvm); } } @@ -1582,13 +1567,13 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, pte = *spte; if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) - drop_spte(kvm, spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); } } - __set_spte(spte, shadow_trap_nonpresent_pte); + if (is_large_pte(pte)) --kvm->stat.lpages; } @@ -1769,20 +1754,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) __set_bit(slot, sp->slot_bitmap); } -static void mmu_convert_notrap(struct kvm_mmu_page *sp) -{ - int i; - u64 *pt = sp->spt; - - if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) - return; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (pt[i] == shadow_notrap_nonpresent_pte) - __set_spte(&pt[i], shadow_trap_nonpresent_pte); - } -} - /* * The function is based on mtrr_type_lookup() in * arch/x86/kernel/cpu/mtrr/generic.c @@ -1895,7 +1866,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sp->unsync = 1; kvm_mmu_mark_parents_unsync(sp); - mmu_convert_notrap(sp); } static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) @@ -1980,7 +1950,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; - drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep); goto done; } @@ -2066,7 +2036,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %llx new %llx\n", spte_to_pfn(*sptep), pfn); - drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep); kvm_flush_remote_tlbs(vcpu->kvm); } else was_rmapped = 1; @@ -2162,7 +2132,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { - if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { + if (is_shadow_present_pte(*spte) || spte == sptep) { if (!start) continue; if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) @@ -2214,7 +2184,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, break; } - if (*iterator.sptep == shadow_trap_nonpresent_pte) { + if (!is_shadow_present_pte(*iterator.sptep)) { u64 base_addr = iterator.addr; base_addr &= PT64_LVL_ADDR_MASK(iterator.level); @@ -2748,7 +2718,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; context->free = nonpaging_free; - context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -2878,7 +2847,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->new_cr3 = paging_new_cr3; context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; - context->prefetch_page = paging64_prefetch_page; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; @@ -2907,7 +2875,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; context->free = paging_free; - context->prefetch_page = paging32_prefetch_page; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; context->update_pte = paging32_update_pte; @@ -2932,7 +2899,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; context->free = nonpaging_free; - context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -3443,8 +3409,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) continue; if (is_large_pte(pt[i])) { - drop_spte(kvm, &pt[i], - shadow_trap_nonpresent_pte); + drop_spte(kvm, &pt[i]); --kvm->stat.lpages; continue; } diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 5f6223b..2460a26 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -99,18 +99,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) "level = %d\n", sp, level); return; } - - if (*sptep == shadow_notrap_nonpresent_pte) { - audit_printk(vcpu->kvm, "notrap spte in unsync " - "sp: %p\n", sp); - return; - } - } - - if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { - audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n", - sp); - return; } if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index fa3b54b..a4565df 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -337,16 +337,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte) { - u64 nonpresent = shadow_trap_nonpresent_pte; - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) goto no_present; - if (!is_present_gpte(gpte)) { - if (!sp->unsync) - nonpresent = shadow_notrap_nonpresent_pte; + if (!is_present_gpte(gpte)) goto no_present; - } if (!(gpte & PT_ACCESSED_MASK)) goto no_present; @@ -354,7 +349,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, return false; no_present: - drop_spte(vcpu->kvm, spte, nonpresent); + drop_spte(vcpu->kvm, spte); return true; } @@ -437,7 +432,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (spte == sptep) continue; - if (*spte != shadow_trap_nonpresent_pte) + if (is_shadow_present_pte(*spte)) continue; gpte = gptep[i]; @@ -687,11 +682,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (is_shadow_present_pte(*sptep)) { if (is_large_pte(*sptep)) --vcpu->kvm->stat.lpages; - drop_spte(vcpu->kvm, sptep, - shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep); need_flush = 1; - } else - __set_spte(sptep, shadow_trap_nonpresent_pte); + } + break; } @@ -751,36 +745,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, return gpa; } -static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - int i, j, offset, r; - pt_element_t pt[256 / sizeof(pt_element_t)]; - gpa_t pte_gpa; - - if (sp->role.direct - || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { - nonpaging_prefetch_page(vcpu, sp); - return; - } - - pte_gpa = gfn_to_gpa(sp->gfn); - if (PTTYPE == 32) { - offset = sp->role.quadrant << PT64_LEVEL_BITS; - pte_gpa += offset * sizeof(pt_element_t); - } - - for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) { - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); - pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); - for (j = 0; j < ARRAY_SIZE(pt); ++j) - if (r || is_present_gpte(pt[j])) - sp->spt[i+j] = shadow_trap_nonpresent_pte; - else - sp->spt[i+j] = shadow_notrap_nonpresent_pte; - } -} - /* * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn @@ -833,8 +797,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) } if (gfn != sp->gfns[i]) { - drop_spte(vcpu->kvm, &sp->spt[i], - shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, &sp->spt[i]); vcpu->kvm->tlbs_dirty++; continue; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f5b49c7..a644acb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -49,9 +49,6 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -static int __read_mostly bypass_guest_pf = 1; -module_param(bypass_guest_pf, bool, S_IRUGO); - static int __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); @@ -3632,8 +3629,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(PLE_WINDOW, ple_window); } - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ @@ -7103,16 +7100,12 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); if (enable_ept) { - bypass_guest_pf = 0; kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK); kvm_enable_tdp(); } else kvm_disable_tdp(); - if (bypass_guest_pf) - kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); - return 0; out3: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 028a0f2..64c42d9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5091,7 +5091,6 @@ int kvm_arch_init(void *opaque) kvm_init_msr_list(); kvm_x86_ops = ops; - kvm_mmu_set_nonpresent_ptes(0ull, 0ull); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); -- cgit v1.1 From fce92dce79dbf5fff39c7ac2fb149729d79b7a39 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:28:54 +0800 Subject: KVM: MMU: filter out the mmio pfn from the fault pfn If the page fault is caused by mmio, the gfn can not be found in memslots, and 'bad_pfn' is returned on gfn_to_hva path, so we can use 'bad_pfn' to identify the mmio page fault. And, to clarify the meaning of mmio pfn, we return fault page instead of bad page when the gfn is not allowd to prefetch Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5334b4e..96a7ed4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2085,8 +2085,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) { - get_page(bad_page); - return page_to_pfn(bad_page); + get_page(fault_page); + return page_to_pfn(fault_page); } hva = gfn_to_hva_memslot(slot, gfn); -- cgit v1.1 From d7c55201e66e9f702db575c9dfc2d34a7af6cf1f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:29:38 +0800 Subject: KVM: MMU: abstract some functions to handle fault pfn Introduce handle_abnormal_pfn to handle fault pfn on page fault path, introduce mmu_invalid_pfn to handle fault pfn on prefetch path It is the preparing work for mmio page fault support Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 47 ++++++++++++++++++++++++++++++++++------------ arch/x86/kvm/paging_tmpl.h | 12 ++++++------ 2 files changed, 41 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 96a7ed4..1d4a2d9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2221,18 +2221,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * send_sig_info(SIGBUS, &info, tsk); } -static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva, - unsigned access, gfn_t gfn, pfn_t pfn) +static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) { kvm_release_pfn_clean(pfn); if (is_hwpoison_pfn(pfn)) { kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); return 0; - } else if (is_fault_pfn(pfn)) - return -EFAULT; + } - vcpu_cache_mmio_info(vcpu, gva, gfn, access); - return 1; + return -EFAULT; } static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, @@ -2277,6 +2274,33 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, } } +static bool mmu_invalid_pfn(pfn_t pfn) +{ + return unlikely(is_invalid_pfn(pfn) || is_noslot_pfn(pfn)); +} + +static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, + pfn_t pfn, unsigned access, int *ret_val) +{ + bool ret = true; + + /* The pfn is invalid, report the error! */ + if (unlikely(is_invalid_pfn(pfn))) { + *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); + goto exit; + } + + if (unlikely(is_noslot_pfn(pfn))) { + vcpu_cache_mmio_info(vcpu, gva, gfn, access); + *ret_val = 1; + goto exit; + } + + ret = false; +exit: + return ret; +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); @@ -2311,9 +2335,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) return 0; - /* mmio */ - if (is_error_pfn(pfn)) - return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn); + if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) + return r; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -2685,9 +2708,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) return 0; - /* mmio */ - if (is_error_pfn(pfn)) - return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn); + if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) + return r; + spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a4565df..67998d3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -367,7 +367,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); - if (is_error_pfn(pfn)) { + if (mmu_invalid_pfn(pfn)) { kvm_release_pfn_clean(pfn); return; } @@ -445,7 +445,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, pte_access & ACC_WRITE_MASK); - if (is_error_pfn(pfn)) { + if (mmu_invalid_pfn(pfn)) { kvm_release_pfn_clean(pfn); break; } @@ -615,10 +615,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, &map_writable)) return 0; - /* mmio */ - if (is_error_pfn(pfn)) - return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 : - addr, walker.pte_access, walker.gfn, pfn); + if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr, + walker.gfn, pfn, walker.pte_access, &r)) + return r; + spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; -- cgit v1.1 From 1df9f2dc39948c3cb900725b7f0754fb385c8354 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:30:35 +0800 Subject: KVM: MMU: introduce the rules to modify shadow page table Introduce some interfaces to modify spte as linux kernel does: - mmu_spte_clear_track_bits, it set the spte from present to nonpresent, and track the stat bits(accessed/dirty) of spte - mmu_spte_clear_no_track, the same as mmu_spte_clear_track_bits except tracking the stat bits - mmu_spte_set, set spte from nonpresent to present - mmu_spte_update, only update the stat bits Now, it does not allowed to set spte from present to present, later, we can drop the atomicly opration for X86_32 host, and it is the preparing work to get spte on X86_32 host out of the mmu lock Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 103 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1d4a2d9..982718f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -299,12 +299,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) return (old_spte & bit_mask) && !(new_spte & bit_mask); } -static void update_spte(u64 *sptep, u64 new_spte) +/* Rules for using mmu_spte_set: + * Set the sptep from nonpresent to present. + * Note: the sptep being assigned *must* be either not present + * or in a state where the hardware will not attempt to update + * the spte. + */ +static void mmu_spte_set(u64 *sptep, u64 new_spte) +{ + WARN_ON(is_shadow_present_pte(*sptep)); + __set_spte(sptep, new_spte); +} + +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changged. + */ +static void mmu_spte_update(u64 *sptep, u64 new_spte) { u64 mask, old_spte = *sptep; WARN_ON(!is_rmap_spte(new_spte)); + if (!is_shadow_present_pte(old_spte)) + return mmu_spte_set(sptep, new_spte); + new_spte |= old_spte & shadow_dirty_mask; mask = shadow_accessed_mask; @@ -325,6 +343,42 @@ static void update_spte(u64 *sptep, u64 new_spte) kvm_set_pfn_dirty(spte_to_pfn(old_spte)); } +/* + * Rules for using mmu_spte_clear_track_bits: + * It sets the sptep from present to nonpresent, and track the + * state bits, it is used to clear the last level sptep. + */ +static int mmu_spte_clear_track_bits(u64 *sptep) +{ + pfn_t pfn; + u64 old_spte = *sptep; + + if (!spte_has_volatile_bits(old_spte)) + __set_spte(sptep, 0ull); + else + old_spte = __xchg_spte(sptep, 0ull); + + if (!is_rmap_spte(old_spte)) + return 0; + + pfn = spte_to_pfn(old_spte); + if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) + kvm_set_pfn_accessed(pfn); + if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) + kvm_set_pfn_dirty(pfn); + return 1; +} + +/* + * Rules for using mmu_spte_clear_no_track: + * Directly clear spte without caring the state bits of sptep, + * it is used to set the upper level spte. + */ +static void mmu_spte_clear_no_track(u64 *sptep) +{ + __set_spte(sptep, 0ull); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -746,30 +800,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pte_list_remove(spte, rmapp); } -static int set_spte_track_bits(u64 *sptep, u64 new_spte) -{ - pfn_t pfn; - u64 old_spte = *sptep; - - if (!spte_has_volatile_bits(old_spte)) - __set_spte(sptep, new_spte); - else - old_spte = __xchg_spte(sptep, new_spte); - - if (!is_rmap_spte(old_spte)) - return 0; - - pfn = spte_to_pfn(old_spte); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) - kvm_set_pfn_accessed(pfn); - if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) - kvm_set_pfn_dirty(pfn); - return 1; -} - static void drop_spte(struct kvm *kvm, u64 *sptep) { - if (set_spte_track_bits(sptep, 0ull)) + if (mmu_spte_clear_track_bits(sptep)) rmap_remove(kvm, sptep); } @@ -787,7 +820,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writable_pte(*spte)) { - update_spte(spte, *spte & ~PT_WRITABLE_MASK); + mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); @@ -856,7 +889,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; new_spte &= ~shadow_accessed_mask; - set_spte_track_bits(spte, new_spte); + mmu_spte_clear_track_bits(spte); + mmu_spte_set(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); } } @@ -1077,7 +1111,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { mmu_page_remove_parent_pte(sp, parent_pte); - __set_spte(parent_pte, 0ull); + mmu_spte_clear_no_track(parent_pte); } static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, @@ -1525,7 +1559,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - __set_spte(sptep, spte); + mmu_spte_set(sptep, spte); } static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) @@ -1992,7 +2026,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - update_spte(sptep, spte); + mmu_spte_update(sptep, spte); /* * If we overwrite a writable spte with a read-only one we * should flush remote TLBs. Otherwise rmap_write_protect @@ -2198,11 +2232,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return -ENOMEM; } - __set_spte(iterator.sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask - | shadow_accessed_mask); + mmu_spte_set(iterator.sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask + | shadow_accessed_mask); } } return emulate; @@ -3439,7 +3473,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) /* avoid RMW */ if (is_writable_pte(pt[i])) - update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); + mmu_spte_update(&pt[i], + pt[i] & ~PT_WRITABLE_MASK); } } kvm_flush_remote_tlbs(kvm); -- cgit v1.1 From 603e0651cfc8562b103454d7ded71f3ad1eb3a37 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:31:28 +0800 Subject: KVM: MMU: do not need atomicly to set/clear spte Now, the spte is just from nonprsent to present or present to nonprsent, so we can use some trick to set/clear spte non-atomicly as linux kernel does Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 86 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 982718f..a22b5fe 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -259,26 +259,82 @@ static gfn_t pse36_gfn_delta(u32 gpte) return (gpte & PT32_DIR_PSE36_MASK) << shift; } +#ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { - set_64bit(sptep, spte); + *sptep = spte; } -static u64 __xchg_spte(u64 *sptep, u64 new_spte) +static void __update_clear_spte_fast(u64 *sptep, u64 spte) { -#ifdef CONFIG_X86_64 - return xchg(sptep, new_spte); + *sptep = spte; +} + +static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) +{ + return xchg(sptep, spte); +} #else - u64 old_spte; +union split_spte { + struct { + u32 spte_low; + u32 spte_high; + }; + u64 spte; +}; - do { - old_spte = *sptep; - } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); +static void __set_spte(u64 *sptep, u64 spte) +{ + union split_spte *ssptep, sspte; - return old_spte; -#endif + ssptep = (union split_spte *)sptep; + sspte = (union split_spte)spte; + + ssptep->spte_high = sspte.spte_high; + + /* + * If we map the spte from nonpresent to present, We should store + * the high bits firstly, then set present bit, so cpu can not + * fetch this spte while we are setting the spte. + */ + smp_wmb(); + + ssptep->spte_low = sspte.spte_low; } +static void __update_clear_spte_fast(u64 *sptep, u64 spte) +{ + union split_spte *ssptep, sspte; + + ssptep = (union split_spte *)sptep; + sspte = (union split_spte)spte; + + ssptep->spte_low = sspte.spte_low; + + /* + * If we map the spte from present to nonpresent, we should clear + * present bit firstly to avoid vcpu fetch the old high bits. + */ + smp_wmb(); + + ssptep->spte_high = sspte.spte_high; +} + +static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) +{ + union split_spte *ssptep, sspte, orig; + + ssptep = (union split_spte *)sptep; + sspte = (union split_spte)spte; + + /* xchg acts as a barrier before the setting of the high bits */ + orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); + orig.spte_high = ssptep->spte_high = sspte.spte_high; + + return orig.spte; +} +#endif + static bool spte_has_volatile_bits(u64 spte) { if (!shadow_accessed_mask) @@ -330,9 +386,9 @@ static void mmu_spte_update(u64 *sptep, u64 new_spte) mask |= shadow_dirty_mask; if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) - __set_spte(sptep, new_spte); + __update_clear_spte_fast(sptep, new_spte); else - old_spte = __xchg_spte(sptep, new_spte); + old_spte = __update_clear_spte_slow(sptep, new_spte); if (!shadow_accessed_mask) return; @@ -354,9 +410,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep) u64 old_spte = *sptep; if (!spte_has_volatile_bits(old_spte)) - __set_spte(sptep, 0ull); + __update_clear_spte_fast(sptep, 0ull); else - old_spte = __xchg_spte(sptep, 0ull); + old_spte = __update_clear_spte_slow(sptep, 0ull); if (!is_rmap_spte(old_spte)) return 0; @@ -376,7 +432,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) */ static void mmu_spte_clear_no_track(u64 *sptep) { - __set_spte(sptep, 0ull); + __update_clear_spte_fast(sptep, 0ull); } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, -- cgit v1.1 From c2a2ac2b563ccc3a69540965b5a994c19e3817d7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:32:13 +0800 Subject: KVM: MMU: lockless walking shadow page table Use rcu to protect shadow pages table to be freed, so we can safely walk it, it should run fastly and is needed by mmio page fault Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 8 +++ arch/x86/kvm/mmu.c | 132 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 132 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a198a5b..dd51c83 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -233,6 +233,12 @@ struct kvm_mmu_page { unsigned int unsync_children; unsigned long parent_ptes; /* Reverse mapping for parent_pte */ DECLARE_BITMAP(unsync_child_bitmap, 512); + +#ifdef CONFIG_X86_32 + int clear_spte_count; +#endif + + struct rcu_head rcu; }; struct kvm_pv_mmu_op_buffer { @@ -486,6 +492,8 @@ struct kvm_arch { u64 hv_guest_os_id; u64 hv_hypercall; + atomic_t reader_counter; + #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a22b5fe..374530a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -182,6 +182,12 @@ struct kvm_shadow_walk_iterator { shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) +#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \ + for (shadow_walk_init(&(_walker), _vcpu, _addr); \ + shadow_walk_okay(&(_walker)) && \ + ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \ + __shadow_walk_next(&(_walker), spte)) + static struct kmem_cache *pte_list_desc_cache; static struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; @@ -274,6 +280,11 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) { return xchg(sptep, spte); } + +static u64 __get_spte_lockless(u64 *sptep) +{ + return ACCESS_ONCE(*sptep); +} #else union split_spte { struct { @@ -283,6 +294,18 @@ union split_spte { u64 spte; }; +static void count_spte_clear(u64 *sptep, u64 spte) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + if (is_shadow_present_pte(spte)) + return; + + /* Ensure the spte is completely set before we increase the count */ + smp_wmb(); + sp->clear_spte_count++; +} + static void __set_spte(u64 *sptep, u64 spte) { union split_spte *ssptep, sspte; @@ -318,6 +341,7 @@ static void __update_clear_spte_fast(u64 *sptep, u64 spte) smp_wmb(); ssptep->spte_high = sspte.spte_high; + count_spte_clear(sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) @@ -330,9 +354,40 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) /* xchg acts as a barrier before the setting of the high bits */ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); orig.spte_high = ssptep->spte_high = sspte.spte_high; + count_spte_clear(sptep, spte); return orig.spte; } + +/* + * The idea using the light way get the spte on x86_32 guest is from + * gup_get_pte(arch/x86/mm/gup.c). + * The difference is we can not catch the spte tlb flush if we leave + * guest mode, so we emulate it by increase clear_spte_count when spte + * is cleared. + */ +static u64 __get_spte_lockless(u64 *sptep) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + union split_spte spte, *orig = (union split_spte *)sptep; + int count; + +retry: + count = sp->clear_spte_count; + smp_rmb(); + + spte.spte_low = orig->spte_low; + smp_rmb(); + + spte.spte_high = orig->spte_high; + smp_rmb(); + + if (unlikely(spte.spte_low != orig->spte_low || + count != sp->clear_spte_count)) + goto retry; + + return spte.spte; +} #endif static bool spte_has_volatile_bits(u64 spte) @@ -435,6 +490,28 @@ static void mmu_spte_clear_no_track(u64 *sptep) __update_clear_spte_fast(sptep, 0ull); } +static u64 mmu_spte_get_lockless(u64 *sptep) +{ + return __get_spte_lockless(sptep); +} + +static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) +{ + rcu_read_lock(); + atomic_inc(&vcpu->kvm->arch.reader_counter); + + /* Increase the counter before walking shadow page table */ + smp_mb__after_atomic_inc(); +} + +static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) +{ + /* Decrease the counter after walking shadow page table finished */ + smp_mb__before_atomic_dec(); + atomic_dec(&vcpu->kvm->arch.reader_counter); + rcu_read_unlock(); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -1597,17 +1674,23 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) return true; } -static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) +static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, + u64 spte) { - if (is_last_spte(*iterator->sptep, iterator->level)) { + if (is_last_spte(spte, iterator->level)) { iterator->level = 0; return; } - iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; + iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; --iterator->level; } +static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) +{ + return __shadow_walk_next(iterator, *iterator->sptep); +} + static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) { u64 spte; @@ -1754,6 +1837,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, return ret; } +static void kvm_mmu_isolate_pages(struct list_head *invalid_list) +{ + struct kvm_mmu_page *sp; + + list_for_each_entry(sp, invalid_list, link) + kvm_mmu_isolate_page(sp); +} + +static void free_pages_rcu(struct rcu_head *head) +{ + struct kvm_mmu_page *next, *sp; + + sp = container_of(head, struct kvm_mmu_page, rcu); + while (sp) { + if (!list_empty(&sp->link)) + next = list_first_entry(&sp->link, + struct kvm_mmu_page, link); + else + next = NULL; + kvm_mmu_free_page(sp); + sp = next; + } +} + static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list) { @@ -1764,6 +1871,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); + if (atomic_read(&kvm->arch.reader_counter)) { + kvm_mmu_isolate_pages(invalid_list); + sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); + list_del_init(invalid_list); + call_rcu(&sp->rcu, free_pages_rcu); + return; + } + do { sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); WARN_ON(!sp->role.invalid || sp->root_count); @@ -3784,16 +3899,17 @@ out: int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) { struct kvm_shadow_walk_iterator iterator; + u64 spte; int nr_sptes = 0; - spin_lock(&vcpu->kvm->mmu_lock); - for_each_shadow_entry(vcpu, addr, iterator) { - sptes[iterator.level-1] = *iterator.sptep; + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + sptes[iterator.level-1] = spte; nr_sptes++; - if (!is_shadow_present_pte(*iterator.sptep)) + if (!is_shadow_present_pte(spte)) break; } - spin_unlock(&vcpu->kvm->mmu_lock); + walk_shadow_page_lockless_end(vcpu); return nr_sptes; } -- cgit v1.1 From dd3bfd59dbc69fd970394ab354cfca5f959d5755 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:32:54 +0800 Subject: KVM: MMU: reorganize struct kvm_shadow_walk_iterator Reorganize it for good using the cache Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 374530a..4b1aa67 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -172,8 +172,8 @@ struct pte_list_desc { struct kvm_shadow_walk_iterator { u64 addr; hpa_t shadow_addr; - int level; u64 *sptep; + int level; unsigned index; }; -- cgit v1.1 From ce88decffd17bf9f373cc233c961ad2054965667 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:33:44 +0800 Subject: KVM: MMU: mmio page fault support The idea is from Avi: | We could cache the result of a miss in an spte by using a reserved bit, and | checking the page fault error code (or seeing if we get an ept violation or | ept misconfiguration), so if we get repeated mmio on a page, we don't need to | search the slot list/tree. | (https://lkml.org/lkml/2011/2/22/221) When the page fault is caused by mmio, we cache the info in the shadow page table, and also set the reserved bits in the shadow page table, so if the mmio is caused again, we can quickly identify it and emulate it directly Searching mmio gfn in memslots is heavy since we need to walk all memeslots, it can be reduced by this feature, and also avoid walking guest page table for soft mmu. [jan: fix operator precedence issue] Signed-off-by: Xiao Guangrong Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 192 +++++++++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/mmu.h | 2 + arch/x86/kvm/paging_tmpl.h | 21 +++-- arch/x86/kvm/vmx.c | 22 +++++- arch/x86/kvm/x86.c | 25 ++++++ 5 files changed, 248 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4b1aa67..4e22df6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -197,6 +197,47 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; +static u64 __read_mostly shadow_mmio_mask; + +static void mmu_spte_set(u64 *sptep, u64 spte); + +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) +{ + shadow_mmio_mask = mmio_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); + +static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) +{ + access &= ACC_WRITE_MASK | ACC_USER_MASK; + + mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); +} + +static bool is_mmio_spte(u64 spte) +{ + return (spte & shadow_mmio_mask) == shadow_mmio_mask; +} + +static gfn_t get_mmio_spte_gfn(u64 spte) +{ + return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; +} + +static unsigned get_mmio_spte_access(u64 spte) +{ + return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; +} + +static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) +{ + if (unlikely(is_noslot_pfn(pfn))) { + mark_mmio_spte(sptep, gfn, access); + return true; + } + + return false; +} static inline u64 rsvd_bits(int s, int e) { @@ -226,7 +267,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return pte & PT_PRESENT_MASK; + return pte & PT_PRESENT_MASK && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -285,6 +326,12 @@ static u64 __get_spte_lockless(u64 *sptep) { return ACCESS_ONCE(*sptep); } + +static bool __check_direct_spte_mmio_pf(u64 spte) +{ + /* It is valid if the spte is zapped. */ + return spte == 0ull; +} #else union split_spte { struct { @@ -388,6 +435,23 @@ retry: return spte.spte; } + +static bool __check_direct_spte_mmio_pf(u64 spte) +{ + union split_spte sspte = (union split_spte)spte; + u32 high_mmio_mask = shadow_mmio_mask >> 32; + + /* It is valid if the spte is zapped. */ + if (spte == 0ull) + return true; + + /* It is valid if the spte is being zapped. */ + if (sspte.spte_low == 0ull && + (sspte.spte_high & high_mmio_mask) == high_mmio_mask) + return true; + + return false; +} #endif static bool spte_has_volatile_bits(u64 spte) @@ -1745,7 +1809,8 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, child = page_header(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); } - } + } else if (is_mmio_spte(pte)) + mmu_spte_clear_no_track(spte); if (is_large_pte(pte)) --kvm->stat.lpages; @@ -2120,6 +2185,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte, entry = *sptep; int ret = 0; + if (set_mmio_spte(sptep, gfn, pfn, pte_access)) + return 0; + /* * We don't set the accessed bit, since we sometimes want to see * whether the guest actually used the pte (in order to detect @@ -2255,6 +2323,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, kvm_mmu_flush_tlb(vcpu); } + if (unlikely(is_mmio_spte(*sptep) && emulate)) + *emulate = 1; + pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", @@ -2481,7 +2552,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, static bool mmu_invalid_pfn(pfn_t pfn) { - return unlikely(is_invalid_pfn(pfn) || is_noslot_pfn(pfn)); + return unlikely(is_invalid_pfn(pfn)); } static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, @@ -2495,11 +2566,8 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, goto exit; } - if (unlikely(is_noslot_pfn(pfn))) { + if (unlikely(is_noslot_pfn(pfn))) vcpu_cache_mmio_info(vcpu, gva, gfn, access); - *ret_val = 1; - goto exit; - } ret = false; exit: @@ -2813,6 +2881,92 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); } +static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) +{ + if (direct) + return vcpu_match_mmio_gpa(vcpu, addr); + + return vcpu_match_mmio_gva(vcpu, addr); +} + + +/* + * On direct hosts, the last spte is only allows two states + * for mmio page fault: + * - It is the mmio spte + * - It is zapped or it is being zapped. + * + * This function completely checks the spte when the last spte + * is not the mmio spte. + */ +static bool check_direct_spte_mmio_pf(u64 spte) +{ + return __check_direct_spte_mmio_pf(spte); +} + +static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) +{ + struct kvm_shadow_walk_iterator iterator; + u64 spte = 0ull; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) + if (!is_shadow_present_pte(spte)) + break; + walk_shadow_page_lockless_end(vcpu); + + return spte; +} + +/* + * If it is a real mmio page fault, return 1 and emulat the instruction + * directly, return 0 to let CPU fault again on the address, -1 is + * returned if bug is detected. + */ +int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) +{ + u64 spte; + + if (quickly_check_mmio_pf(vcpu, addr, direct)) + return 1; + + spte = walk_shadow_page_get_mmio_spte(vcpu, addr); + + if (is_mmio_spte(spte)) { + gfn_t gfn = get_mmio_spte_gfn(spte); + unsigned access = get_mmio_spte_access(spte); + + if (direct) + addr = 0; + vcpu_cache_mmio_info(vcpu, addr, gfn, access); + return 1; + } + + /* + * It's ok if the gva is remapped by other cpus on shadow guest, + * it's a BUG if the gfn is not a mmio page. + */ + if (direct && !check_direct_spte_mmio_pf(spte)) + return -1; + + /* + * If the page table is zapped by other cpus, let CPU fault again on + * the address. + */ + return 0; +} +EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); + +static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, + u32 error_code, bool direct) +{ + int ret; + + ret = handle_mmio_page_fault_common(vcpu, addr, direct); + WARN_ON(ret < 0); + return ret; +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) { @@ -2820,6 +2974,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int r; pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); + + if (unlikely(error_code & PFERR_RSVD_MASK)) + return handle_mmio_page_fault(vcpu, gva, error_code, true); + r = mmu_topup_memory_caches(vcpu); if (r) return r; @@ -2896,6 +3054,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + if (unlikely(error_code & PFERR_RSVD_MASK)) + return handle_mmio_page_fault(vcpu, gpa, error_code, true); + r = mmu_topup_memory_caches(vcpu); if (r) return r; @@ -2993,6 +3154,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; } +static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, + int *nr_present) +{ + if (unlikely(is_mmio_spte(*sptep))) { + if (gfn != get_mmio_spte_gfn(*sptep)) { + mmu_spte_clear_no_track(sptep); + return true; + } + + (*nr_present)++; + mark_mmio_spte(sptep, gfn, access); + return true; + } + + return false; +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 05310b1..e374db9 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -49,6 +49,8 @@ #define PFERR_FETCH_MASK (1U << 4) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); +int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67998d3..507e2b8 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -577,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); + if (unlikely(error_code & PFERR_RSVD_MASK)) + return handle_mmio_page_fault(vcpu, addr, error_code, + mmu_is_nested(vcpu)); + r = mmu_topup_memory_caches(vcpu); if (r) return r; @@ -684,7 +688,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) --vcpu->kvm->stat.lpages; drop_spte(vcpu->kvm, sptep); need_flush = 1; - } + } else if (is_mmio_spte(*sptep)) + mmu_spte_clear_no_track(sptep); break; } @@ -780,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gpa_t pte_gpa; gfn_t gfn; - if (!is_shadow_present_pte(sp->spt[i])) + if (!sp->spt[i]) continue; pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); @@ -789,13 +794,18 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - gfn = gpte_to_gfn(gpte); - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } + gfn = gpte_to_gfn(gpte); + pte_access = sp->role.access; + pte_access &= FNAME(gpte_access)(vcpu, gpte, true); + + if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) + continue; + if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); vcpu->kvm->tlbs_dirty++; @@ -803,8 +813,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) } nr_present++; - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, - true); + host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a644acb..e65a158 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3594,6 +3594,17 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static void ept_set_mmio_spte_mask(void) +{ + /* + * EPT Misconfigurations can be generated if the value of bits 2:0 + * of an EPT paging-structure entry is 110b (write/execute). + * Also, magic bits (0xffull << 49) is set to quickly identify mmio + * spte. + */ + kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); +} + /* * Sets up the vmcs for emulated real mode. */ @@ -4671,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { u64 sptes[4]; - int nr_sptes, i; + int nr_sptes, i, ret; gpa_t gpa; gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + ret = handle_mmio_page_fault_common(vcpu, gpa, true); + if (likely(ret == 1)) + return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == + EMULATE_DONE; + if (unlikely(!ret)) + return 1; + + /* It is the real ept misconfig */ printk(KERN_ERR "EPT: Misconfiguration.\n"); printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); @@ -7102,6 +7121,7 @@ static int __init vmx_init(void) if (enable_ept) { kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK); + ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else kvm_disable_tdp(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 64c42d9..2c9661f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5062,6 +5062,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); +static void kvm_set_mmio_spte_mask(void) +{ + u64 mask; + int maxphyaddr = boot_cpu_data.x86_phys_bits; + + /* + * Set the reserved bits and the present bit of an paging-structure + * entry to generate page fault with PFER.RSV = 1. + */ + mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; + mask |= 1ull; + +#ifdef CONFIG_X86_64 + /* + * If reserved bit is not supported, clear the present bit to disable + * mmio page fault. + */ + if (maxphyaddr == 52) + mask &= ~1ull; +#endif + + kvm_mmu_set_mmio_spte_mask(mask); +} + int kvm_arch_init(void *opaque) { int r; @@ -5088,6 +5112,7 @@ int kvm_arch_init(void *opaque) if (r) goto out; + kvm_set_mmio_spte_mask(); kvm_init_msr_list(); kvm_x86_ops = ops; -- cgit v1.1 From 4f0226482d20f104e943ee9e6f1218b573953f63 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:34:24 +0800 Subject: KVM: MMU: trace mmio page fault Add tracepoints to trace mmio page fault Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++++ arch/x86/kvm/mmutrace.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/trace.h | 23 +++++++++++++++++++++++ arch/x86/kvm/x86.c | 5 ++++- 4 files changed, 80 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4e22df6..9335e1b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -211,6 +211,7 @@ static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) { access &= ACC_WRITE_MASK | ACC_USER_MASK; + trace_mark_mmio_spte(sptep, gfn, access); mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); } @@ -1940,6 +1941,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, kvm_mmu_isolate_pages(invalid_list); sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); list_del_init(invalid_list); + + trace_kvm_mmu_delay_free_pages(sp); call_rcu(&sp->rcu, free_pages_rcu); return; } @@ -2938,6 +2941,8 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (direct) addr = 0; + + trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); return 1; } diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index b60b4fd..eed67f3 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_ARGS(sp) ); +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages, + TP_PROTO(struct kvm_mmu_page *sp), + + TP_ARGS(sp) +); + +TRACE_EVENT( + mark_mmio_spte, + TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), + TP_ARGS(sptep, gfn, access), + + TP_STRUCT__entry( + __field(void *, sptep) + __field(gfn_t, gfn) + __field(unsigned, access) + ), + + TP_fast_assign( + __entry->sptep = sptep; + __entry->gfn = gfn; + __entry->access = access; + ), + + TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn, + __entry->access) +); + +TRACE_EVENT( + handle_mmio_page_fault, + TP_PROTO(u64 addr, gfn_t gfn, unsigned access), + TP_ARGS(addr, gfn, access), + + TP_STRUCT__entry( + __field(u64, addr) + __field(gfn_t, gfn) + __field(unsigned, access) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->gfn = gfn; + __entry->access = access; + ), + + TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, + __entry->access) +); + TRACE_EVENT( kvm_mmu_audit, TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 624f8cb..3ff898c 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn, #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) +TRACE_EVENT( + vcpu_match_mmio, + TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match), + TP_ARGS(gva, gpa, write, gpa_match), + + TP_STRUCT__entry( + __field(gva_t, gva) + __field(gpa_t, gpa) + __field(bool, write) + __field(bool, gpa_match) + ), + + TP_fast_assign( + __entry->gva = gva; + __entry->gpa = gpa; + __entry->write = write; + __entry->gpa_match = gpa_match + ), + + TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa, + __entry->write ? "Write" : "Read", + __entry->gpa_match ? "GPA" : "GVA") +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2c9661f..84a28ea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4021,6 +4021,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, vcpu->arch.access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); + trace_vcpu_match_mmio(gva, *gpa, write, false); return 1; } @@ -4036,8 +4037,10 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) return 1; - if (vcpu_match_mmio_gpa(vcpu, *gpa)) + if (vcpu_match_mmio_gpa(vcpu, *gpa)) { + trace_vcpu_match_mmio(gva, *gpa, write, true); return 1; + } return 0; } -- cgit v1.1 From 66574cc05438dd0907029075d7e6ec5ac0036fbc Mon Sep 17 00:00:00 2001 From: Jonas Bonn Date: Thu, 30 Jun 2011 21:22:12 +0200 Subject: modules: make arch's use default loader hooks This patch removes all the module loader hook implementations in the architecture specific code where the functionality is the same as that now provided by the recently added default hooks. Signed-off-by: Jonas Bonn Acked-by: Mike Frysinger Acked-by: Geert Uytterhoeven Tested-by: Michal Simek Signed-off-by: Rusty Russell --- arch/x86/kernel/module.c | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 52f256f..925179f 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -45,21 +45,6 @@ void *module_alloc(unsigned long size) -1, __builtin_return_address(0)); } -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); -} - -/* We don't need anything special. */ -int module_frob_arch_sections(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - char *secstrings, - struct module *mod) -{ - return 0; -} - #ifdef CONFIG_X86_32 int apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, @@ -100,17 +85,6 @@ int apply_relocate(Elf32_Shdr *sechdrs, } return 0; } - -int apply_relocate_add(Elf32_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", - me->name); - return -ENOEXEC; -} #else /*X86_64*/ int apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, @@ -181,17 +155,6 @@ overflow: me->name); return -ENOEXEC; } - -int apply_relocate(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - printk(KERN_ERR "non add relocation not supported\n"); - return -ENOSYS; -} - #endif int module_finalize(const Elf_Ehdr *hdr, -- cgit v1.1 From d8d01a6378db6e3eeb8777a8769c69e455635dc3 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sun, 24 Jul 2011 18:33:21 +0100 Subject: x86, olpc: Fix dependency on POWER_SUPPLY As reported by Randy Dunlap, CONFIG_POWER_SUPPLY=m caused a compile error: arch/x86/built-in.o: In function `battery_status_changed': olpc-xo15-sci.c:(.text+0x3acdd): undefined reference to `power_supply_get_by_name' olpc-xo15-sci.c:(.text+0x3ad04): undefined reference to `power_supply_changed' The SCI drivers, as bool, require POWER_SUPPLY to be builtin. Use select to make that a hard requirement and avoid this build failure. Reported-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Daniel Drake Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8af5ba8..b198c01 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2089,7 +2089,8 @@ config OLPC_XO1_RTC config OLPC_XO1_SCI bool "OLPC XO-1 SCI extras" - depends on OLPC && OLPC_XO1_PM && POWER_SUPPLY + depends on OLPC && OLPC_XO1_PM + select POWER_SUPPLY select GPIO_CS5535 select MFD_CORE ---help--- @@ -2103,7 +2104,8 @@ config OLPC_XO1_SCI config OLPC_XO15_SCI bool "OLPC XO-1.5 SCI extras" - depends on OLPC && ACPI && POWER_SUPPLY + depends on OLPC && ACPI + select POWER_SUPPLY ---help--- Add support for SCI-based features of the OLPC XO-1.5 laptop: - EC-driven system wakeups -- cgit v1.1 From 07d5b38e14b7ff98eb52e4a6db4e20abcc608da3 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sun, 24 Jul 2011 18:34:30 +0100 Subject: x86, olpc-xo15-sci: Enable EC wakeup capability Some recent changes to the way that ACPI handles wakeup flags means that the XO15EC ACPI device is not wakeup-capable by default so device_set_wakeup_enable() does nothing. Use device_init_wakeup() to mark the device as wakeup capable, and to enable wakeups. Signed-off-by: Daniel Drake Link: http://lkml.kernel.org/r/20110724173430.BE03C9D401C@zog.reactivated.net Signed-off-by: Ingo Molnar --- arch/x86/platform/olpc/olpc-xo15-sci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index a5990eb..2b235b7 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -120,7 +120,7 @@ static int xo15_sci_add(struct acpi_device *device) /* Enable wake-on-EC */ if (device->wakeup.flags.valid) - device_set_wakeup_enable(&device->dev, true); + device_init_wakeup(&device->dev, true); return 0; } -- cgit v1.1 From b3c4b9825075b680817b9460184e7b576dc97597 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 25 Jul 2011 15:51:02 -0700 Subject: xen/tracing: fix compile errors when tracing is disabled. When CONFIG_FUNCTION_TRACER is disabled, compilation fails as follows: CC arch/x86/xen/setup.o In file included from arch/x86/include/asm/xen/hypercall.h:42, from arch/x86/xen/setup.c:19: include/trace/events/xen.h:31: warning: 'struct multicall_entry' declared inside parameter list include/trace/events/xen.h:31: warning: its scope is only this definition or declaration, which is probably not what you want include/trace/events/xen.h:31: warning: 'struct multicall_entry' declared inside parameter list include/trace/events/xen.h:31: warning: 'struct multicall_entry' declared inside parameter list include/trace/events/xen.h:31: warning: 'struct multicall_entry' declared inside parameter list [...] arch/x86/xen/trace.c:5: error: '__HYPERVISOR_set_trap_table' undeclared here (not in a function) arch/x86/xen/trace.c:5: error: array index in initializer not of integer type arch/x86/xen/trace.c:5: error: (near initialization for 'xen_hypercall_names') arch/x86/xen/trace.c:6: error: '__HYPERVISOR_mmu_update' undeclared here (not in a function) arch/x86/xen/trace.c:6: error: array index in initializer not of integer type arch/x86/xen/trace.c:6: error: (near initialization for 'xen_hypercall_names') Fix this by making sure struct multicall_entry has a declaration in scope at all times, and don't bother compiling xen/trace.c when tracing is disabled. Reported-by: Randy Dunlap Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 6fdf366..ce69201 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -13,7 +13,9 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o trace.o + p2m.o + +obj-$(CONFIG_FUNCTION_TRACER) += trace.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o -- cgit v1.1 From 0e9a6cb5e66f4b23e2a8f6b3f00949b7b3125dda Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 Jul 2011 16:08:31 -0700 Subject: ptrace: unify show_regs() prototype [ poleg@redhat.com: no need to declare show_regs() in ptrace.h, sched.h does this ] Signed-off-by: Mike Frysinger Cc: Tejun Heo Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/kdebug.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index fe2cc6e..d73f157 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -28,7 +28,6 @@ extern void show_registers(struct pt_regs *regs); extern void show_trace(struct task_struct *t, struct pt_regs *regs, unsigned long *sp, unsigned long bp); extern void __show_regs(struct pt_regs *regs, int all); -extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); #ifdef CONFIG_KEXEC -- cgit v1.1 From 148817ba092f9f6edd35bad3c6c6b8e8f90fe2ed Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 26 Jul 2011 16:09:04 -0700 Subject: asm-generic: add another generic ext2 atomic bitops The majority of architectures implement ext2 atomic bitops as test_and_{set,clear}_bit() without spinlock. This adds this type of generic implementation in ext2-atomic-setbit.h and use it wherever possible. Signed-off-by: Akinobu Mita Suggested-by: Andreas Dilger Suggested-by: Arnd Bergmann Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/bitops.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 69d5813..1775d6e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -458,10 +458,7 @@ static inline int fls(int x) #include -#define ext2_set_bit_atomic(lock, nr, addr) \ - test_and_set_bit((nr), (unsigned long *)(addr)) -#define ext2_clear_bit_atomic(lock, nr, addr) \ - test_and_clear_bit((nr), (unsigned long *)(addr)) +#include #endif /* __KERNEL__ */ #endif /* _ASM_X86_BITOPS_H */ -- cgit v1.1 From 60063497a95e716c9a689af3be2687d261f115b4 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Tue, 26 Jul 2011 16:09:06 -0700 Subject: atomic: use This allows us to move duplicated code in (atomic_inc_not_zero() for now) to Signed-off-by: Arun Sharma Reviewed-by: Eric Dumazet Cc: Ingo Molnar Cc: David Miller Cc: Eric Dumazet Acked-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/atomic.h | 1 - arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/include/asm/local.h | 2 +- arch/x86/include/asm/mce.h | 2 +- arch/x86/include/asm/mmu_context.h | 2 +- arch/x86/include/asm/prom.h | 2 +- arch/x86/include/asm/spinlock.h | 2 +- arch/x86/include/asm/thread_info.h | 2 +- arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/i8259.c | 2 +- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/timer.c | 2 +- arch/x86/lib/atomic64_32.c | 2 +- arch/x86/mm/mmio-mod.c | 2 +- 21 files changed, 20 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 5852519..f6f5c53 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4a0b7c7..7b3ca83 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 952a826..897969b 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -244,7 +244,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* * atomic_dec_if_positive - decrement by 1 if old value positive diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 13f5504..0919905 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 2e99724..9cdae5d 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include typedef struct { diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 716b48a..c9321f3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -124,7 +124,7 @@ extern struct atomic_notifier_head x86_mce_decoder_chain; #include #include -#include +#include extern int mce_disabled; extern int mce_p5_enabled; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 8b5393e..6902152 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -2,7 +2,7 @@ #define _ASM_X86_MMU_CONTEXT_H #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index df12870..644dd885 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index e9e51f7..ee67edf 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_SPINLOCK_H #define _ASM_X86_SPINLOCK_H -#include +#include #include #include #include diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1f2e61e..a1fe5c1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -21,7 +21,7 @@ struct task_struct; struct exec_domain; #include #include -#include +#include struct thread_info { struct task_struct *task; /* main task structure */ diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b117efd..8a439d3 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b24be38..52fa563 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 9536b3f..5d513bc 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -48,7 +48,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 22a073d..6218439 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 65b8f5c..6104852 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f09d4bb..b3300e6 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fbc097a..9682ec5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -49,7 +49,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2b2255b..57dcbd4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index abd86e86..ae432ea 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "kvm_timer.h" static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 540179e..042f682 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -4,7 +4,7 @@ #include #include -#include +#include long long atomic64_read_cx8(long long, const atomic64_t *v); EXPORT_SYMBOL(atomic64_read_cx8); diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 3adff7d..67421f3 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -34,7 +34,7 @@ #include #include #include /* for ISA_START_ADDRESS */ -#include +#include #include #include -- cgit v1.1 From f24219b4e90cf70ec4a211b17fbabc725a0ddf3c Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Tue, 26 Jul 2011 16:09:07 -0700 Subject: atomic: move atomic_add_unless to generic code This is in preparation for more generic atomic primitives based on __atomic_add_unless. Signed-off-by: Arun Sharma Signed-off-by: Hans-Christian Egtvedt Reviewed-by: Eric Dumazet Cc: Ingo Molnar Cc: David Miller Acked-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/atomic.h | 8 ++++---- arch/x86/include/asm/atomic64_32.h | 2 +- arch/x86/include/asm/atomic64_64.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 897969b..5fe9cb3 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -221,15 +221,15 @@ static inline int atomic_xchg(atomic_t *v, int new) } /** - * atomic_add_unless - add unless the number is already a given value + * __atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. * * Atomically adds @a to @v, so long as @v was not already @u. - * Returns non-zero if @v was not @u, and zero otherwise. + * Returns the old value of @v. */ -static inline int atomic_add_unless(atomic_t *v, int a, int u) +static inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); @@ -241,7 +241,7 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) break; c = old; } - return c != (u); + return c; } diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 2a934aa..24098aa 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -263,7 +263,7 @@ static inline int atomic64_add_negative(long long i, atomic64_t *v) * @u: ...unless v is equal to u. * * Atomically adds @a to @v, so long as it was not @u. - * Returns non-zero if @v was not @u, and zero otherwise. + * Returns the old value of @v. */ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) { diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 49fd1ea..017594d 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -202,7 +202,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new) * @u: ...unless v is equal to u. * * Atomically adds @a to @v, so long as it was not @u. - * Returns non-zero if @v was not @u, and zero otherwise. + * Returns the old value of @v. */ static inline int atomic64_add_unless(atomic64_t *v, long a, long u) { -- cgit v1.1 From 7847777a45f9f8bfc8617dbf107bde1ecb59caee Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Tue, 26 Jul 2011 16:09:08 -0700 Subject: atomic: cleanup asm-generic atomic*.h inclusion After changing all consumers of atomics to include , we ran into some compile time errors due to this dependency chain: linux/atomic.h -> asm/atomic.h -> asm-generic/atomic-long.h where atomic-long.h could use funcs defined later in linux/atomic.h without a prototype. This patches moves the code that includes asm-generic/atomic*.h to linux/atomic.h. Archs that need need to select CONFIG_GENERIC_ATOMIC64 from now on (some of them used to include it unconditionally). Compile tested on i386 and x86_64 with allnoconfig. Signed-off-by: Arun Sharma Cc: Eric Dumazet Cc: Ingo Molnar Cc: David Miller Acked-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/atomic.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 5fe9cb3..10572e3 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -318,5 +318,4 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2) # include "atomic64_64.h" #endif -#include #endif /* _ASM_X86_ATOMIC_H */ -- cgit v1.1 From fd079facb3fdd1b0517f0b2087ac05c30ea09cfe Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 25 Jul 2011 11:01:09 -0700 Subject: KVM: fix TASK_DELAY_ACCT kconfig warning Fix kconfig dependency warning: warning: (KVM) selects TASK_DELAY_ACCT which has unmet direct dependencies (TASKSTATS) Signed-off-by: Randy Dunlap Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 988724b..0a09b58 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -31,6 +31,7 @@ config KVM select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO + select TASKSTATS select TASK_DELAY_ACCT ---help--- Support hosting fully virtualized guest machines using hardware -- cgit v1.1 From 628c6246d47b85f5357298601df2444d7f4dd3fd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 31 Jul 2011 13:59:29 -0700 Subject: x86, random: Architectural inlines to get random integers with RDRAND Architectural inlines to get random ints and longs using the RDRAND instruction. Intel has introduced a new RDRAND instruction, a Digital Random Number Generator (DRNG), which is functionally an high bandwidth entropy source, cryptographic whitener, and integrity monitor all built into hardware. This enables RDRAND to be used directly, bypassing the kernel random number pool. For technical documentation, see: http://software.intel.com/en-us/articles/download-the-latest-bull-mountain-software-implementation-guide/ In this patch, this is *only* used for the nonblocking random number pool. RDRAND is a nonblocking source, similar to our /dev/urandom, and is therefore not a direct replacement for /dev/random. The architectural hooks presented in the previous patch only feed the kernel internal users, which only use the nonblocking pool, and so this is not a problem. Since this instruction is available in userspace, there is no reason to have a /dev/hw_rng device driver for the purpose of feeding rngd. This is especially so since RDRAND is a nonblocking source, and needs additional whitening and reduction (see the above technical documentation for details) in order to be of "pure entropy source" quality. The CONFIG_EXPERT compile-time option can be used to disable this use of RDRAND. Signed-off-by: H. Peter Anvin Originally-by: Fenghua Yu Cc: Matt Mackall Cc: Herbert Xu Cc: "Theodore Ts'o" --- arch/x86/Kconfig | 9 +++++ arch/x86/include/asm/archrandom.h | 73 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 arch/x86/include/asm/archrandom.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 37357a5..a0e9bda7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1451,6 +1451,15 @@ config ARCH_USES_PG_UNCACHED def_bool y depends on X86_PAT +config ARCH_RANDOM + def_bool y + prompt "x86 architectural random number generator" if EXPERT + ---help--- + Enable the x86 architectural RDRAND instruction + (Intel Bull Mountain technology) to generate random numbers. + If supported, this is a high bandwidth, cryptographically + secure hardware random number generator. + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h new file mode 100644 index 0000000..b7b5bc0 --- /dev/null +++ b/arch/x86/include/asm/archrandom.h @@ -0,0 +1,73 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu , + * H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#ifndef ASM_X86_ARCHRANDOM_H +#define ASM_X86_ARCHRANDOM_H + +#include +#include +#include +#include + +#define RDRAND_RETRY_LOOPS 10 + +#define RDRAND_INT ".byte 0x0f,0xc7,0xf0" +#ifdef CONFIG_X86_64 +# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" +#else +# define RDRAND_LONG RDRAND_INT +#endif + +#ifdef CONFIG_ARCH_RANDOM + +#define GET_RANDOM(name, type, rdrand, nop) \ +static inline int name(type *v) \ +{ \ + int ok; \ + alternative_io("movl $0, %0\n\t" \ + nop, \ + "\n1: " rdrand "\n\t" \ + "jc 2f\n\t" \ + "decl %0\n\t" \ + "jnz 1b\n\t" \ + "2:", \ + X86_FEATURE_RDRAND, \ + ASM_OUTPUT2("=r" (ok), "=a" (*v)), \ + "0" (RDRAND_RETRY_LOOPS)); \ + return ok; \ +} + +#ifdef CONFIG_X86_64 + +GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5); +GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4); + +#else + +GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3); +GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); + +#endif /* CONFIG_X86_64 */ + +#endif /* CONFIG_ARCH_RANDOM */ + +#endif /* ASM_X86_ARCHRANDOM_H */ -- cgit v1.1 From 49d859d78c5aeb998b6936fcb5f288f78d713489 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 31 Jul 2011 14:02:19 -0700 Subject: x86, random: Verify RDRAND functionality and allow it to be disabled If the CPU declares that RDRAND is available, go through a guranteed reseed sequence, and make sure that it is actually working (producing data.) If it does not, disable the CPU feature flag. Allow RDRAND to be disabled on the command line (as opposed to at compile time) for a user who has special requirements with regards to random numbers. Signed-off-by: H. Peter Anvin Cc: Matt Mackall Cc: Herbert Xu Cc: "Theodore Ts'o" --- arch/x86/include/asm/archrandom.h | 2 ++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/cpu/rdrand.c | 73 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+) create mode 100644 arch/x86/kernel/cpu/rdrand.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index b7b5bc0..0d9ec77 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -70,4 +70,6 @@ GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); #endif /* CONFIG_ARCH_RANDOM */ +extern void x86_init_rdrand(struct cpuinfo_x86 *c); + #endif /* ASM_X86_ARCHRANDOM_H */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6042981..0e3a82a 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -15,6 +15,7 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o +obj-y += rdrand.o obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 22a073d..8dbd929 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -857,6 +858,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif init_hypervisor(c); + x86_init_rdrand(c); /* * Clear/Set all flags overriden by options, need do it diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c new file mode 100644 index 0000000..feca286 --- /dev/null +++ b/arch/x86/kernel/cpu/rdrand.c @@ -0,0 +1,73 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu , + * H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include + +static int __init x86_rdrand_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_RDRAND); + return 1; +} +__setup("nordrand", x86_rdrand_setup); + +/* We can't use arch_get_random_long() here since alternatives haven't run */ +static inline int rdrand_long(unsigned long *v) +{ + int ok; + asm volatile("1: " RDRAND_LONG "\n\t" + "jc 2f\n\t" + "decl %0\n\t" + "jnz 1b\n\t" + "2:" + : "=r" (ok), "=a" (*v) + : "0" (RDRAND_RETRY_LOOPS)); + return ok; +} + +/* + * Force a reseed cycle; we are architecturally guaranteed a reseed + * after no more than 512 128-bit chunks of random data. This also + * acts as a test of the CPU capability. + */ +#define RESEED_LOOP ((512*128)/sizeof(unsigned long)) + +void __cpuinit x86_init_rdrand(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_ARCH_RANDOM + unsigned long tmp; + int i, count, ok; + + if (!cpu_has(c, X86_FEATURE_RDRAND)) + return; /* Nothing to do */ + + for (count = i = 0; i < RESEED_LOOP; i++) { + ok = rdrand_long(&tmp); + if (ok) + count++; + } + + if (count != RESEED_LOOP) + clear_cpu_cap(c, X86_FEATURE_RDRAND); +#endif +} -- cgit v1.1 From b03e7495a862b028294f59fc87286d6d78ee7fa1 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Wed, 20 Jul 2011 15:20:54 -0500 Subject: PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index ae3cb23..c953302 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -360,6 +360,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) } } + /* After the PCI-E bus has been walked and all devices discovered, + * configure any settings of the fabric that might be necessary. + */ + if (bus) { + struct pci_bus *child; + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child, child->self->pcie_mpss); + } + if (!bus) kfree(sd); -- cgit v1.1 From 1d12d35284b74b7257b84ba0cef1e82a66d73aea Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Mon, 1 Aug 2011 11:08:59 -0400 Subject: oprofile, x86: Convert memory allocation to static array On -rt, allocators don't work from atomic context any more, and the maximum size of the array is known at compile time. Call trace on a -rt kernel: oprofile: using NMI interrupt. BUG: sleeping function called from invalid context at kernel/rtmutex.c:645 in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: kworker/0:1 Pid: 0, comm: kworker/0:1 Tainted: G C 3.0.0-rt3-patser+ #39 Call Trace: [] __might_sleep+0xca/0xf0 [] rt_spin_lock+0x24/0x40 [] __kmalloc+0xc7/0x370 [] ? ppro_setup_ctrs+0x215/0x260 [oprofile] [] ? oprofile_cpu_notifier+0x60/0x60 [oprofile] [] ppro_setup_ctrs+0x215/0x260 [oprofile] [] ? oprofile_cpu_notifier+0x60/0x60 [oprofile] [] ? oprofile_cpu_notifier+0x60/0x60 [oprofile] [] nmi_cpu_setup+0xc4/0x110 [oprofile] [] generic_smp_call_function_interrupt+0x95/0x190 [] smp_call_function_interrupt+0x27/0x40 [] call_function_interrupt+0x13/0x20 [] ? plist_check_head+0x54/0xc0 [] ? intel_idle+0xc8/0x120 [] ? intel_idle+0xa7/0x120 [] cpuidle_idle_call+0xb0/0x230 [] cpu_idle+0x8b/0xe0 [] start_secondary+0x1d3/0x1d8 Signed-off-by: Maarten Lankhorst Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_ppro.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 94b7450..608874b7 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -28,7 +28,7 @@ static int counter_width = 32; #define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) -static u64 *reset_value; +static u64 reset_value[OP_MAX_COUNTER]; static void ppro_shutdown(struct op_msrs const * const msrs) { @@ -40,10 +40,6 @@ static void ppro_shutdown(struct op_msrs const * const msrs) release_perfctr_nmi(MSR_P6_PERFCTR0 + i); release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } - if (reset_value) { - kfree(reset_value); - reset_value = NULL; - } } static int ppro_fill_in_addresses(struct op_msrs * const msrs) @@ -79,13 +75,6 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, u64 val; int i; - if (!reset_value) { - reset_value = kzalloc(sizeof(reset_value[0]) * num_counters, - GFP_ATOMIC); - if (!reset_value) - return; - } - if (cpu_has_arch_perfmon) { union cpuid10_eax eax; eax.full = cpuid_eax(0xa); @@ -141,13 +130,6 @@ static int ppro_check_ctrs(struct pt_regs * const regs, u64 val; int i; - /* - * This can happen if perf counters are in use when - * we steal the die notifier NMI. - */ - if (unlikely(!reset_value)) - goto out; - for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; @@ -179,8 +161,6 @@ static void ppro_start(struct op_msrs const * const msrs) u64 val; int i; - if (!reset_value) - return; for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { rdmsrl(msrs->controls[i].addr, val); @@ -196,8 +176,6 @@ static void ppro_stop(struct op_msrs const * const msrs) u64 val; int i; - if (!reset_value) - return; for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; -- cgit v1.1 From df013ffb8119c89f062ab05b7f544704315db47b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2011 13:14:22 +0800 Subject: Add Kconfig option ARCH_HAVE_NMI_SAFE_CMPXCHG cmpxchg() is widely used by lockless code, including NMI-safe lockless code. But on some architectures, the cmpxchg() implementation is not NMI-safe, on these architectures the lockless code may need a spin_trylock_irqsave() based implementation. This patch adds a Kconfig option: ARCH_HAVE_NMI_SAFE_CMPXCHG, so that NMI-safe lockless code can depend on it or provide different implementation according to it. On many architectures, cmpxchg is only NMI-safe for several specific operand sizes. So, ARCH_HAVE_NMI_SAFE_CMPXCHG define in this patch only guarantees cmpxchg is NMI-safe for sizeof(unsigned long). Signed-off-by: Huang Ying Acked-by: Mike Frysinger Acked-by: Paul Mundt Acked-by: Hans-Christian Egtvedt Acked-by: Benjamin Herrenschmidt Acked-by: Chris Metcalf Acked-by: Richard Henderson CC: Mikael Starvik Acked-by: David Howells CC: Yoshinori Sato CC: Tony Luck CC: Hirokazu Takata CC: Geert Uytterhoeven CC: Michal Simek Acked-by: Ralf Baechle CC: Kyle McMartin CC: Martin Schwidefsky CC: Chen Liqin CC: "David S. Miller" CC: Ingo Molnar CC: Chris Zankel Signed-off-by: Len Brown --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da34972..a680a60 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -70,6 +70,7 @@ config X86 select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if (X86_64 && NET) + select ARCH_HAVE_NMI_SAFE_CMPXCHG config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) -- cgit v1.1 From 6dccf9c508d5d773859df1cc2dce75c5b19e35a0 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 12 Jul 2011 22:29:32 -0400 Subject: mrst_pmu: driver for Intel Moorestown Power Management Unit The Moorestown (MRST) Power Management Unit (PMU) driver directs the SOC power states in the "Langwell" south complex (SCU). It hooks pci_platform_pm_ops[] and thus observes all PCI ".set_state" requests. For devices in the SC, the pmu driver translates those PCI requests into the appropriate commands for the SCU. The PMU driver helps implement S0i3, a deep system idle power idle state. Entry into S0i3 is via cpuidle, just like regular processor c-states. S0i3 depends on pre-conditions including uni-processor, graphics off, and certain IO devices in the SC must be off. If those pre-conditions are met, then the PMU allows cpuidle to enter S0i3, otherwise such requests are demoted, either to Atom C4 or Atom C6. This driver is based on prototype work by Bruce Flemming, Illyas Mansoor, Rajeev D. Muralidhar, Vishwesh M. Rudramuni, Hari Seshadri and Sujith Thomas. The current driver also includes contributions from H. Peter Anvin, Arjan van de Ven, Kristen Accardi, and Yong Wang. Thanks for additional review feedback from Alan Cox and Randy Dunlap. Acked-by: Alan Cox Acked-by: H. Peter Anvin Signed-off-by: Len Brown --- arch/x86/platform/mrst/Makefile | 1 + arch/x86/platform/mrst/pmu.c | 817 ++++++++++++++++++++++++++++++++++++++++ arch/x86/platform/mrst/pmu.h | 234 ++++++++++++ 3 files changed, 1052 insertions(+) create mode 100644 arch/x86/platform/mrst/pmu.c create mode 100644 arch/x86/platform/mrst/pmu.h (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index f61ccdd..1ea3877 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_X86_MRST) += mrst.o obj-$(CONFIG_X86_MRST) += vrtc.o obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o +obj-$(CONFIG_X86_MRST) += pmu.o diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c new file mode 100644 index 0000000..9281da7 --- /dev/null +++ b/arch/x86/platform/mrst/pmu.c @@ -0,0 +1,817 @@ +/* + * mrst/pmu.c - driver for MRST Power Management Unit + * + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pmu.h" + +#define IPCMSG_FW_REVISION 0xF4 + +struct mrst_device { + u16 pci_dev_num; /* DEBUG only */ + u16 lss; + u16 latest_request; + unsigned int pci_state_counts[PCI_D3cold + 1]; /* DEBUG only */ +}; + +/* + * comlete list of MRST PCI devices + */ +static struct mrst_device mrst_devs[] = { +/* 0 */ { 0x0800, LSS_SPI0 }, /* Moorestown SPI Ctrl 0 */ +/* 1 */ { 0x0801, LSS_SPI1 }, /* Moorestown SPI Ctrl 1 */ +/* 2 */ { 0x0802, LSS_I2C0 }, /* Moorestown I2C 0 */ +/* 3 */ { 0x0803, LSS_I2C1 }, /* Moorestown I2C 1 */ +/* 4 */ { 0x0804, LSS_I2C2 }, /* Moorestown I2C 2 */ +/* 5 */ { 0x0805, LSS_KBD }, /* Moorestown Keyboard Ctrl */ +/* 6 */ { 0x0806, LSS_USB_HC }, /* Moorestown USB Ctrl */ +/* 7 */ { 0x0807, LSS_SD_HC0 }, /* Moorestown SD Host Ctrl 0 */ +/* 8 */ { 0x0808, LSS_SD_HC1 }, /* Moorestown SD Host Ctrl 1 */ +/* 9 */ { 0x0809, LSS_NAND }, /* Moorestown NAND Ctrl */ +/* 10 */ { 0x080a, LSS_AUDIO }, /* Moorestown Audio Ctrl */ +/* 11 */ { 0x080b, LSS_IMAGING }, /* Moorestown ISP */ +/* 12 */ { 0x080c, LSS_SECURITY }, /* Moorestown Security Controller */ +/* 13 */ { 0x080d, LSS_DISPLAY }, /* Moorestown External Displays */ +/* 14 */ { 0x080e, 0 }, /* Moorestown SCU IPC */ +/* 15 */ { 0x080f, LSS_GPIO }, /* Moorestown GPIO Controller */ +/* 16 */ { 0x0810, 0 }, /* Moorestown Power Management Unit */ +/* 17 */ { 0x0811, LSS_USB_OTG }, /* Moorestown OTG Ctrl */ +/* 18 */ { 0x0812, LSS_SPI2 }, /* Moorestown SPI Ctrl 2 */ +/* 19 */ { 0x0813, 0 }, /* Moorestown SC DMA */ +/* 20 */ { 0x0814, LSS_AUDIO_LPE }, /* Moorestown LPE DMA */ +/* 21 */ { 0x0815, LSS_AUDIO_SSP }, /* Moorestown SSP0 */ + +/* 22 */ { 0x084F, LSS_SD_HC2 }, /* Moorestown SD Host Ctrl 2 */ + +/* 23 */ { 0x4102, 0 }, /* Lincroft */ +/* 24 */ { 0x4110, 0 }, /* Lincroft */ +}; + +/* n.b. We ignore PCI-id 0x815 in LSS9 b/c MeeGo has no driver for it */ +static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0}; +static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803, + 0x0804, 0x0805, 0x080f, 0}; + +/* handle concurrent SMP invokations of pmu_pci_set_power_state() */ +static spinlock_t mrst_pmu_power_state_lock; + +static unsigned int wake_counters[MRST_NUM_LSS]; /* DEBUG only */ +static unsigned int pmu_irq_stats[INT_INVALID + 1]; /* DEBUG only */ + +static int graphics_is_off; +static int lss_s0i3_enabled; +static bool mrst_pmu_s0i3_enable; + +/* debug counters */ +static u32 pmu_wait_ready_calls; +static u32 pmu_wait_ready_udelays; +static u32 pmu_wait_ready_udelays_max; +static u32 pmu_wait_done_calls; +static u32 pmu_wait_done_udelays; +static u32 pmu_wait_done_udelays_max; +static u32 pmu_set_power_state_entry; +static u32 pmu_set_power_state_send_cmd; + +static struct mrst_device *pci_id_2_mrst_dev(u16 pci_dev_num) +{ + int index = 0; + + if ((pci_dev_num >= 0x0800) && (pci_dev_num <= 0x815)) + index = pci_dev_num - 0x800; + else if (pci_dev_num == 0x084F) + index = 22; + else if (pci_dev_num == 0x4102) + index = 23; + else if (pci_dev_num == 0x4110) + index = 24; + + if (pci_dev_num != mrst_devs[index].pci_dev_num) { + WARN_ONCE(1, FW_BUG "Unknown PCI device 0x%04X\n", pci_dev_num); + return 0; + } + + return &mrst_devs[index]; +} + +/** + * mrst_pmu_validate_cstates + * @dev: cpuidle_device + * + * Certain states are not appropriate for governor to pick in some cases. + * This function will be called as cpuidle_device's prepare callback and + * thus tells governor to ignore such states when selecting the next state + * to enter. + */ + +#define IDLE_STATE4_IS_C6 4 +#define IDLE_STATE5_IS_S0I3 5 + +int mrst_pmu_invalid_cstates(void) +{ + int cpu = smp_processor_id(); + + /* + * Demote to C4 if the PMU is busy. + * Since LSS changes leave the busy bit clear... + * busy means either the PMU is waiting for an ACK-C6 that + * isn't coming due to an MWAIT that returned immediately; + * or we returned from S0i3 successfully, and the PMU + * is not done sending us interrupts. + */ + if (pmu_read_busy_status()) + return 1 << IDLE_STATE4_IS_C6 | 1 << IDLE_STATE5_IS_S0I3; + + /* + * Disallow S0i3 if: PMU is not initialized, or CPU1 is active, + * or if device LSS is insufficient, or the GPU is active, + * or if it has been explicitly disabled. + */ + if (!pmu_reg || !cpumask_equal(cpu_online_mask, cpumask_of(cpu)) || + !lss_s0i3_enabled || !graphics_is_off || !mrst_pmu_s0i3_enable) + return 1 << IDLE_STATE5_IS_S0I3; + else + return 0; +} + +/* + * pmu_update_wake_counters(): read PM_WKS, update wake_counters[] + * DEBUG only. + */ +static void pmu_update_wake_counters(void) +{ + int lss; + u32 wake_status; + + wake_status = pmu_read_wks(); + + for (lss = 0; lss < MRST_NUM_LSS; ++lss) { + if (wake_status & (1 << lss)) + wake_counters[lss]++; + } +} + +int mrst_pmu_s0i3_entry(void) +{ + int status; + + /* Clear any possible error conditions */ + pmu_write_ics(0x300); + + /* set wake control to current D-states */ + pmu_write_wssc(S0I3_SSS_TARGET); + + status = mrst_s0i3_entry(PM_S0I3_COMMAND, &pmu_reg->pm_cmd); + pmu_update_wake_counters(); + return status; +} + +/* poll for maximum of 5ms for busy bit to clear */ +static int pmu_wait_ready(void) +{ + int udelays; + + pmu_wait_ready_calls++; + + for (udelays = 0; udelays < 500; ++udelays) { + if (udelays > pmu_wait_ready_udelays_max) + pmu_wait_ready_udelays_max = udelays; + + if (pmu_read_busy_status() == 0) + return 0; + + udelay(10); + pmu_wait_ready_udelays++; + } + + /* + * if this fires, observe + * /sys/kernel/debug/mrst_pmu_wait_ready_calls + * /sys/kernel/debug/mrst_pmu_wait_ready_udelays + */ + WARN_ONCE(1, "SCU not ready for 5ms"); + return -EBUSY; +} +/* poll for maximum of 50ms us for busy bit to clear */ +static int pmu_wait_done(void) +{ + int udelays; + + pmu_wait_done_calls++; + + for (udelays = 0; udelays < 500; ++udelays) { + if (udelays > pmu_wait_done_udelays_max) + pmu_wait_done_udelays_max = udelays; + + if (pmu_read_busy_status() == 0) + return 0; + + udelay(100); + pmu_wait_done_udelays++; + } + + /* + * if this fires, observe + * /sys/kernel/debug/mrst_pmu_wait_done_calls + * /sys/kernel/debug/mrst_pmu_wait_done_udelays + */ + WARN_ONCE(1, "SCU not done for 50ms"); + return -EBUSY; +} + +u32 mrst_pmu_msi_is_disabled(void) +{ + return pmu_msi_is_disabled(); +} + +void mrst_pmu_enable_msi(void) +{ + pmu_msi_enable(); +} + +/** + * pmu_irq - pmu driver interrupt handler + * Context: interrupt context + */ +static irqreturn_t pmu_irq(int irq, void *dummy) +{ + union pmu_pm_ics pmu_ics; + + pmu_ics.value = pmu_read_ics(); + + if (!pmu_ics.bits.pending) + return IRQ_NONE; + + switch (pmu_ics.bits.cause) { + case INT_SPURIOUS: + case INT_CMD_DONE: + case INT_CMD_ERR: + case INT_WAKE_RX: + case INT_SS_ERROR: + case INT_S0IX_MISS: + case INT_NO_ACKC6: + pmu_irq_stats[pmu_ics.bits.cause]++; + break; + default: + pmu_irq_stats[INT_INVALID]++; + } + + pmu_write_ics(pmu_ics.value); /* Clear pending interrupt */ + + return IRQ_HANDLED; +} + +/* + * Translate PCI power management to MRST LSS D-states + */ +static int pci_2_mrst_state(int lss, pci_power_t pci_state) +{ + switch (pci_state) { + case PCI_D0: + if (SSMSK(D0i1, lss) & D0I1_ACG_SSS_TARGET) + return D0i1; + else + return D0; + case PCI_D1: + return D0i1; + case PCI_D2: + return D0i2; + case PCI_D3hot: + case PCI_D3cold: + return D0i3; + default: + WARN(1, "pci_state %d\n", pci_state); + return 0; + } +} + +static int pmu_issue_command(u32 pm_ssc) +{ + union pmu_pm_set_cfg_cmd_t command; + + if (pmu_read_busy_status()) { + pr_debug("pmu is busy, Operation not permitted\n"); + return -1; + } + + /* + * enable interrupts in PMU so that interrupts are + * propagated when ioc bit for a particular set + * command is set + */ + + pmu_irq_enable(); + + /* Configure the sub systems for pmu2 */ + + pmu_write_ssc(pm_ssc); + + /* + * Send the set config command for pmu its configured + * for mode CM_IMMEDIATE & hence with No Trigger + */ + + command.pmu2_params.d_param.cfg_mode = CM_IMMEDIATE; + command.pmu2_params.d_param.cfg_delay = 0; + command.pmu2_params.d_param.rsvd = 0; + + /* construct the command to send SET_CFG to particular PMU */ + command.pmu2_params.d_param.cmd = SET_CFG_CMD; + command.pmu2_params.d_param.ioc = 0; + command.pmu2_params.d_param.mode_id = 0; + command.pmu2_params.d_param.sys_state = SYS_STATE_S0I0; + + /* write the value of PM_CMD into particular PMU */ + pr_debug("pmu command being written %x\n", + command.pmu_pm_set_cfg_cmd_value); + + pmu_write_cmd(command.pmu_pm_set_cfg_cmd_value); + + return 0; +} + +static u16 pmu_min_lss_pci_req(u16 *ids, u16 pci_state) +{ + u16 existing_request; + int i; + + for (i = 0; ids[i]; ++i) { + struct mrst_device *mrst_dev; + + mrst_dev = pci_id_2_mrst_dev(ids[i]); + if (unlikely(!mrst_dev)) + continue; + + existing_request = mrst_dev->latest_request; + if (existing_request < pci_state) + pci_state = existing_request; + } + return pci_state; +} + +/** + * pmu_pci_set_power_state - Callback function is used by all the PCI devices + * for a platform specific device power on/shutdown. + */ + +int pmu_pci_set_power_state(struct pci_dev *pdev, pci_power_t pci_state) +{ + u32 old_sss, new_sss; + int status = 0; + struct mrst_device *mrst_dev; + + pmu_set_power_state_entry++; + + BUG_ON(pdev->vendor != PCI_VENDOR_ID_INTEL); + BUG_ON(pci_state < PCI_D0 || pci_state > PCI_D3cold); + + mrst_dev = pci_id_2_mrst_dev(pdev->device); + if (unlikely(!mrst_dev)) + return -ENODEV; + + mrst_dev->pci_state_counts[pci_state]++; /* count invocations */ + + /* PMU driver calls self as part of PCI initialization, ignore */ + if (pdev->device == PCI_DEV_ID_MRST_PMU) + return 0; + + BUG_ON(!pmu_reg); /* SW bug if called before initialized */ + + spin_lock(&mrst_pmu_power_state_lock); + + if (pdev->d3_delay) { + dev_dbg(&pdev->dev, "d3_delay %d, should be 0\n", + pdev->d3_delay); + pdev->d3_delay = 0; + } + /* + * If Lincroft graphics, simply remember state + */ + if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY + && !((pdev->class & PCI_SUB_CLASS_MASK) >> 8)) { + if (pci_state == PCI_D0) + graphics_is_off = 0; + else + graphics_is_off = 1; + goto ret; + } + + if (!mrst_dev->lss) + goto ret; /* device with no LSS */ + + if (mrst_dev->latest_request == pci_state) + goto ret; /* no change */ + + mrst_dev->latest_request = pci_state; /* record latest request */ + + /* + * LSS9 and LSS10 contain multiple PCI devices. + * Use the lowest numbered (highest power) state in the LSS + */ + if (mrst_dev->lss == 9) + pci_state = pmu_min_lss_pci_req(mrst_lss9_pci_ids, pci_state); + else if (mrst_dev->lss == 10) + pci_state = pmu_min_lss_pci_req(mrst_lss10_pci_ids, pci_state); + + status = pmu_wait_ready(); + if (status) + goto ret; + + old_sss = pmu_read_sss(); + new_sss = old_sss & ~SSMSK(3, mrst_dev->lss); + new_sss |= SSMSK(pci_2_mrst_state(mrst_dev->lss, pci_state), + mrst_dev->lss); + + if (new_sss == old_sss) + goto ret; /* nothing to do */ + + pmu_set_power_state_send_cmd++; + + status = pmu_issue_command(new_sss); + + if (unlikely(status != 0)) { + dev_err(&pdev->dev, "Failed to Issue a PM command\n"); + goto ret; + } + + if (pmu_wait_done()) + goto ret; + + lss_s0i3_enabled = + ((pmu_read_sss() & S0I3_SSS_TARGET) == S0I3_SSS_TARGET); +ret: + spin_unlock(&mrst_pmu_power_state_lock); + return status; +} + +#ifdef CONFIG_DEBUG_FS +static char *d0ix_names[] = {"D0", "D0i1", "D0i2", "D0i3"}; + +static inline const char *d0ix_name(int state) +{ + return d0ix_names[(int) state]; +} + +static int debug_mrst_pmu_show(struct seq_file *s, void *unused) +{ + struct pci_dev *pdev = NULL; + u32 cur_pmsss; + int lss; + + seq_printf(s, "0x%08X D0I1_ACG_SSS_TARGET\n", D0I1_ACG_SSS_TARGET); + + cur_pmsss = pmu_read_sss(); + + seq_printf(s, "0x%08X S0I3_SSS_TARGET\n", S0I3_SSS_TARGET); + + seq_printf(s, "0x%08X Current SSS ", cur_pmsss); + seq_printf(s, lss_s0i3_enabled ? "\n" : "[BLOCKS s0i3]\n"); + + if (cpumask_equal(cpu_online_mask, cpumask_of(0))) + seq_printf(s, "cpu0 is only cpu online\n"); + else + seq_printf(s, "cpu0 is NOT only cpu online [BLOCKS S0i3]\n"); + + seq_printf(s, "GFX: %s\n", graphics_is_off ? "" : "[BLOCKS s0i3]"); + + + for_each_pci_dev(pdev) { + int pos; + u16 pmcsr; + struct mrst_device *mrst_dev; + int i; + + mrst_dev = pci_id_2_mrst_dev(pdev->device); + + seq_printf(s, "%s %04x/%04X %-16.16s ", + dev_name(&pdev->dev), + pdev->vendor, pdev->device, + dev_driver_string(&pdev->dev)); + + if (unlikely (!mrst_dev)) { + seq_printf(s, " UNKNOWN\n"); + continue; + } + + if (mrst_dev->lss) + seq_printf(s, "LSS %2d %-4s ", mrst_dev->lss, + d0ix_name(((cur_pmsss >> + (mrst_dev->lss * 2)) & 0x3))); + else + seq_printf(s, " "); + + /* PCI PM config space setting */ + pos = pci_find_capability(pdev, PCI_CAP_ID_PM); + if (pos != 0) { + pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr); + seq_printf(s, "PCI-%-4s", + pci_power_name(pmcsr & PCI_PM_CTRL_STATE_MASK)); + } else { + seq_printf(s, " "); + } + + seq_printf(s, " %s ", pci_power_name(mrst_dev->latest_request)); + for (i = 0; i <= PCI_D3cold; ++i) + seq_printf(s, "%d ", mrst_dev->pci_state_counts[i]); + + if (mrst_dev->lss) { + unsigned int lssmask; + + lssmask = SSMSK(D0i3, mrst_dev->lss); + + if ((lssmask & S0I3_SSS_TARGET) && + ((lssmask & cur_pmsss) != + (lssmask & S0I3_SSS_TARGET))) + seq_printf(s , "[BLOCKS s0i3]"); + } + + seq_printf(s, "\n"); + } + seq_printf(s, "Wake Counters:\n"); + for (lss = 0; lss < MRST_NUM_LSS; ++lss) + seq_printf(s, "LSS%d %d\n", lss, wake_counters[lss]); + + seq_printf(s, "Interrupt Counters:\n"); + seq_printf(s, + "INT_SPURIOUS \t%8u\n" "INT_CMD_DONE \t%8u\n" + "INT_CMD_ERR \t%8u\n" "INT_WAKE_RX \t%8u\n" + "INT_SS_ERROR \t%8u\n" "INT_S0IX_MISS\t%8u\n" + "INT_NO_ACKC6 \t%8u\n" "INT_INVALID \t%8u\n", + pmu_irq_stats[INT_SPURIOUS], pmu_irq_stats[INT_CMD_DONE], + pmu_irq_stats[INT_CMD_ERR], pmu_irq_stats[INT_WAKE_RX], + pmu_irq_stats[INT_SS_ERROR], pmu_irq_stats[INT_S0IX_MISS], + pmu_irq_stats[INT_NO_ACKC6], pmu_irq_stats[INT_INVALID]); + + seq_printf(s, "mrst_pmu_wait_ready_calls %8d\n", + pmu_wait_ready_calls); + seq_printf(s, "mrst_pmu_wait_ready_udelays %8d\n", + pmu_wait_ready_udelays); + seq_printf(s, "mrst_pmu_wait_ready_udelays_max %8d\n", + pmu_wait_ready_udelays_max); + seq_printf(s, "mrst_pmu_wait_done_calls %8d\n", + pmu_wait_done_calls); + seq_printf(s, "mrst_pmu_wait_done_udelays %8d\n", + pmu_wait_done_udelays); + seq_printf(s, "mrst_pmu_wait_done_udelays_max %8d\n", + pmu_wait_done_udelays_max); + seq_printf(s, "mrst_pmu_set_power_state_entry %8d\n", + pmu_set_power_state_entry); + seq_printf(s, "mrst_pmu_set_power_state_send_cmd %8d\n", + pmu_set_power_state_send_cmd); + seq_printf(s, "SCU busy: %d\n", pmu_read_busy_status()); + + return 0; +} + +static int debug_mrst_pmu_open(struct inode *inode, struct file *file) +{ + return single_open(file, debug_mrst_pmu_show, NULL); +} + +static const struct file_operations devices_state_operations = { + .open = debug_mrst_pmu_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* DEBUG_FS */ + +/* + * Validate SCU PCI shim PCI vendor capability byte + * against LSS hard-coded in mrst_devs[] above. + * DEBUG only. + */ +static void pmu_scu_firmware_debug(void) +{ + struct pci_dev *pdev = NULL; + + for_each_pci_dev(pdev) { + struct mrst_device *mrst_dev; + u8 pci_config_lss; + int pos; + + mrst_dev = pci_id_2_mrst_dev(pdev->device); + if (unlikely(!mrst_dev)) { + printk(KERN_ERR FW_BUG "pmu: Unknown " + "PCI device 0x%04X\n", pdev->device); + continue; + } + + if (mrst_dev->lss == 0) + continue; /* no LSS in our table */ + + pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR); + if (!pos != 0) { + printk(KERN_ERR FW_BUG "pmu: 0x%04X " + "missing PCI Vendor Capability\n", + pdev->device); + continue; + } + pci_read_config_byte(pdev, pos + 4, &pci_config_lss); + if (!(pci_config_lss & PCI_VENDOR_CAP_LOG_SS_MASK)) { + printk(KERN_ERR FW_BUG "pmu: 0x%04X " + "invalid PCI Vendor Capability 0x%x " + " expected LSS 0x%X\n", + pdev->device, pci_config_lss, mrst_dev->lss); + continue; + } + pci_config_lss &= PCI_VENDOR_CAP_LOG_ID_MASK; + + if (mrst_dev->lss == pci_config_lss) + continue; + + printk(KERN_ERR FW_BUG "pmu: 0x%04X LSS = %d, expected %d\n", + pdev->device, pci_config_lss, mrst_dev->lss); + } +} + +/** + * pmu_probe + */ +static int __devinit pmu_probe(struct pci_dev *pdev, + const struct pci_device_id *pci_id) +{ + int ret; + struct mrst_pmu_reg *pmu; + + /* Init the device */ + ret = pci_enable_device(pdev); + if (ret) { + dev_err(&pdev->dev, "Unable to Enable PCI device\n"); + return ret; + } + + ret = pci_request_regions(pdev, MRST_PMU_DRV_NAME); + if (ret < 0) { + dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n"); + goto out_err1; + } + + /* Map the memory of PMU reg base */ + pmu = pci_iomap(pdev, 0, 0); + if (!pmu) { + dev_err(&pdev->dev, "Unable to map the PMU address space\n"); + ret = -ENOMEM; + goto out_err2; + } + +#ifdef CONFIG_DEBUG_FS + /* /sys/kernel/debug/mrst_pmu */ + (void) debugfs_create_file("mrst_pmu", S_IFREG | S_IRUGO, + NULL, NULL, &devices_state_operations); +#endif + pmu_reg = pmu; /* success */ + + if (request_irq(pdev->irq, pmu_irq, 0, MRST_PMU_DRV_NAME, NULL)) { + dev_err(&pdev->dev, "Registering isr has failed\n"); + ret = -1; + goto out_err3; + } + + pmu_scu_firmware_debug(); + + pmu_write_wkc(S0I3_WAKE_SOURCES); /* Enable S0i3 wakeup sources */ + + pmu_wait_ready(); + + pmu_write_ssc(D0I1_ACG_SSS_TARGET); /* Enable Auto-Clock_Gating */ + pmu_write_cmd(0x201); + + spin_lock_init(&mrst_pmu_power_state_lock); + + /* Enable the hardware interrupt */ + pmu_irq_enable(); + return 0; + +out_err3: + free_irq(pdev->irq, NULL); + pci_iounmap(pdev, pmu_reg); + pmu_reg = NULL; +out_err2: + pci_release_region(pdev, 0); +out_err1: + pci_disable_device(pdev); + return ret; +} + +static void __devexit pmu_remove(struct pci_dev *pdev) +{ + dev_err(&pdev->dev, "Mid PM pmu_remove called\n"); + + /* Freeing up the irq */ + free_irq(pdev->irq, NULL); + + pci_iounmap(pdev, pmu_reg); + pmu_reg = NULL; + + /* disable the current PCI device */ + pci_release_region(pdev, 0); + pci_disable_device(pdev); +} + +static DEFINE_PCI_DEVICE_TABLE(pmu_pci_ids) = { + { PCI_VDEVICE(INTEL, PCI_DEV_ID_MRST_PMU), 0 }, + { } +}; + +MODULE_DEVICE_TABLE(pci, pmu_pci_ids); + +static struct pci_driver driver = { + .name = MRST_PMU_DRV_NAME, + .id_table = pmu_pci_ids, + .probe = pmu_probe, + .remove = __devexit_p(pmu_remove), +}; + +/** + * pmu_pci_register - register the PMU driver as PCI device + */ +static int __init pmu_pci_register(void) +{ + return pci_register_driver(&driver); +} + +/* Register and probe via fs_initcall() to preceed device_initcall() */ +fs_initcall(pmu_pci_register); + +static void __exit mid_pci_cleanup(void) +{ + pci_unregister_driver(&driver); +} + +static int ia_major; +static int ia_minor; + +static int pmu_sfi_parse_oem(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + + sb = (struct sfi_table_simple *)table; + ia_major = (sb->pentry[1] >> 0) & 0xFFFF; + ia_minor = (sb->pentry[1] >> 16) & 0xFFFF; + printk(KERN_INFO "mrst_pmu: IA FW version v%x.%x\n", + ia_major, ia_minor); + + return 0; +} + +static int __init scu_fw_check(void) +{ + int ret; + u32 fw_version; + + if (!pmu_reg) + return 0; /* this driver didn't probe-out */ + + sfi_table_parse("OEMB", NULL, NULL, pmu_sfi_parse_oem); + + if (ia_major < 0x6005 || ia_minor < 0x1525) { + WARN(1, "mrst_pmu: IA FW version too old\n"); + return -1; + } + + ret = intel_scu_ipc_command(IPCMSG_FW_REVISION, 0, NULL, 0, + &fw_version, 1); + + if (ret) { + WARN(1, "mrst_pmu: IPC FW version? %d\n", ret); + } else { + int scu_major = (fw_version >> 8) & 0xFF; + int scu_minor = (fw_version >> 0) & 0xFF; + + printk(KERN_INFO "mrst_pmu: firmware v%x\n", fw_version); + + if ((scu_major >= 0xC0) && (scu_minor >= 0x49)) { + printk(KERN_INFO "mrst_pmu: enabling S0i3\n"); + mrst_pmu_s0i3_enable = true; + } else { + WARN(1, "mrst_pmu: S0i3 disabled, old firmware %X.%X", + scu_major, scu_minor); + } + } + return 0; +} +late_initcall(scu_fw_check); +module_exit(mid_pci_cleanup); diff --git a/arch/x86/platform/mrst/pmu.h b/arch/x86/platform/mrst/pmu.h new file mode 100644 index 0000000..bfbfe64 --- /dev/null +++ b/arch/x86/platform/mrst/pmu.h @@ -0,0 +1,234 @@ +/* + * mrst/pmu.h - private definitions for MRST Power Management Unit mrst/pmu.c + * + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _MRST_PMU_H_ +#define _MRST_PMU_H_ + +#define PCI_DEV_ID_MRST_PMU 0x0810 +#define MRST_PMU_DRV_NAME "mrst_pmu" +#define PCI_SUB_CLASS_MASK 0xFF00 + +#define PCI_VENDOR_CAP_LOG_ID_MASK 0x7F +#define PCI_VENDOR_CAP_LOG_SS_MASK 0x80 + +#define SUB_SYS_ALL_D0I1 0x01155555 +#define S0I3_WAKE_SOURCES 0x00001FFF + +#define PM_S0I3_COMMAND \ + ((0 << 31) | /* Reserved */ \ + (0 << 30) | /* Core must be idle */ \ + (0xc2 << 22) | /* ACK C6 trigger */ \ + (3 << 19) | /* Trigger on DMI message */ \ + (3 << 16) | /* Enter S0i3 */ \ + (0 << 13) | /* Numeric mode ID (sw) */ \ + (3 << 9) | /* Trigger mode */ \ + (0 << 8) | /* Do not interrupt */ \ + (1 << 0)) /* Set configuration */ + +#define LSS_DMI 0 +#define LSS_SD_HC0 1 +#define LSS_SD_HC1 2 +#define LSS_NAND 3 +#define LSS_IMAGING 4 +#define LSS_SECURITY 5 +#define LSS_DISPLAY 6 +#define LSS_USB_HC 7 +#define LSS_USB_OTG 8 +#define LSS_AUDIO 9 +#define LSS_AUDIO_LPE 9 +#define LSS_AUDIO_SSP 9 +#define LSS_I2C0 10 +#define LSS_I2C1 10 +#define LSS_I2C2 10 +#define LSS_KBD 10 +#define LSS_SPI0 10 +#define LSS_SPI1 10 +#define LSS_SPI2 10 +#define LSS_GPIO 10 +#define LSS_SRAM 11 /* used by SCU, do not touch */ +#define LSS_SD_HC2 12 +/* LSS hardware bits 15,14,13 are hardwired to 0, thus unusable */ +#define MRST_NUM_LSS 13 + +#define MIN(a, b) (((a) < (b)) ? (a) : (b)) + +#define SSMSK(mask, lss) ((mask) << ((lss) * 2)) +#define D0 0 +#define D0i1 1 +#define D0i2 2 +#define D0i3 3 + +#define S0I3_SSS_TARGET ( \ + SSMSK(D0i1, LSS_DMI) | \ + SSMSK(D0i3, LSS_SD_HC0) | \ + SSMSK(D0i3, LSS_SD_HC1) | \ + SSMSK(D0i3, LSS_NAND) | \ + SSMSK(D0i3, LSS_SD_HC2) | \ + SSMSK(D0i3, LSS_IMAGING) | \ + SSMSK(D0i3, LSS_SECURITY) | \ + SSMSK(D0i3, LSS_DISPLAY) | \ + SSMSK(D0i3, LSS_USB_HC) | \ + SSMSK(D0i3, LSS_USB_OTG) | \ + SSMSK(D0i3, LSS_AUDIO) | \ + SSMSK(D0i1, LSS_I2C0)) + +/* + * D0i1 on Langwell is Autonomous Clock Gating (ACG). + * Enable ACG on every LSS except camera and audio + */ +#define D0I1_ACG_SSS_TARGET \ + (SUB_SYS_ALL_D0I1 & ~SSMSK(D0i1, LSS_IMAGING) & ~SSMSK(D0i1, LSS_AUDIO)) + +enum cm_mode { + CM_NOP, /* ignore the config mode value */ + CM_IMMEDIATE, + CM_DELAY, + CM_TRIGGER, + CM_INVALID +}; + +enum sys_state { + SYS_STATE_S0I0, + SYS_STATE_S0I1, + SYS_STATE_S0I2, + SYS_STATE_S0I3, + SYS_STATE_S3, + SYS_STATE_S5 +}; + +#define SET_CFG_CMD 1 + +enum int_status { + INT_SPURIOUS = 0, + INT_CMD_DONE = 1, + INT_CMD_ERR = 2, + INT_WAKE_RX = 3, + INT_SS_ERROR = 4, + INT_S0IX_MISS = 5, + INT_NO_ACKC6 = 6, + INT_INVALID = 7, +}; + +/* PMU register interface */ +static struct mrst_pmu_reg { + u32 pm_sts; /* 0x00 */ + u32 pm_cmd; /* 0x04 */ + u32 pm_ics; /* 0x08 */ + u32 _resv1; /* 0x0C */ + u32 pm_wkc[2]; /* 0x10 */ + u32 pm_wks[2]; /* 0x18 */ + u32 pm_ssc[4]; /* 0x20 */ + u32 pm_sss[4]; /* 0x30 */ + u32 pm_wssc[4]; /* 0x40 */ + u32 pm_c3c4; /* 0x50 */ + u32 pm_c5c6; /* 0x54 */ + u32 pm_msi_disable; /* 0x58 */ +} *pmu_reg; + +static inline u32 pmu_read_sts(void) { return readl(&pmu_reg->pm_sts); } +static inline u32 pmu_read_ics(void) { return readl(&pmu_reg->pm_ics); } +static inline u32 pmu_read_wks(void) { return readl(&pmu_reg->pm_wks[0]); } +static inline u32 pmu_read_sss(void) { return readl(&pmu_reg->pm_sss[0]); } + +static inline void pmu_write_cmd(u32 arg) { writel(arg, &pmu_reg->pm_cmd); } +static inline void pmu_write_ics(u32 arg) { writel(arg, &pmu_reg->pm_ics); } +static inline void pmu_write_wkc(u32 arg) { writel(arg, &pmu_reg->pm_wkc[0]); } +static inline void pmu_write_ssc(u32 arg) { writel(arg, &pmu_reg->pm_ssc[0]); } +static inline void pmu_write_wssc(u32 arg) + { writel(arg, &pmu_reg->pm_wssc[0]); } + +static inline void pmu_msi_enable(void) { writel(0, &pmu_reg->pm_msi_disable); } +static inline u32 pmu_msi_is_disabled(void) + { return readl(&pmu_reg->pm_msi_disable); } + +union pmu_pm_ics { + struct { + u32 cause:8; + u32 enable:1; + u32 pending:1; + u32 reserved:22; + } bits; + u32 value; +}; + +static inline void pmu_irq_enable(void) +{ + union pmu_pm_ics pmu_ics; + + pmu_ics.value = pmu_read_ics(); + pmu_ics.bits.enable = 1; + pmu_write_ics(pmu_ics.value); +} + +union pmu_pm_status { + struct { + u32 pmu_rev:8; + u32 pmu_busy:1; + u32 mode_id:4; + u32 Reserved:19; + } pmu_status_parts; + u32 pmu_status_value; +}; + +static inline int pmu_read_busy_status(void) +{ + union pmu_pm_status result; + + result.pmu_status_value = pmu_read_sts(); + + return result.pmu_status_parts.pmu_busy; +} + +/* pmu set config parameters */ +struct cfg_delay_param_t { + u32 cmd:8; + u32 ioc:1; + u32 cfg_mode:4; + u32 mode_id:3; + u32 sys_state:3; + u32 cfg_delay:8; + u32 rsvd:5; +}; + +struct cfg_trig_param_t { + u32 cmd:8; + u32 ioc:1; + u32 cfg_mode:4; + u32 mode_id:3; + u32 sys_state:3; + u32 cfg_trig_type:3; + u32 cfg_trig_val:8; + u32 cmbi:1; + u32 rsvd1:1; +}; + +union pmu_pm_set_cfg_cmd_t { + union { + struct cfg_delay_param_t d_param; + struct cfg_trig_param_t t_param; + } pmu2_params; + u32 pmu_pm_set_cfg_cmd_value; +}; + +#ifdef FUTURE_PATCH +extern int mrst_s0i3_entry(u32 regval, u32 *regaddr); +#else +static inline int mrst_s0i3_entry(u32 regval, u32 *regaddr) { return -1; } +#endif +#endif -- cgit v1.1 From d91ee5863b71e8c90eaf6035bff3078a85e2e7b5 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 1 Apr 2011 18:28:35 -0400 Subject: cpuidle: replace xen access to x86 pm_idle and default_idle When a Xen Dom0 kernel boots on a hypervisor, it gets access to the raw-hardware ACPI tables. While it parses the idle tables for the hypervisor's beneift, it uses HLT for its own idle. Rather than have xen scribble on pm_idle and access default_idle, have it simply disable_cpuidle() so acpi_idle will not load and architecture default HLT will be used. cc: xen-devel@lists.xensource.com Tested-by: Konrad Rzeszutek Wilk Acked-by: H. Peter Anvin Signed-off-by: Len Brown --- arch/x86/xen/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 60aeeb5..a9627e2 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -426,7 +427,7 @@ void __init xen_arch_setup(void) #ifdef CONFIG_X86_32 boot_cpu_data.hlt_works_ok = 1; #endif - pm_idle = default_idle; + disable_cpuidle(); boot_option_idle_override = IDLE_HALT; fiddle_vdso(); -- cgit v1.1 From 4bfc8288bc4a64529c5547d17349a2a1f4675507 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 30 Mar 2011 23:52:29 -0400 Subject: x86 idle: move mwait_idle_with_hints() to where it is used ...and make it static no functional change cc: x86@kernel.org Acked-by: H. Peter Anvin Signed-off-by: Len Brown --- arch/x86/include/asm/processor.h | 2 -- arch/x86/kernel/acpi/cstate.c | 23 +++++++++++++++++++++++ arch/x86/kernel/process.c | 23 ----------------------- 3 files changed, 23 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2193715..0d1171c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -751,8 +751,6 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx) :: "a" (eax), "c" (ecx)); } -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void init_amd_e400_c1e_mask(void); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 5812404..f50e7fb 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -149,6 +149,29 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. + */ +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) +{ + if (!need_resched()) { + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)¤t_thread_info()->flags); + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __mwait(ax, cx); + } +} + void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e1ba8cb2..e7e3b01 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -438,29 +438,6 @@ void cpu_idle_wait(void) } EXPORT_SYMBOL_GPL(cpu_idle_wait); -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - * - * New with Core Duo processors, MWAIT can take some hints based on CPU - * capability. - */ -void mwait_idle_with_hints(unsigned long ax, unsigned long cx) -{ - if (!need_resched()) { - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) - clflush((void *)¤t_thread_info()->flags); - - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __mwait(ax, cx); - } -} - /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { -- cgit v1.1 From a0bfa1373859e9d11dc92561a8667588803e42d8 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 1 Apr 2011 19:34:59 -0400 Subject: cpuidle: stop depending on pm_idle cpuidle users should call cpuidle_call_idle() directly rather than via (pm_idle)() function pointer. Architecture may choose to continue using (pm_idle)(), but cpuidle need not depend on it: my_arch_cpu_idle() ... if(cpuidle_call_idle()) pm_idle(); cc: Kevin Hilman cc: Paul Mundt cc: x86@kernel.org Acked-by: H. Peter Anvin Signed-off-by: Len Brown --- arch/x86/kernel/process_32.c | 4 +++- arch/x86/kernel/process_64.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a3d0dc5..7a3b651 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -109,7 +110,8 @@ void cpu_idle(void) local_irq_disable(); /* Don't trace irqs off for idle */ stop_critical_timings(); - pm_idle(); + if (cpuidle_idle_call()) + pm_idle(); start_critical_timings(); } tick_nohz_restart_sched_tick(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ca6f7ab..f693e44 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -136,7 +137,8 @@ void cpu_idle(void) enter_idle(); /* Don't trace irqs off for idle */ stop_critical_timings(); - pm_idle(); + if (cpuidle_idle_call()) + pm_idle(); start_critical_timings(); /* In many cases the interrupt that ended idle -- cgit v1.1 From 33f35f2a4ee3abfc0f87990058aa1b6b5092f725 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 3 Aug 2011 22:00:38 -1000 Subject: x86: don't include xen/xen.h in unless XEN is enabled Dmitry Kasatkin reports: "kernel-devel package with kernel headers have no directory if XEN is disabled. Modules which inclide asm/io.h won't compile. XEN related content is behind the CONFIG_XEN flag in the io.h. And should be also behind CONFIG_XEN flag." So move the include of down into the section that is conditional on CONFIG_XEN. Reported-by: Dmitry Kasatkin Signed-off-by: Linus Torvalds --- arch/x86/include/asm/io.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d02804d..d8e8eef 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -40,8 +40,6 @@ #include #include -#include - #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ @@ -334,6 +332,7 @@ extern void fixup_early_ioremap(void); extern bool is_early_ioremap_ptep(pte_t *ptep); #ifdef CONFIG_XEN +#include struct bio_vec; extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, -- cgit v1.1 From 1e9ea2656b656edd3c8de98675bbc0340211b5bd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 3 Aug 2011 09:43:44 -0700 Subject: xen/tracing: it looks like we wanted CONFIG_FTRACE Apparently we wanted CONFIG_FTRACE rather the CONFIG_FUNCTION_TRACER. Reported-by: Sander Eikelenboom Tested-by: Sander Eikelenboom Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 45e94ac..3326204 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -15,7 +15,7 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ grant-table.o suspend.o platform-pci-unplug.o \ p2m.o -obj-$(CONFIG_FUNCTION_TRACER) += trace.o +obj-$(CONFIG_FTRACE) += trace.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o -- cgit v1.1 From 8f3c5883d840d079a5d2db20893b5ac8ba453b40 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 2 Aug 2011 11:45:24 +0200 Subject: xen: Fix printk() format in xen/setup.c Use correct format specifier for unsigned long. Signed-off-by: Igor Mammedov Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 60aeeb5..e405632 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -113,7 +113,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr, len++; } } - printk(KERN_CONT "%ld pages freed\n", len); + printk(KERN_CONT "%lu pages freed\n", len); return len; } @@ -139,7 +139,7 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, if (last_end < max_addr) released += xen_release_chunk(last_end, max_addr); - printk(KERN_INFO "released %ld pages of unused memory\n", released); + printk(KERN_INFO "released %lu pages of unused memory\n", released); return released; } -- cgit v1.1 From 98f531da8479eec3d828adc7f0865baaab99a543 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 2 Aug 2011 11:45:25 +0200 Subject: xen: Fix misleading WARN message at xen_release_chunk WARN message should not complain "Failed to release memory %lx-%lx err=%d\n" ^^^^^^^ about range when it fails to release just one page, instead it should say what pfn is not freed. In addition line: printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: " ... printk(KERN_CONT "%lu pages freed\n", len); will be broken if WARN in between this line is fired. So fix it by using a single printk for this. Signed-off-by: Igor Mammedov Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e405632..02ffd9e 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -92,8 +92,6 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr, if (end <= start) return 0; - printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", - start, end); for(pfn = start; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -106,14 +104,14 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr, ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); - WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", - start, end, ret); + WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); if (ret == 1) { __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); len++; } } - printk(KERN_CONT "%lu pages freed\n", len); + printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", + start, end, len); return len; } -- cgit v1.1 From 1bdfac19b3ecfca545281c15c7aea7ebc2eaef31 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:49 -0400 Subject: x86-64: Pad vDSO to a page boundary This avoids an information leak to userspace. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/a63380a3c58a0506a2f5a18ba1b12dbde1f25e58.1312378163.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S index 1b979c1..01f5e3b 100644 --- a/arch/x86/vdso/vdso.S +++ b/arch/x86/vdso/vdso.S @@ -9,6 +9,7 @@ __PAGE_ALIGNED_DATA vdso_start: .incbin "arch/x86/vdso/vdso.so" vdso_end: + .align PAGE_SIZE /* extra data here leaks to userspace. */ .previous -- cgit v1.1 From 9c40818da5b39fca236029059ab839857b1ef56c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:50 -0400 Subject: x86-64: Move the "user" vsyscall segment out of the data segment. The kernel's loader doesn't seem to care, but gold complains. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/f0716870c297242a841b949953d80c0d87bf3d3f.1312378163.git.luto@mit.edu Reported-by: Arkadiusz Miskiewicz Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4aa9c54..e79fb39 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -154,6 +154,24 @@ SECTIONS #ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + __vvar_page = .; + + .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = offset; \ + *(.vvar_ ## name) +#define __VVAR_KERNEL_LDS +#include +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + } :data + + . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); + #define VSYSCALL_ADDR (-10*1024*1024) #define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) @@ -162,7 +180,6 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - . = ALIGN(4096); __vsyscall_0 = .; . = VSYSCALL_ADDR; @@ -185,23 +202,6 @@ SECTIONS #undef VVIRT_OFFSET #undef VVIRT - __vvar_page = .; - - .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ - . = offset; \ - *(.vvar_ ## name) -#define __VVAR_KERNEL_LDS -#include -#undef __VVAR_KERNEL_LDS -#undef EMIT_VVAR - - } :data - - . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); - #endif /* CONFIG_X86_64 */ /* Init code and data - will be freed after init */ -- cgit v1.1 From f670bb760e7d32ec9c690e748a1d5d04921363ab Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:51 -0400 Subject: x86-64: Work around gold bug 13023 Gold has trouble assigning numbers to the location counter inside of an output section description. The bug was triggered by 9fd67b4ed0714ab718f1f9bd14c344af336a6df7, which consolidated all of the vsyscall sections into a single section. The workaround is IMO still nicer than the old way of doing it. This produces an apparently valid kernel image and passes my vdso tests on both GNU ld version 2.21.51.0.6-2.fc15 20110118 and GNU gold (version 2.21.51.0.6-2.fc15 20110118) 1.10 as distributed by Fedora 15. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/0b260cb806f1f9a25c00ce8377a5f035d57f557a.1312378163.git.luto@mit.edu Reported-by: Arkadiusz Miskiewicz Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e79fb39..8f3a265 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -158,10 +158,12 @@ SECTIONS __vvar_page = .; .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + /* work around gold bug 13023 */ + __vvar_beginning_hack = .; - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ - . = offset; \ + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = __vvar_beginning_hack + offset; \ *(.vvar_ ## name) #define __VVAR_KERNEL_LDS #include @@ -184,15 +186,17 @@ SECTIONS . = VSYSCALL_ADDR; .vsyscall : AT(VLOAD(.vsyscall)) { + /* work around gold bug 13023 */ + __vsyscall_beginning_hack = .; *(.vsyscall_0) - . = 1024; + . = __vsyscall_beginning_hack + 1024; *(.vsyscall_1) - . = 2048; + . = __vsyscall_beginning_hack + 2048; *(.vsyscall_2) - . = 4096; /* Pad the whole page. */ + . = __vsyscall_beginning_hack + 4096; /* Pad the whole page. */ } :user =0xcc . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); -- cgit v1.1 From 5d5791af4c0d4fd32093882357506355c3357503 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:52 -0400 Subject: x86-64, xen: Enable the vvar mapping Xen needs to handle VVAR_PAGE, introduced in git commit: 9fd67b4ed0714ab718f1f9bd14c344af336a6df7 x86-64: Give vvars their own page Otherwise we die during bootup with a message like: (XEN) mm.c:940:d10 Error getting mfn 1888 (pfn 1e3e48) from L1 entry 8000000001888465 for l1e_owner=10, pg_owner=10 (XEN) mm.c:5049:d10 ptwr_emulate: could not get_page_from_l1e() [ 0.000000] BUG: unable to handle kernel NULL pointer dereference at (null) [ 0.000000] IP: [] xen_set_pte+0x20/0xe0 Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/4659478ed2f3480938f96491c2ecbe2b2e113a23.1312378163.git.luto@mit.edu Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/xen/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0ccccb6..2e78619 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1829,6 +1829,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) # endif #else case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: + case VVAR_PAGE: #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: @@ -1869,7 +1870,8 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #ifdef CONFIG_X86_64 /* Replicate changes to map the vsyscall page into the user pagetable vsyscall mapping. */ - if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { + if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) || + idx == VVAR_PAGE) { unsigned long vaddr = __fix_to_virt(idx); set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); } -- cgit v1.1 From 318f5a2a672152328c9fb4dead504b89ec738a43 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:53 -0400 Subject: x86-64: Add user_64bit_mode paravirt op Three places in the kernel assume that the only long mode CPL 3 selector is __USER_CS. This is not true on Xen -- Xen's sysretq changes cs to the magic value 0xe033. Two of the places are corner cases, but as of "x86-64: Improve vsyscall emulation CS and RIP handling" (c9712944b2a12373cb6ff8059afcfb7e826a6c54), vsyscalls will segfault if called with Xen's extra CS selector. This causes a panic when older init builds die. It seems impossible to make Xen use __USER_CS reliably without taking a performance hit on every system call, so this fixes the tests instead with a new paravirt op. It's a little ugly because ptrace.h can't include paravirt.h. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/f4fcb3947340d9e96ce1054a432f183f9da9db83.1312378163.git.luto@mit.edu Reported-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/desc.h | 4 ++-- arch/x86/include/asm/paravirt_types.h | 6 ++++++ arch/x86/include/asm/ptrace.h | 19 +++++++++++++++++++ arch/x86/kernel/paravirt.c | 4 ++++ arch/x86/kernel/step.c | 2 +- arch/x86/kernel/vsyscall_64.c | 6 +----- arch/x86/mm/fault.c | 2 +- arch/x86/xen/enlighten.c | 4 ++++ 8 files changed, 38 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 7b439d9..41935fa 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -27,8 +27,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->base2 = (info->base_addr & 0xff000000) >> 24; /* - * Don't allow setting of the lm bit. It is useless anyway - * because 64bit system calls require __USER_CS: + * Don't allow setting of the lm bit. It would confuse + * user_64bit_mode and would get overridden by sysret anyway. */ desc->l = 0; } diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8288509..96a0f80 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -41,6 +41,7 @@ #include #include +#include struct page; struct thread_struct; @@ -63,6 +64,11 @@ struct paravirt_callee_save { struct pv_info { unsigned int kernel_rpl; int shared_kernel_pmd; + +#ifdef CONFIG_X86_64 + u16 extra_user_64bit_cs; /* __USER_CS if none */ +#endif + int paravirt_enabled; const char *name; }; diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 94e7618..3566454 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -131,6 +131,9 @@ struct pt_regs { #ifdef __KERNEL__ #include +#ifdef CONFIG_PARAVIRT +#include +#endif struct cpuinfo_x86; struct task_struct; @@ -187,6 +190,22 @@ static inline int v8086_mode(struct pt_regs *regs) #endif } +#ifdef CONFIG_X86_64 +static inline bool user_64bit_mode(struct pt_regs *regs) +{ +#ifndef CONFIG_PARAVIRT + /* + * On non-paravirt systems, this is the only long mode CPL 3 + * selector. We do not allow long mode selectors in the LDT. + */ + return regs->cs == __USER_CS; +#else + /* Headers are too twisted for this to go in paravirt.h. */ + return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; +#endif +} +#endif + /* * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode * when it traps. The previous stack will be directly underneath the saved diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 869e1ae..681f159 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -299,6 +299,10 @@ struct pv_info pv_info = { .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ + +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = __USER_CS, +#endif }; struct pv_init_ops pv_init_ops = { diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 7977f0c..c346d11 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -74,7 +74,7 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) #ifdef CONFIG_X86_64 case 0x40 ... 0x4f: - if (regs->cs != __USER_CS) + if (!user_64bit_mode(regs)) /* 32-bit mode: register increment */ return 0; /* 64-bit mode: REX prefix */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dda7dff..1725930 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -127,11 +127,7 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) local_irq_enable(); - /* - * Real 64-bit user mode code has cs == __USER_CS. Anything else - * is bogus. - */ - if (regs->cs != __USER_CS) { + if (!user_64bit_mode(regs)) { /* * If we trapped from kernel mode, we might as well OOPS now * instead of returning to some random address and OOPSing diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2dbf6bf..c1d0182 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -105,7 +105,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, * but for now it's good enough to assume that long * mode only uses well known segments or kernel. */ - return (!user_mode(regs)) || (regs->cs == __USER_CS); + return (!user_mode(regs) || user_64bit_mode(regs)); #endif case 0x60: /* 0x64 thru 0x67 are valid prefixes in all modes. */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5525163..78fe33d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -937,6 +937,10 @@ static const struct pv_info xen_info __initconst = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = FLAT_USER_CS64, +#endif + .name = "Xen", }; -- cgit v1.1 From c149a665ac488e0dac22a42287f45ad1bda06ff1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Aug 2011 09:31:54 -0400 Subject: x86-64: Add vsyscall:emulate_vsyscall trace event Vsyscall emulation is slow, so make it easy to track down. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/cdaad7da946a80b200df16647c1700db3e1171e9.1312378163.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vsyscall_64.c | 6 ++++++ arch/x86/kernel/vsyscall_trace.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 arch/x86/kernel/vsyscall_trace.h (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1725930..93a0d46 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -50,6 +50,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include "vsyscall_trace.h" + DEFINE_VVAR(int, vgetcpu_mode); DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = { @@ -146,6 +149,9 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) * and int 0xcc is two bytes long. */ vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); + + trace_emulate_vsyscall(vsyscall_nr); + if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, "illegal int 0xcc (exploit attempt?)"); diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h new file mode 100644 index 0000000..a8b2ede --- /dev/null +++ b/arch/x86/kernel/vsyscall_trace.h @@ -0,0 +1,29 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsyscall + +#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __VSYSCALL_TRACE_H + +#include + +TRACE_EVENT(emulate_vsyscall, + + TP_PROTO(int nr), + + TP_ARGS(nr), + + TP_STRUCT__entry(__field(int, nr)), + + TP_fast_assign( + __entry->nr = nr; + ), + + TP_printk("nr = %d", __entry->nr) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/x86/kernel +#define TRACE_INCLUDE_FILE vsyscall_trace +#include -- cgit v1.1 From c00c8aa2d976e9ed1d12a57b42d6e9b27efb7abe Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 4 Aug 2011 18:42:10 -0400 Subject: xen/trace: Fix compile error when CONFIG_XEN_PRIVILEGED_GUEST is not set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit with CONFIG_XEN and CONFIG_FTRACE set we get this: arch/x86/xen/trace.c:22: error: ‘__HYPERVISOR_console_io’ undeclared here (not in a function) arch/x86/xen/trace.c:22: error: array index in initializer not of integer type arch/x86/xen/trace.c:22: error: (near initialization for ‘xen_hypercall_names’) arch/x86/xen/trace.c:23: error: ‘__HYPERVISOR_physdev_op_compat’ undeclared here (not in a function) Issue was that the definitions of __HYPERVISOR were not pulled if CONFIG_XEN_PRIVILEGED_GUEST was not set. Reported-by: Randy Dunlap Acked-by: Randy Dunlap Acked-by: Ingo Molnar Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c index 734beba..520022d 100644 --- a/arch/x86/xen/trace.c +++ b/arch/x86/xen/trace.c @@ -1,4 +1,5 @@ #include +#include #define N(x) [__HYPERVISOR_##x] = "("#x")" static const char *xen_hypercall_names[] = { -- cgit v1.1 From dfb09f9b7ab03fd367740e541a5caf830ed56726 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 5 Aug 2011 15:15:08 +0200 Subject: x86, amd: Avoid cache aliasing penalties on AMD family 15h This patch provides performance tuning for the "Bulldozer" CPU. With its shared instruction cache there is a chance of generating an excessive number of cache cross-invalidates when running specific workloads on the cores of a compute module. This excessive amount of cross-invalidations can be observed if cache lines backed by shared physical memory alias in bits [14:12] of their virtual addresses, as those bits are used for the index generation. This patch addresses the issue by clearing all the bits in the [14:12] slice of the file mapping's virtual address at generation time, thus forcing those bits the same for all mappings of a single shared library across processes and, in doing so, avoids instruction cache aliases. It also adds the command line option "align_va_addr=(32|64|on|off)" with which virtual address alignment can be enabled for 32-bit or 64-bit x86 individually, or both, or be completely disabled. This change leaves virtual region address allocation on other families and/or vendors unaffected. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1312550110-24160-2-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/elf.h | 31 +++++++++++++++++ arch/x86/kernel/cpu/amd.c | 13 +++++++ arch/x86/kernel/sys_x86_64.c | 81 ++++++++++++++++++++++++++++++++++++++++++-- arch/x86/mm/mmap.c | 15 -------- arch/x86/vdso/vma.c | 9 +++++ 5 files changed, 131 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index f2ad216..5f962df 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -4,6 +4,7 @@ /* * ELF register definitions.. */ +#include #include #include @@ -320,4 +321,34 @@ extern int syscall32_setup_pages(struct linux_binprm *, int exstack); extern unsigned long arch_randomize_brk(struct mm_struct *mm); #define arch_randomize_brk arch_randomize_brk +/* + * True on X86_32 or when emulating IA32 on X86_64 + */ +static inline int mmap_is_ia32(void) +{ +#ifdef CONFIG_X86_32 + return 1; +#endif +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + return 1; +#endif + return 0; +} + +/* The first two values are special, do not change. See align_addr() */ +enum align_flags { + ALIGN_VA_32 = BIT(0), + ALIGN_VA_64 = BIT(1), + ALIGN_VDSO = BIT(2), + ALIGN_TOPDOWN = BIT(3), +}; + +struct va_alignment { + int flags; + unsigned long mask; +} ____cacheline_aligned; + +extern struct va_alignment va_align; +extern unsigned long align_addr(unsigned long, struct file *, enum align_flags); #endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b13ed39..b0234bc 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -458,6 +458,19 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) "with P0 frequency!\n"); } } + + if (c->x86 == 0x15) { + unsigned long upperbit; + u32 cpuid, assoc; + + cpuid = cpuid_edx(0x80000005); + assoc = cpuid >> 16 & 0xff; + upperbit = ((cpuid >> 24) << 10) / assoc; + + va_align.mask = (upperbit - 1) & PAGE_MASK; + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + + } } static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index ff14a50..aaa8d09 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -18,6 +18,72 @@ #include #include +struct __read_mostly va_alignment va_align = { + .flags = -1, +}; + +/* + * Align a virtual address to avoid aliasing in the I$ on AMD F15h. + * + * @flags denotes the allocation direction - bottomup or topdown - + * or vDSO; see call sites below. + */ +unsigned long align_addr(unsigned long addr, struct file *filp, + enum align_flags flags) +{ + unsigned long tmp_addr; + + /* handle 32- and 64-bit case with a single conditional */ + if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) + return addr; + + if (!(current->flags & PF_RANDOMIZE)) + return addr; + + if (!((flags & ALIGN_VDSO) || filp)) + return addr; + + tmp_addr = addr; + + /* + * We need an address which is <= than the original + * one only when in topdown direction. + */ + if (!(flags & ALIGN_TOPDOWN)) + tmp_addr += va_align.mask; + + tmp_addr &= ~va_align.mask; + + return tmp_addr; +} + +static int __init control_va_addr_alignment(char *str) +{ + /* guard against enabling this on other CPU families */ + if (va_align.flags < 0) + return 1; + + if (*str == 0) + return 1; + + if (*str == '=') + str++; + + if (!strcmp(str, "32")) + va_align.flags = ALIGN_VA_32; + else if (!strcmp(str, "64")) + va_align.flags = ALIGN_VA_64; + else if (!strcmp(str, "off")) + va_align.flags = 0; + else if (!strcmp(str, "on")) + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + else + return 0; + + return 1; +} +__setup("align_va_addr", control_va_addr_alignment); + SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) @@ -92,6 +158,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, start_addr = addr; full_search: + + addr = align_addr(addr, filp, 0); + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ if (end - len < addr) { @@ -117,6 +186,7 @@ full_search: mm->cached_hole_size = vma->vm_start - addr; addr = vma->vm_end; + addr = align_addr(addr, filp, 0); } } @@ -161,10 +231,13 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, /* make sure it can fit in the remaining address space */ if (addr > len) { - vma = find_vma(mm, addr-len); - if (!vma || addr <= vma->vm_start) + unsigned long tmp_addr = align_addr(addr - len, filp, + ALIGN_TOPDOWN); + + vma = find_vma(mm, tmp_addr); + if (!vma || tmp_addr + len <= vma->vm_start) /* remember the address as a hint for next time */ - return mm->free_area_cache = addr-len; + return mm->free_area_cache = tmp_addr; } if (mm->mmap_base < len) @@ -173,6 +246,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, addr = mm->mmap_base-len; do { + addr = align_addr(addr, filp, ALIGN_TOPDOWN); + /* * Lookup failure means no vma is above this address, * else if new region fits below vma->vm_start, diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 1dab519..d4c0736 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -51,21 +51,6 @@ static unsigned int stack_maxrandom_size(void) #define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) #define MAX_GAP (TASK_SIZE/6*5) -/* - * True on X86_32 or when emulating IA32 on X86_64 - */ -static int mmap_is_ia32(void) -{ -#ifdef CONFIG_X86_32 - return 1; -#endif -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) - return 1; -#endif - return 0; -} - static int mmap_is_legacy(void) { if (current->personality & ADDR_COMPAT_LAYOUT) diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 7abd2be..caa42ce 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -69,6 +69,15 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) addr = start + (offset << PAGE_SHIFT); if (addr >= end) addr = end; + + /* + * page-align it here so that get_unmapped_area doesn't + * align it wrongfully again to the next page. addr can come in 4K + * unaligned here as a result of stack start randomization. + */ + addr = PAGE_ALIGN(addr); + addr = align_addr(addr, NULL, ALIGN_VDSO); + return addr; } -- cgit v1.1 From a110b5ec7371592eac856ac5c22dc7b518952d44 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 5 Aug 2011 20:01:16 +0200 Subject: x86: Add a BSP cpu_dev helper Add a function ptr to struct cpu_dev which is destined to be run only once on the BSP during boot. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20110805180116.GB26217@aftab Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 3 +++ arch/x86/kernel/cpu/cpu.h | 1 + 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 22a073d..8ed394a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -681,6 +681,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) filter_cpuid_features(c, false); setup_smep(c); + + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); } void __init early_cpu_init(void) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index e765633..1b22dcc 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -18,6 +18,7 @@ struct cpu_dev { struct cpu_model_info c_models[4]; void (*c_early_init)(struct cpuinfo_x86 *); + void (*c_bsp_init)(struct cpuinfo_x86 *); void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); -- cgit v1.1 From 8fa8b035085e7320c15875c1f6b03b290ca2dd66 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 5 Aug 2011 20:04:09 +0200 Subject: x86, amd: Move BSP code to cpu_dev helper Move code which is run once on the BSP during boot into the cpu_dev helper. [ hpa: removed bogus cpu_has -> static_cpu_has conversion ] Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20110805180409.GC26217@aftab Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 59 +++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b0234bc..b6e3e87 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -410,6 +410,34 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) #endif } +static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + printk(KERN_WARNING FW_BUG "TSC doesn't count " + "with P0 frequency!\n"); + } + } + + if (c->x86 == 0x15) { + unsigned long upperbit; + u32 cpuid, assoc; + + cpuid = cpuid_edx(0x80000005); + assoc = cpuid >> 16 & 0xff; + upperbit = ((cpuid >> 24) << 10) / assoc; + + va_align.mask = (upperbit - 1) & PAGE_MASK; + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + } +} + static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); @@ -441,36 +469,6 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_EXTD_APICID); } #endif - - /* We need to do the following only once */ - if (c != &boot_cpu_data) - return; - - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { - - if (c->x86 > 0x10 || - (c->x86 == 0x10 && c->x86_model >= 0x2)) { - u64 val; - - rdmsrl(MSR_K7_HWCR, val); - if (!(val & BIT(24))) - printk(KERN_WARNING FW_BUG "TSC doesn't count " - "with P0 frequency!\n"); - } - } - - if (c->x86 == 0x15) { - unsigned long upperbit; - u32 cpuid, assoc; - - cpuid = cpuid_edx(0x80000005); - assoc = cpuid >> 16 & 0xff; - upperbit = ((cpuid >> 24) << 10) / assoc; - - va_align.mask = (upperbit - 1) & PAGE_MASK; - va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; - - } } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -692,6 +690,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { .c_size_cache = amd_size_cache, #endif .c_early_init = early_init_amd, + .c_bsp_init = bsp_init_amd, .c_init = init_amd, .c_x86_vendor = X86_VENDOR_AMD, }; -- cgit v1.1 From a3ea14df0e383f44dcb2e61badb71180dbffe526 Mon Sep 17 00:00:00 2001 From: Paul Fox Date: Tue, 26 Jul 2011 16:42:26 +0100 Subject: x86, olpc: Wait for last byte of EC command to be accepted When executing EC commands, only waiting when there are still more bytes to write is usually fine. However, if the system suspends very quickly after a call to olpc_ec_cmd(), the last data byte may not yet be transferred to the EC, and the command will not complete. This solves a bug where the SCI wakeup mask was not correctly written when going into suspend. It means that sometimes, on XO-1.5 (but not XO-1), the devices that were marked as wakeup sources can't wake up the system. e.g. you ask for wifi wakeups, suspend, but then incoming wifi frames don't wake up the system as they should. Signed-off-by: Paul Fox Signed-off-by: Daniel Drake Acked-by: Andres Salomon Cc: Signed-off-by: Ingo Molnar --- arch/x86/platform/olpc/olpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index 8b9940e..7cce722 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -161,13 +161,13 @@ restart: if (inbuf && inlen) { /* write data to EC */ for (i = 0; i < inlen; i++) { + pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); + outb(inbuf[i], 0x68); if (wait_on_ibf(0x6c, 0)) { printk(KERN_ERR "olpc-ec: timeout waiting for" " EC accept data!\n"); goto err; } - pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); - outb(inbuf[i], 0x68); } } if (outbuf && outlen) { -- cgit v1.1 From 05e33fc20ea5e493a2a1e7f1d04f43cdf89f83ed Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 5 Aug 2011 09:09:00 -0500 Subject: x86, UV: Remove UV delay in starting slave cpus Delete the 10 msec delay between the INIT and SIPI when starting slave cpus. I can find no requirement for this delay. BIOS also has similar code sequences without the delay. Removing the delay reduces boot time by 40 sec. Every bit helps. Signed-off-by: Jack Steiner Cc: Link: http://lkml.kernel.org/r/20110805140900.GA6774@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index adc66c3..34b1859 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -207,7 +207,6 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_INIT; uv_write_global_mmr64(pnode, UVH_IPI_INT, val); - mdelay(10); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | -- cgit v1.1 From 9387f774d61b01ab71bade85e6d0bfab0b3419bd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 6 Aug 2011 14:31:38 +0200 Subject: x86-32, amd: Move va_align definition to unbreak 32-bit build hpa reported that dfb09f9b7ab03fd367740e541a5caf830ed56726 breaks 32-bit builds with the following error message: /home/hpa/kernel/linux-tip.cpu/arch/x86/kernel/cpu/amd.c:437: undefined reference to `va_align' /home/hpa/kernel/linux-tip.cpu/arch/x86/kernel/cpu/amd.c:436: undefined reference to `va_align' This is due to the fact that va_align is a global in a 64-bit only compilation unit. Move it to mmap.c where it is visible to both subarches. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1312633899-1131-1-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/sys_x86_64.c | 4 ---- arch/x86/mm/mmap.c | 5 ++++- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index aaa8d09..fe7d2da 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -18,10 +18,6 @@ #include #include -struct __read_mostly va_alignment va_align = { - .flags = -1, -}; - /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. * diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index d4c0736..4b5ba85 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -31,6 +31,10 @@ #include #include +struct __read_mostly va_alignment va_align = { + .flags = -1, +}; + static unsigned int stack_maxrandom_size(void) { unsigned int max = 0; @@ -42,7 +46,6 @@ static unsigned int stack_maxrandom_size(void) return max; } - /* * Top of mmap area (just below the process stack). * -- cgit v1.1 From a34668f6beb4ab01e07683276d6a24bab6c175e0 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Tue, 2 Aug 2011 14:01:35 +0800 Subject: perf, x86: Add model 45 SandyBridge support Add support to Romely-EP SandyBridge. Signed-off-by: Youquan Song Signed-off-by: Anhua Xu Signed-off-by: Lin Ming Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1312264895-2010-1-git-send-email-youquan.song@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 45fbb8f..f88af2c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1590,6 +1590,7 @@ static __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ + case 45: /* SandyBridge, "Romely-EP" */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v1.1 From 10fe570fc16721d78afdba9689720094527c1ba3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 9 Aug 2011 13:02:50 -0400 Subject: Revert "xen/debug: WARN_ON when identity PFN has no _PAGE_IOMAP flag set." We don' use it anymore and there are more false positives. This reverts commit fc25151d9ac7d809239fe68de0a1490b504bb94a. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Kconfig | 8 -------- arch/x86/xen/mmu.c | 38 -------------------------------------- 2 files changed, 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 5cc821c..ae559fe 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -49,11 +49,3 @@ config XEN_DEBUG_FS help Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. - -config XEN_DEBUG - bool "Enable Xen debug checks" - depends on XEN - default n - help - Enable various WARN_ON checks in the Xen MMU code. - Enabling this option WILL incur a significant performance overhead. diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f987bde..3c9aecd 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -495,41 +495,6 @@ static pte_t xen_make_pte(pteval_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); -#ifdef CONFIG_XEN_DEBUG -pte_t xen_make_pte_debug(pteval_t pte) -{ - phys_addr_t addr = (pte & PTE_PFN_MASK); - phys_addr_t other_addr; - bool io_page = false; - pte_t _pte; - - if (pte & _PAGE_IOMAP) - io_page = true; - - _pte = xen_make_pte(pte); - - if (!addr) - return _pte; - - if (io_page && - (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { - other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT; - WARN_ONCE(addr != other_addr, - "0x%lx is using VM_IO, but it is 0x%lx!\n", - (unsigned long)addr, (unsigned long)other_addr); - } else { - pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP; - other_addr = (_pte.pte & PTE_PFN_MASK); - WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set), - "0x%lx is missing VM_IO (and wasn't fixed)!\n", - (unsigned long)addr); - } - - return _pte; -} -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug); -#endif - static pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); @@ -1988,9 +1953,6 @@ void __init xen_ident_map_ISA(void) static void __init xen_post_allocator_init(void) { -#ifdef CONFIG_XEN_DEBUG - pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); -#endif pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; -- cgit v1.1 From 5cdd174feab1b4a74afc494c447906274aed4a20 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 10 Aug 2011 11:49:56 +1000 Subject: x86, amd: Include elf.h explicitly, prepare the code for the module.h split When the moduleu.h splitting tree is merged to the latest tip:x86/cpu tree, the x86_64 allmodconfig build fails like this: arch/x86/kernel/cpu/amd.c: In function 'bsp_init_amd': arch/x86/kernel/cpu/amd.c:437:3: error: 'va_align' undeclared (first use in this function) arch/x86/kernel/cpu/amd.c:438:23: error: 'ALIGN_VA_32' undeclared (first use in this function) arch/x86/kernel/cpu/amd.c:438:37: error: 'ALIGN_VA_64' undeclared (first use in this function) This is caused by the module.h split up intreacting with commit dfb09f9b7ab0 ("x86, amd: Avoid cache aliasing penalties on AMD family 15h") from the tip:x86/cpu tree. I have added the following patch for today (this, or something similar, could be applied to the tip tree directly - the export.h include below was added by the module.h splitup). So include elf.h to use va_align and remove this implicit dependency on module.h doing it for us. Signed-off-by: Stephen Rothwell Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Paul Gortmaker Link: http://lkml.kernel.org/r/20110810114956.238d66772883636e3040d29f@canb.auug.org.au Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b6e3e87..13c6ec8 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1,5 +1,6 @@ #include #include +#include #include #include -- cgit v1.1 From 66be895158886a6cd816aa1eaa18965a5c522d8f Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 4 Aug 2011 20:19:25 +0200 Subject: crypto: sha1 - SSSE3 based SHA1 implementation for x86-64 This is an assembler implementation of the SHA1 algorithm using the Supplemental SSE3 (SSSE3) instructions or, when available, the Advanced Vector Extensions (AVX). Testing with the tcrypt module shows the raw hash performance is up to 2.3 times faster than the C implementation, using 8k data blocks on a Core 2 Duo T5500. For the smalest data set (16 byte) it is still 25% faster. Since this implementation uses SSE/YMM registers it cannot safely be used in every situation, e.g. while an IRQ interrupts a kernel thread. The implementation falls back to the generic SHA1 variant, if using the SSE/YMM registers is not possible. With this algorithm I was able to increase the throughput of a single IPsec link from 344 Mbit/s to 464 Mbit/s on a Core 2 Quad CPU using the SSSE3 variant -- a speedup of +34.8%. Saving and restoring SSE/YMM state might make the actual throughput fluctuate when there are FPU intensive userland applications running. For example, meassuring the performance using iperf2 directly on the machine under test gives wobbling numbers because iperf2 uses the FPU for each packet to check if the reporting interval has expired (in the above test I got min/max/avg: 402/484/464 MBit/s). Using this algorithm on a IPsec gateway gives much more reasonable and stable numbers, albeit not as high as in the directly connected case. Here is the result from an RFC 2544 test run with a EXFO Packet Blazer FTB-8510: frame size sha1-generic sha1-ssse3 delta 64 byte 37.5 MBit/s 37.5 MBit/s 0.0% 128 byte 56.3 MBit/s 62.5 MBit/s +11.0% 256 byte 87.5 MBit/s 100.0 MBit/s +14.3% 512 byte 131.3 MBit/s 150.0 MBit/s +14.2% 1024 byte 162.5 MBit/s 193.8 MBit/s +19.3% 1280 byte 175.0 MBit/s 212.5 MBit/s +21.4% 1420 byte 175.0 MBit/s 218.7 MBit/s +25.0% 1518 byte 150.0 MBit/s 181.2 MBit/s +20.8% The throughput for the largest frame size is lower than for the previous size because the IP packets need to be fragmented in this case to make there way through the IPsec tunnel. Signed-off-by: Mathias Krause Cc: Maxim Locktyukhin Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 8 + arch/x86/crypto/sha1_ssse3_asm.S | 558 ++++++++++++++++++++++++++++++++++++++ arch/x86/crypto/sha1_ssse3_glue.c | 240 ++++++++++++++++ arch/x86/include/asm/cpufeature.h | 3 + 4 files changed, 809 insertions(+) create mode 100644 arch/x86/crypto/sha1_ssse3_asm.S create mode 100644 arch/x86/crypto/sha1_ssse3_glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index c04f1b7..57c7f7b 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o +obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o @@ -25,3 +26,10 @@ salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o + +# enable AVX support only when $(AS) can actually assemble the instructions +ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes) +AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT +CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT +endif +sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S new file mode 100644 index 0000000..b2c2f57 --- /dev/null +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -0,0 +1,558 @@ +/* + * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental + * SSE3 instruction set extensions introduced in Intel Core Microarchitecture + * processors. CPUs supporting Intel(R) AVX extensions will get an additional + * boost. + * + * This work was inspired by the vectorized implementation of Dean Gaudet. + * Additional information on it can be found at: + * http://www.arctic.org/~dean/crypto/sha1.html + * + * It was improved upon with more efficient vectorization of the message + * scheduling. This implementation has also been optimized for all current and + * several future generations of Intel CPUs. + * + * See this article for more information about the implementation details: + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ + * + * Copyright (C) 2010, Intel Corp. + * Authors: Maxim Locktyukhin + * Ronen Zohar + * + * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: + * Author: Mathias Krause + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#define CTX %rdi // arg1 +#define BUF %rsi // arg2 +#define CNT %rdx // arg3 + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %ebp +#define REG_E %edx + +#define REG_T1 %eax +#define REG_T2 %ebx + +#define K_BASE %r8 +#define HASH_PTR %r9 +#define BUFFER_PTR %r10 +#define BUFFER_END %r11 + +#define W_TMP1 %xmm0 +#define W_TMP2 %xmm9 + +#define W0 %xmm1 +#define W4 %xmm2 +#define W8 %xmm3 +#define W12 %xmm4 +#define W16 %xmm5 +#define W20 %xmm6 +#define W24 %xmm7 +#define W28 %xmm8 + +#define XMM_SHUFB_BSWAP %xmm10 + +/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ +#define WK(t) (((t) & 15) * 4)(%rsp) +#define W_PRECALC_AHEAD 16 + +/* + * This macro implements the SHA-1 function's body for single 64-byte block + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + .global \name + .type \name, @function + .align 32 +\name: + push %rbx + push %rbp + push %r12 + + mov %rsp, %r12 + sub $64, %rsp # allocate workspace + and $~15, %rsp # align stack + + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + + shl $6, CNT # multiply by 64 + add BUF, CNT + mov CNT, BUFFER_END + + lea K_XMM_AR(%rip), K_BASE + xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + # cleanup workspace + mov $8, %ecx + mov %rsp, %rdi + xor %rax, %rax + rep stosq + + mov %r12, %rsp # deallocate workspace + + pop %r12 + pop %rbp + pop %rbx + ret + + .size \name, .-\name +.endm + +/* + * This macro implements 80 rounds of SHA-1 for one 64-byte block + */ +.macro SHA1_PIPELINED_MAIN_BODY + INIT_REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + .set i, 0 + .rept W_PRECALC_AHEAD + W_PRECALC i + .set i, (i+1) + .endr + +.align 4 +1: + RR F1,A,B,C,D,E,0 + RR F1,D,E,A,B,C,2 + RR F1,B,C,D,E,A,4 + RR F1,E,A,B,C,D,6 + RR F1,C,D,E,A,B,8 + + RR F1,A,B,C,D,E,10 + RR F1,D,E,A,B,C,12 + RR F1,B,C,D,E,A,14 + RR F1,E,A,B,C,D,16 + RR F1,C,D,E,A,B,18 + + RR F2,A,B,C,D,E,20 + RR F2,D,E,A,B,C,22 + RR F2,B,C,D,E,A,24 + RR F2,E,A,B,C,D,26 + RR F2,C,D,E,A,B,28 + + RR F2,A,B,C,D,E,30 + RR F2,D,E,A,B,C,32 + RR F2,B,C,D,E,A,34 + RR F2,E,A,B,C,D,36 + RR F2,C,D,E,A,B,38 + + RR F3,A,B,C,D,E,40 + RR F3,D,E,A,B,C,42 + RR F3,B,C,D,E,A,44 + RR F3,E,A,B,C,D,46 + RR F3,C,D,E,A,B,48 + + RR F3,A,B,C,D,E,50 + RR F3,D,E,A,B,C,52 + RR F3,B,C,D,E,A,54 + RR F3,E,A,B,C,D,56 + RR F3,C,D,E,A,B,58 + + add $64, BUFFER_PTR # move to the next 64-byte block + cmp BUFFER_END, BUFFER_PTR # if the current is the last one use + cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun + + RR F4,A,B,C,D,E,60 + RR F4,D,E,A,B,C,62 + RR F4,B,C,D,E,A,64 + RR F4,E,A,B,C,D,66 + RR F4,C,D,E,A,B,68 + + RR F4,A,B,C,D,E,70 + RR F4,D,E,A,B,C,72 + RR F4,B,C,D,E,A,74 + RR F4,E,A,B,C,D,76 + RR F4,C,D,E,A,B,78 + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), B + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + RESTORE_RENAMED_REGS + cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end + jne 1b +.endm + +.macro INIT_REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set T1, REG_T1 + .set T2, REG_T2 +.endm + +.macro RESTORE_RENAMED_REGS + # order is important (REG_C is where it should be) + mov B, REG_B + mov D, REG_D + mov A, REG_A + mov E, REG_E +.endm + +.macro SWAP_REG_NAMES a, b + .set _T, \a + .set \a, \b + .set \b, _T +.endm + +.macro F1 b, c, d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + xor \d, T1 + and \b, T1 + xor \d, T1 +.endm + +.macro F2 b, c, d + mov \d, T1 + SWAP_REG_NAMES \d, T1 + xor \c, T1 + xor \b, T1 +.endm + +.macro F3 b, c ,d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + mov \b, T2 + or \b, T1 + and \c, T2 + and \d, T1 + or T2, T1 +.endm + +.macro F4 b, c, d + F2 \b, \c, \d +.endm + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +/* + * RR does two rounds of SHA-1 back to back with W[] pre-calc + * t1 = F(b, c, d); e += w(i) + * e += t1; b <<= 30; d += w(i+1); + * t1 = F(a, b, c); + * d += t1; a <<= 5; + * e += a; + * t1 = e; a >>= 7; + * t1 <<= 5; + * d += t1; + */ +.macro RR F, a, b, c, d, e, round + add WK(\round), \e + \F \b, \c, \d # t1 = F(b, c, d); + W_PRECALC (\round + W_PRECALC_AHEAD) + rol $30, \b + add T1, \e + add WK(\round + 1), \d + + \F \a, \b, \c + W_PRECALC (\round + W_PRECALC_AHEAD + 1) + rol $5, \a + add \a, \e + add T1, \d + ror $7, \a # (a <>r 7) => a <= 80) && (i < (80 + W_PRECALC_AHEAD)))) + .set i, ((\r) % 80) # pre-compute for the next iteration + .if (i == 0) + W_PRECALC_RESET + .endif + W_PRECALC_00_15 + .elseif (i<32) + W_PRECALC_16_31 + .elseif (i < 80) // rounds 32-79 + W_PRECALC_32_79 + .endif +.endm + +.macro W_PRECALC_RESET + .set W, W0 + .set W_minus_04, W4 + .set W_minus_08, W8 + .set W_minus_12, W12 + .set W_minus_16, W16 + .set W_minus_20, W20 + .set W_minus_24, W24 + .set W_minus_28, W28 + .set W_minus_32, W +.endm + +.macro W_PRECALC_ROTATE + .set W_minus_32, W_minus_28 + .set W_minus_28, W_minus_24 + .set W_minus_24, W_minus_20 + .set W_minus_20, W_minus_16 + .set W_minus_16, W_minus_12 + .set W_minus_12, W_minus_08 + .set W_minus_08, W_minus_04 + .set W_minus_04, W + .set W, W_minus_32 +.endm + +.macro W_PRECALC_SSSE3 + +.macro W_PRECALC_00_15 + W_PRECALC_00_15_SSSE3 +.endm +.macro W_PRECALC_16_31 + W_PRECALC_16_31_SSSE3 +.endm +.macro W_PRECALC_32_79 + W_PRECALC_32_79_SSSE3 +.endm + +/* message scheduling pre-compute for rounds 0-15 */ +.macro W_PRECALC_00_15_SSSE3 + .if ((i & 3) == 0) + movdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + pshufb XMM_SHUFB_BSWAP, W_TMP1 + movdqa W_TMP1, W + .elseif ((i & 3) == 2) + paddd (K_BASE), W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 16-31 + * + * - calculating last 32 w[i] values in 8 XMM registers + * - pre-calculate K+w[i] values and store to mem, for later load by ALU add + * instruction + * + * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] + * dependency, but improves for 32-79 + */ +.macro W_PRECALC_16_31_SSSE3 + # blended scheduling of vector and scalar instruction streams, one 4-wide + # vector iteration / 4 scalar rounds + .if ((i & 3) == 0) + movdqa W_minus_12, W + palignr $8, W_minus_16, W # w[i-14] + movdqa W_minus_04, W_TMP1 + psrldq $4, W_TMP1 # w[i-3] + pxor W_minus_08, W + .elseif ((i & 3) == 1) + pxor W_minus_16, W_TMP1 + pxor W_TMP1, W + movdqa W, W_TMP2 + movdqa W, W_TMP1 + pslldq $12, W_TMP2 + .elseif ((i & 3) == 2) + psrld $31, W + pslld $1, W_TMP1 + por W, W_TMP1 + movdqa W_TMP2, W + psrld $30, W_TMP2 + pslld $2, W + .elseif ((i & 3) == 3) + pxor W, W_TMP1 + pxor W_TMP2, W_TMP1 + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 32-79 + * + * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken + */ +.macro W_PRECALC_32_79_SSSE3 + .if ((i & 3) == 0) + movdqa W_minus_04, W_TMP1 + pxor W_minus_28, W # W is W_minus_32 before xor + palignr $8, W_minus_08, W_TMP1 + .elseif ((i & 3) == 1) + pxor W_minus_16, W + pxor W_TMP1, W + movdqa W, W_TMP1 + .elseif ((i & 3) == 2) + psrld $30, W + pslld $2, W_TMP1 + por W, W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_SSSE3 + + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.section .rodata +.align 16 + +K_XMM_AR: + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + + +.section .text + +W_PRECALC_SSSE3 +.macro xmm_mov a, b + movdqu \a,\b +.endm + +/* SSSE3 optimized implementation: + * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws, + * unsigned int rounds); + */ +SHA1_VECTOR_ASM sha1_transform_ssse3 + +#ifdef SHA1_ENABLE_AVX_SUPPORT + +.macro W_PRECALC_AVX + +.purgem W_PRECALC_00_15 +.macro W_PRECALC_00_15 + W_PRECALC_00_15_AVX +.endm +.purgem W_PRECALC_16_31 +.macro W_PRECALC_16_31 + W_PRECALC_16_31_AVX +.endm +.purgem W_PRECALC_32_79 +.macro W_PRECALC_32_79 + W_PRECALC_32_79_AVX +.endm + +.macro W_PRECALC_00_15_AVX + .if ((i & 3) == 0) + vmovdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + vpshufb XMM_SHUFB_BSWAP, W_TMP1, W + .elseif ((i & 3) == 2) + vpaddd (K_BASE), W, W_TMP1 + .elseif ((i & 3) == 3) + vmovdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_16_31_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] + vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] + vpxor W_minus_08, W, W + vpxor W_minus_16, W_TMP1, W_TMP1 + .elseif ((i & 3) == 1) + vpxor W_TMP1, W, W + vpslldq $12, W, W_TMP2 + vpslld $1, W, W_TMP1 + .elseif ((i & 3) == 2) + vpsrld $31, W, W + vpor W, W_TMP1, W_TMP1 + vpslld $2, W_TMP2, W + vpsrld $30, W_TMP2, W_TMP2 + .elseif ((i & 3) == 3) + vpxor W, W_TMP1, W_TMP1 + vpxor W_TMP2, W_TMP1, W + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_32_79_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_08, W_minus_04, W_TMP1 + vpxor W_minus_28, W, W # W is W_minus_32 before xor + .elseif ((i & 3) == 1) + vpxor W_minus_16, W_TMP1, W_TMP1 + vpxor W_TMP1, W, W + .elseif ((i & 3) == 2) + vpslld $2, W, W_TMP1 + vpsrld $30, W, W + vpor W, W_TMP1, W + .elseif ((i & 3) == 3) + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_AVX + +W_PRECALC_AVX +.purgem xmm_mov +.macro xmm_mov a, b + vmovdqu \a,\b +.endm + + +/* AVX optimized implementation: + * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws, + * unsigned int rounds); + */ +SHA1_VECTOR_ASM sha1_transform_avx + +#endif diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c new file mode 100644 index 0000000..f916499 --- /dev/null +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -0,0 +1,240 @@ +/* + * Cryptographic API. + * + * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using + * Supplemental SSE3 instructions. + * + * This file is based on sha1_generic.c + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald + * Copyright (c) Jean-Francois Dive + * Copyright (c) Mathias Krause + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, + unsigned int rounds); +#ifdef SHA1_ENABLE_AVX_SUPPORT +asmlinkage void sha1_transform_avx(u32 *digest, const char *data, + unsigned int rounds); +#endif + +static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); + + +static int sha1_ssse3_init(struct shash_desc *desc) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + *sctx = (struct sha1_state){ + .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, + }; + + return 0; +} + +static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len, unsigned int partial) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int done = 0; + + sctx->count += len; + + if (partial) { + done = SHA1_BLOCK_SIZE - partial; + memcpy(sctx->buffer + partial, data, done); + sha1_transform_asm(sctx->state, sctx->buffer, 1); + } + + if (len - done >= SHA1_BLOCK_SIZE) { + const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; + + sha1_transform_asm(sctx->state, data + done, rounds); + done += rounds * SHA1_BLOCK_SIZE; + } + + memcpy(sctx->buffer, data + done, len - done); + + return 0; +} + +static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; + int res; + + /* Handle the fast case right here */ + if (partial + len < SHA1_BLOCK_SIZE) { + sctx->count += len; + memcpy(sctx->buffer + partial, data, len); + + return 0; + } + + if (!irq_fpu_usable()) { + res = crypto_sha1_update(desc, data, len); + } else { + kernel_fpu_begin(); + res = __sha1_ssse3_update(desc, data, len, partial); + kernel_fpu_end(); + } + + return res; +} + + +/* Add padding and return the message digest. */ +static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int i, index, padlen; + __be32 *dst = (__be32 *)out; + __be64 bits; + static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; + + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64 and append length */ + index = sctx->count % SHA1_BLOCK_SIZE; + padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index); + if (!irq_fpu_usable()) { + crypto_sha1_update(desc, padding, padlen); + crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits)); + } else { + kernel_fpu_begin(); + /* We need to fill a whole block for __sha1_ssse3_update() */ + if (padlen <= 56) { + sctx->count += padlen; + memcpy(sctx->buffer + index, padding, padlen); + } else { + __sha1_ssse3_update(desc, padding, padlen, index); + } + __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56); + kernel_fpu_end(); + } + + /* Store state in digest */ + for (i = 0; i < 5; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Wipe context */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int sha1_ssse3_export(struct shash_desc *desc, void *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha1_ssse3_import(struct shash_desc *desc, const void *in) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static struct shash_alg alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_ssse3_init, + .update = sha1_ssse3_update, + .final = sha1_ssse3_final, + .export = sha1_ssse3_export, + .import = sha1_ssse3_import, + .descsize = sizeof(struct sha1_state), + .statesize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name= "sha1-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +#ifdef SHA1_ENABLE_AVX_SUPPORT +static bool __init avx_usable(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) + return false; + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + pr_info("AVX detected but unusable.\n"); + + return false; + } + + return true; +} +#endif + +static int __init sha1_ssse3_mod_init(void) +{ + /* test for SSSE3 first */ + if (cpu_has_ssse3) + sha1_transform_asm = sha1_transform_ssse3; + +#ifdef SHA1_ENABLE_AVX_SUPPORT + /* allow AVX to override SSSE3, it's a little faster */ + if (avx_usable()) + sha1_transform_asm = sha1_transform_avx; +#endif + + if (sha1_transform_asm) { + pr_info("Using %s optimized SHA-1 implementation\n", + sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3" + : "AVX"); + return crypto_register_shash(&alg); + } + pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + + return -ENODEV; +} + +static void __exit sha1_ssse3_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(sha1_ssse3_mod_init); +module_exit(sha1_ssse3_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); + +MODULE_ALIAS("sha1"); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 4258aac..48a93ef 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -257,7 +257,9 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) #define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) #define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) +#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) +#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) #define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) @@ -285,6 +287,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) #define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) -- cgit v1.1 From f3fb5b7bb70d6e679c15fef85707810a067f5fb6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2011 11:15:30 -0400 Subject: x86: Remove unnecessary compile flag tweaks for vsyscall code As of commit 98d0ac38ca7b1b7a552c9a2359174ff84decb600 Author: Andy Lutomirski Date: Thu Jul 14 06:47:22 2011 -0400 x86-64: Move vread_tsc and vread_hpet into the vDSO user code no longer directly calls into code in arch/x86/kernel/, so we don't need compile flag hacks to make it safe. All vdso code is in the vdso directory now. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/835cd05a4c7740544d09723d6ba48f4406f9826c.1312988155.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/Makefile | 13 ------------- arch/x86/kernel/vsyscall_64.c | 3 --- 2 files changed, 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2deef3d..3d1ac39 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,19 +17,6 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif -# -# vsyscalls (which work on the user stack) should have -# no stack-protector checks: -# -nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) -CFLAGS_hpet.o := $(nostackp) -CFLAGS_paravirt.o := $(nostackp) -GCOV_PROFILE_vsyscall_64.o := n -GCOV_PROFILE_hpet.o := n -GCOV_PROFILE_tsc.o := n -GCOV_PROFILE_paravirt.o := n - obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 93a0d46..bf8e9ff 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,9 +18,6 @@ * use the vDSO. */ -/* Disable profiling for userspace code: */ -#define DISABLE_BRANCH_PROFILING - #include #include #include -- cgit v1.1 From fce8dc06423d6fb2709469dc5c55b04e09c1d126 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2011 11:15:31 -0400 Subject: x86-64: Wire up getcpu syscall getcpu is available as a vdso entry and an emulated vsyscall. Programs that for some reason don't want to use the vdso should still be able to call getcpu without relying on the slow emulated vsyscall. It costs almost nothing to expose it as a real syscall. We also need this for the following patch in vsyscall=native mode. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/6b19f55bdb06a0c32c2fa6dba9b6f222e1fde999.1312988155.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/unistd_64.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 705bf13..d92641c 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -681,6 +681,8 @@ __SYSCALL(__NR_syncfs, sys_syncfs) __SYSCALL(__NR_sendmmsg, sys_sendmmsg) #define __NR_setns 308 __SYSCALL(__NR_setns, sys_setns) +#define __NR_getcpu 309 +__SYSCALL(__NR_getcpu, sys_getcpu) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR -- cgit v1.1 From 3ae36655b97a03fa1decf72f04078ef945647c1a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2011 11:15:32 -0400 Subject: x86-64: Rework vsyscall emulation and add vsyscall= parameter There are three choices: vsyscall=native: Vsyscalls are native code that issues the corresponding syscalls. vsyscall=emulate (default): Vsyscalls are emulated by instruction fault traps, tested in the bad_area path. The actual contents of the vsyscall page is the same as the vsyscall=native case except that it's marked NX. This way programs that make assumptions about what the code in the page does will not be confused when they read that code. vsyscall=none: Trying to execute a vsyscall will segfault. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/8449fb3abf89851fd6b2260972666a6f82542284.1312988155.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/irq_vectors.h | 4 -- arch/x86/include/asm/traps.h | 2 - arch/x86/include/asm/vsyscall.h | 6 +++ arch/x86/kernel/entry_64.S | 1 - arch/x86/kernel/traps.c | 6 --- arch/x86/kernel/vmlinux.lds.S | 33 ---------------- arch/x86/kernel/vsyscall_64.c | 79 +++++++++++++++++++++++--------------- arch/x86/kernel/vsyscall_emu_64.S | 36 ++++++++++------- arch/x86/mm/fault.c | 12 ++++++ 9 files changed, 90 insertions(+), 89 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index a563c50..2c224e1 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -17,7 +17,6 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vector 204 : legacy x86_64 vsyscall emulation * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts * @@ -51,9 +50,6 @@ #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 #endif -#ifdef CONFIG_X86_64 -# define VSYSCALL_EMU_VECTOR 0xcc -#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 2bae0a5..0012d09 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -40,7 +40,6 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); -asmlinkage void emulate_vsyscall(void); dotraplinkage void do_divide_error(struct pt_regs *, long); dotraplinkage void do_debug(struct pt_regs *, long); @@ -67,7 +66,6 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long); dotraplinkage void do_machine_check(struct pt_regs *, long); #endif dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); -dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); #endif diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 6010707..eaea1d3 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -27,6 +27,12 @@ extern struct timezone sys_tz; extern void map_vsyscall(void); +/* + * Called on instruction fetch fault in vsyscall page. + * Returns true if handled. + */ +extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e949793..46792d9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1123,7 +1123,6 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug zeroentry coprocessor_error do_coprocessor_error errorentry alignment_check do_alignment_check zeroentry simd_coprocessor_error do_simd_coprocessor_error -zeroentry emulate_vsyscall do_emulate_vsyscall /* Reload gs selector with exception handling */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fbc097a..b9b6716 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -872,12 +872,6 @@ void __init trap_init(void) set_bit(SYSCALL_VECTOR, used_vectors); #endif -#ifdef CONFIG_X86_64 - BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors)); - set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall); - set_bit(VSYSCALL_EMU_VECTOR, used_vectors); -#endif - /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8f3a265..0f703f1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -71,7 +71,6 @@ PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(6); /* RW_ */ #ifdef CONFIG_X86_64 - user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(6); /* RW_ */ #endif @@ -174,38 +173,6 @@ SECTIONS . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); -#define VSYSCALL_ADDR (-10*1024*1024) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - __vsyscall_0 = .; - - . = VSYSCALL_ADDR; - .vsyscall : AT(VLOAD(.vsyscall)) { - /* work around gold bug 13023 */ - __vsyscall_beginning_hack = .; - *(.vsyscall_0) - - . = __vsyscall_beginning_hack + 1024; - *(.vsyscall_1) - - . = __vsyscall_beginning_hack + 2048; - *(.vsyscall_2) - - . = __vsyscall_beginning_hack + 4096; /* Pad the whole page. */ - } :user =0xcc - . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); - -#undef VSYSCALL_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - #endif /* CONFIG_X86_64 */ /* Init code and data - will be freed after init */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index bf8e9ff..18ae83d 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -56,6 +56,27 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), }; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; + +static int __init vsyscall_setup(char *str) +{ + if (str) { + if (!strcmp("emulate", str)) + vsyscall_mode = EMULATE; + else if (!strcmp("native", str)) + vsyscall_mode = NATIVE; + else if (!strcmp("none", str)) + vsyscall_mode = NONE; + else + return -EINVAL; + + return 0; + } + + return -EINVAL; +} +early_param("vsyscall", vsyscall_setup); + void update_vsyscall_tz(void) { unsigned long flags; @@ -100,7 +121,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", level, tsk->comm, task_pid_nr(tsk), - message, regs->ip - 2, regs->cs, + message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); } @@ -118,45 +139,39 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } -void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) +bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; int vsyscall_nr; long ret; - local_irq_enable(); + /* + * No point in checking CS -- the only way to get here is a user mode + * trap to a high address, which means that we're in 64-bit user code. + */ - if (!user_64bit_mode(regs)) { - /* - * If we trapped from kernel mode, we might as well OOPS now - * instead of returning to some random address and OOPSing - * then. - */ - BUG_ON(!user_mode(regs)); + WARN_ON_ONCE(address != regs->ip); - /* Compat mode and non-compat 32-bit CS should both segfault. */ - warn_bad_vsyscall(KERN_WARNING, regs, - "illegal int 0xcc from 32-bit mode"); - goto sigsegv; + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); + return false; } - /* - * x86-ism here: regs->ip points to the instruction after the int 0xcc, - * and int 0xcc is two bytes long. - */ - vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); + vsyscall_nr = addr_to_vsyscall_nr(address); trace_emulate_vsyscall(vsyscall_nr); if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, - "illegal int 0xcc (exploit attempt?)"); + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); goto sigsegv; } if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { - warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall with bad stack (exploit attempt?)"); goto sigsegv; } @@ -201,13 +216,11 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) regs->ip = caller; regs->sp += 8; - local_irq_disable(); - return; + return true; sigsegv: - regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */ force_sig(SIGSEGV, current); - local_irq_disable(); + return true; } /* @@ -255,15 +268,21 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) void __init map_vsyscall(void) { - extern char __vsyscall_0; - unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); extern char __vvar_page; unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); - /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ - __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) != + (unsigned long)VSYSCALL_START); + __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != + (unsigned long)VVAR_ADDRESS); } static int __init vsyscall_init(void) diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S index ffa845e..c9596a9 100644 --- a/arch/x86/kernel/vsyscall_emu_64.S +++ b/arch/x86/kernel/vsyscall_emu_64.S @@ -7,21 +7,31 @@ */ #include + #include +#include +#include + +__PAGE_ALIGNED_DATA + .globl __vsyscall_page + .balign PAGE_SIZE, 0xcc + .type __vsyscall_page, @object +__vsyscall_page: + + mov $__NR_gettimeofday, %rax + syscall + ret -/* The unused parts of the page are filled with 0xcc by the linker script. */ + .balign 1024, 0xcc + mov $__NR_time, %rax + syscall + ret -.section .vsyscall_0, "a" -ENTRY(vsyscall_0) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_0) + .balign 1024, 0xcc + mov $__NR_getcpu, %rax + syscall + ret -.section .vsyscall_1, "a" -ENTRY(vsyscall_1) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_1) + .balign 4096, 0xcc -.section .vsyscall_2, "a" -ENTRY(vsyscall_2) - int $VSYSCALL_EMU_VECTOR -END(vsyscall_2) + .size __vsyscall_page, 4096 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c1d0182..e58935c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -720,6 +720,18 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; +#ifdef CONFIG_X86_64 + /* + * Instruction fetch faults in the vsyscall page might need + * emulation. + */ + if (unlikely((error_code & PF_INSTR) && + ((address & ~0xfff) == VSYSCALL_START))) { + if (emulate_vsyscall(regs, address)) + return; + } +#endif + if (unlikely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); -- cgit v1.1 From 7fdba1ca10462f42ad2246b918fe6368f5ce488e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jul 2011 13:41:54 +0200 Subject: perf, x86: Avoid kfree() in CPU_STARTING On -rt kfree() can schedule, but CPU_STARTING is before the CPU is fully up and running. These are contradictory, so avoid it. Instead push the kfree() to CPU_ONLINE where we're free to schedule. Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-kwd4j6ayld5thrscvaxgjquv@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 8 ++++++++ arch/x86/kernel/cpu/perf_event_amd.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4ee3abf..594d425 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -129,6 +129,8 @@ struct cpu_hw_events { * AMD specific bits */ struct amd_nb *amd_nb; + + void *kfree_on_online; }; #define __EVENT_CONSTRAINT(c, n, m, w) {\ @@ -1466,10 +1468,12 @@ static int __cpuinit x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int ret = NOTIFY_OK; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + cpuc->kfree_on_online = NULL; if (x86_pmu.cpu_prepare) ret = x86_pmu.cpu_prepare(cpu); break; @@ -1479,6 +1483,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) x86_pmu.cpu_starting(cpu); break; + case CPU_ONLINE: + kfree(cpuc->kfree_on_online); + break; + case CPU_DYING: if (x86_pmu.cpu_dying) x86_pmu.cpu_dying(cpu); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 941caa2..ee9436c 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -350,7 +350,7 @@ static void amd_pmu_cpu_starting(int cpu) continue; if (nb->nb_id == nb_id) { - kfree(cpuc->amd_nb); + cpuc->kfree_on_online = cpuc->amd_nb; cpuc->amd_nb = nb; break; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index f88af2c..3751494 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1362,7 +1362,7 @@ static void intel_pmu_cpu_starting(int cpu) pc = per_cpu(cpu_hw_events, i).shared_regs; if (pc && pc->core_id == core_id) { - kfree(cpuc->shared_regs); + cpuc->kfree_on_online = cpuc->shared_regs; cpuc->shared_regs = pc; break; } -- cgit v1.1 From cedf03bd9aa54d1d7a9065dddc9e76505f476b12 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Aug 2011 10:18:46 -0700 Subject: x86: fix mm/fault.c build arch/x86/mm/fault.c needs to include asm/vsyscall.h to fix a build error: arch/x86/mm/fault.c: In function '__bad_area_nosemaphore': arch/x86/mm/fault.c:728: error: 'VSYSCALL_START' undeclared (first use in this function) Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 247aae3..0d17c8c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -17,6 +17,7 @@ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ #include /* kmemcheck_*(), ... */ +#include /* * Page fault error code bits: -- cgit v1.1 From fab1167c4698e3ff11ebb06281d78def6c53728b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 15 Aug 2011 22:28:56 -0700 Subject: x86, vsyscall: Add missing to arch/x86/mm/fault.c arch/x86/mm/fault.c now depend on having the symbol VSYSCALL_START defined, which is best handled by including (it isn't unreasonable we may want other fixed addresses in this file in the future, and so it is cleaner than including directly.) This addresses an x86-64 allnoconfig build failure. On other configurations it was masked by an indirect path: -> -> -> ... however, the first such include is conditional on CONFIG_X86_LOCAL_APIC. Originally-by: Randy Dunlap Cc: Linus Torvalds Link: http://lkml.kernel.org/r/CA%2B55aFxsOMc9=p02r8-QhJ=h=Mqwckk4_Pnx9LQt5%2BfqMp_exQ@mail.gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 247aae3..f2d4c9d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -17,6 +17,7 @@ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ #include /* kmemcheck_*(), ... */ +#include /* VSYSCALL_START */ /* * Page fault error code bits: -- cgit v1.1 From df3d8ae1f8780166a16dd7d08b4842a4d5b5f2b4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 2 Aug 2011 12:54:31 -0700 Subject: KVM: uses TASKSTATS, depends on NET CONFIG_TASKSTATS just had a change to use netlink, including a change to "depends on NET". Since "select" does not follow dependencies, KVM also needs to depend on NET to prevent build errors when CONFIG_NET is not enabled. Sample of the reported "undefined reference" build errors: taskstats.c:(.text+0x8f686): undefined reference to `nla_put' taskstats.c:(.text+0x8f721): undefined reference to `nla_reserve' taskstats.c:(.text+0x8f8fb): undefined reference to `init_net' taskstats.c:(.text+0x8f905): undefined reference to `netlink_unicast' taskstats.c:(.text+0x8f934): undefined reference to `kfree_skb' taskstats.c:(.text+0x8f9e9): undefined reference to `skb_clone' taskstats.c:(.text+0x90060): undefined reference to `__alloc_skb' taskstats.c:(.text+0x901e9): undefined reference to `skb_put' taskstats.c:(.init.text+0x4665): undefined reference to `genl_register_family' taskstats.c:(.init.text+0x4699): undefined reference to `genl_register_ops' taskstats.c:(.init.text+0x4710): undefined reference to `genl_unregister_ops' taskstats.c:(.init.text+0x471c): undefined reference to `genl_unregister_family' Signed-off-by: Randy Dunlap Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 0a09b58..ff5790d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,6 +22,8 @@ config KVM depends on HAVE_KVM # for device assignment: depends on PCI + # for TASKSTATS/TASK_DELAY_ACCT: + depends on NET select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES -- cgit v1.1 From 298557db42eb2d6efca81669dc369425b46c5be6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 16 Aug 2011 23:39:53 +0200 Subject: oprofile, x86: Fix overflow and warning (commit 1d12d35) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following fixes for: 1d12d35 oprofile, x86: Convert memory allocation to static array Fix potential buffer overflow. Fix the following warning: arch/x86/oprofile/op_model_ppro.c: In function ‘ppro_check_ctrs’: arch/x86/oprofile/op_model_ppro.c:143: warning: label ‘out’ defined but not used Cc: Maarten Lankhorst Cc: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_ppro.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 608874b7..d90528e 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -140,7 +140,6 @@ static int ppro_check_ctrs(struct pt_regs * const regs, wrmsrl(msrs->counters[i].addr, -reset_value[i]); } -out: /* Only P6 based Pentium M need to re-unmask the apic vector but it * doesn't hurt other P6 variant */ apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); @@ -220,7 +219,7 @@ static void arch_perfmon_setup_counters(void) eax.split.bit_width = 40; } - num_counters = eax.split.num_counters; + num_counters = min((int)eax.split.num_counters, OP_MAX_COUNTER); op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; -- cgit v1.1 From ccbcdf7cf1b5f6c6db30d84095b9c6c53043af55 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Aug 2011 15:07:41 +0100 Subject: xen/x86: replace order-based range checking of M2P table by linear one The order-based approach is not only less efficient (requiring a shift and a compare, typical generated code looking like this mov eax, [machine_to_phys_order] mov ecx, eax shr ebx, cl test ebx, ebx jnz ... whereas a direct check requires just a compare, like in cmp ebx, [machine_to_phys_nr] jae ... ), but also slightly dangerous in the 32-on-64 case - the element address calculation can wrap if the next power of two boundary is sufficiently far away from the actual upper limit of the table, and hence can result in user space addresses being accessed (with it being unknown what may actually be mapped there). Additionally, the elimination of the mistaken use of fls() here (should have been __fls()) fixes a latent issue on x86-64 that would trigger if the code was run on a system with memory extending beyond the 44-bit boundary. CC: stable@kernel.org Signed-off-by: Jan Beulich [v1: Based on Jeremy's feedback] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 4 ++-- arch/x86/xen/enlighten.c | 4 ++-- arch/x86/xen/mmu.c | 12 ++++++++---- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 64a619d..7ff4669 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -39,7 +39,7 @@ typedef struct xpaddr { ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) extern unsigned long *machine_to_phys_mapping; -extern unsigned int machine_to_phys_order; +extern unsigned long machine_to_phys_nr; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); @@ -87,7 +87,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; - if (unlikely((mfn >> machine_to_phys_order) != 0)) { + if (unlikely(mfn >= machine_to_phys_nr)) { pfn = ~0; goto try_override; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 974a528..b960429 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(xen_domain_type); unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; EXPORT_SYMBOL(machine_to_phys_mapping); -unsigned int machine_to_phys_order; -EXPORT_SYMBOL(machine_to_phys_order); +unsigned long machine_to_phys_nr; +EXPORT_SYMBOL(machine_to_phys_nr); struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f987bde..24abc1f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1713,15 +1713,19 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) void __init xen_setup_machphys_mapping(void) { struct xen_machphys_mapping mapping; - unsigned long machine_to_phys_nr_ents; if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { machine_to_phys_mapping = (unsigned long *)mapping.v_start; - machine_to_phys_nr_ents = mapping.max_mfn + 1; + machine_to_phys_nr = mapping.max_mfn + 1; } else { - machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; + machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; } - machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); +#ifdef CONFIG_X86_32 + if ((machine_to_phys_mapping + machine_to_phys_nr) + < machine_to_phys_mapping) + machine_to_phys_nr = (unsigned long *)NULL + - machine_to_phys_mapping; +#endif } #ifdef CONFIG_X86_64 -- cgit v1.1 From 2f3aa7a06f6f48d6f78a90595b17e6beafa7abf6 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Tue, 16 Aug 2011 21:04:38 -0300 Subject: x86: jump_label: arch_jump_label_text_poke_early: add missing __init arch_jump_label_text_poke_early calls text_poke_early, which is an __init function. Thus arch_jump_label_text_poke_early should be the same. Signed-off-by: Kevin Winchester Cc: jbaron@redhat.com Cc: tglx@linutronix.de Cc: mingo@redhat.com Cc: hpa@zytor.com Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1313539478-30303-1-git-send-email-kjwinchester@gmail.com [ Use __init_or_module instead of __init ] Signed-off-by: Steven Rostedt --- arch/x86/kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 3fee346..cacdd46 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -42,7 +42,7 @@ void arch_jump_label_transform(struct jump_entry *entry, put_online_cpus(); } -void arch_jump_label_text_poke_early(jump_label_t addr) +void __init_or_module arch_jump_label_text_poke_early(jump_label_t addr) { text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); -- cgit v1.1 From 3c05c4bed4ccce3f22f6d7899b308faae24ad198 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 17 Aug 2011 15:15:00 +0200 Subject: xen: Do not enable PV IPIs when vector callback not present Fix regression for HVM case on older (<4.1.1) hypervisors caused by commit 99bbb3a84a99cd04ab16b998b20f01a72cfa9f4f Author: Stefano Stabellini Date: Thu Dec 2 17:55:10 2010 +0000 xen: PV on HVM: support PV spinlocks and IPIs This change replaced the SMP operations with event based handlers without taking into account that this only works when the hypervisor supports callback vectors. This causes unexplainable hangs early on boot for HVM guests with more than one CPU. BugLink: http://bugs.launchpad.net/bugs/791850 CC: stable@kernel.org Signed-off-by: Stefan Bader Signed-off-by: Stefano Stabellini Tested-and-Reported-by: Stefan Bader Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index b4533a8..e79dbb9 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -521,8 +521,6 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) native_smp_prepare_cpus(max_cpus); WARN_ON(xen_smp_intr_init(0)); - if (!xen_have_vector_callback) - return; xen_init_lock_cpu(0); xen_init_spinlocks(); } @@ -546,6 +544,8 @@ static void xen_hvm_cpu_die(unsigned int cpu) void __init xen_hvm_smp_init(void) { + if (!xen_have_vector_callback) + return; smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; smp_ops.smp_send_reschedule = xen_smp_send_reschedule; smp_ops.cpu_up = xen_hvm_cpu_up; -- cgit v1.1 From 60c5f08e154fd235056645e050f2cd5671b19125 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Aug 2011 13:17:20 -0700 Subject: xen/tracing: Fix tracing config option properly Steven Rostedt says we should use CONFIG_EVENT_TRACING. Cc:Steven Rostedt Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3326204..add2c2d 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -15,7 +15,7 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ grant-table.o suspend.o platform-pci-unplug.o \ p2m.o -obj-$(CONFIG_FTRACE) += trace.o +obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o -- cgit v1.1 From be604e695f9a0db4cd5c4c232857a10c6bc9dee4 Mon Sep 17 00:00:00 2001 From: Arun Thomas Date: Fri, 19 Aug 2011 21:42:23 +0200 Subject: x86, cpu: Add cpufeature flag for PCIDs This patch add a flag for Process-Context Identifiers (PCIDs) aka Address Space Identifiers (ASIDs) aka Tagged TLB support. Signed-off-by: Arun Thomas Link: http://lkml.kernel.org/r/1313782943-3898-1-git-send-email-arun.thomas@gmail.com Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 4258aac..f24ed51 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -114,6 +114,7 @@ #define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */ #define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */ #define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID (4*32+17) /* Process Context Identifiers */ #define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */ #define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ -- cgit v1.1 From 7a4e8beabcb5262547b364851cbdd995b6798be1 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 1 Aug 2011 23:07:51 +0200 Subject: x86, cleanup: Remove unneeded version.h include from arch/x86/ It was pointed out by 'make versioncheck' that the include of linux/version.h is not needed in arch/x86/mm/mmio-mod.c . This patch removes it. Signed-off-by: Jesper Juhl Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1108012305570.31999@swampdragon.chaosbits.net Signed-off-by: H. Peter Anvin --- arch/x86/mm/mmio-mod.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 67421f3..de54b9b 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include -- cgit v1.1 From c812d8f7ad27bd5f1ed1c790eea8527b6f8ff1c7 Mon Sep 17 00:00:00 2001 From: Zhao Jin Date: Sat, 20 Aug 2011 21:24:57 +0800 Subject: x86, mm, trivial: Remove unnecessary get_order() in free_thread_info() Because THREAD_SIZE is defined as PAGE_SIZE << THREAD_ORDER on x86, the call of get_order(THREAD_SIZE) can be replaced with THREAD_ORDER. Signed-off-by: Zhao Jin Link: http://lkml.kernel.org/r/4E4FB5A9.700@gmail.com Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e3b01..b9b3b1a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -49,7 +49,7 @@ void free_thread_xstate(struct task_struct *tsk) void free_thread_info(struct thread_info *ti) { free_thread_xstate(ti->task); - free_pages((unsigned long)ti, get_order(THREAD_SIZE)); + free_pages((unsigned long)ti, THREAD_ORDER); } void arch_task_cache_init(void) -- cgit v1.1 From 7ca0758cdb7c241cb4e0490a8d95f0eb5b861daf Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 22 Aug 2011 13:27:06 -0700 Subject: x86-32, vdso: On system call restart after SYSENTER, use int $0x80 When we enter a 32-bit system call via SYSENTER or SYSCALL, we shuffle the arguments to match the int $0x80 calling convention. This was probably a design mistake, but it's what it is now. This causes errors if the system call as to be restarted. For SYSENTER, we have to invoke the instruction from the vdso as the return address is hardcoded. Accordingly, we can simply replace the jump in the vdso with an int $0x80 instruction and use the slower entry point for a post-restart. Suggested-by: Linus Torvalds Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/CA%2B55aFztZ=r5wa0x26KJQxvZOaQq8s2v3u50wCyJcA-Sc4g8gQ@mail.gmail.com Cc: --- arch/x86/vdso/vdso32/sysenter.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso32/sysenter.S b/arch/x86/vdso/vdso32/sysenter.S index e2800af..e354bce 100644 --- a/arch/x86/vdso/vdso32/sysenter.S +++ b/arch/x86/vdso/vdso32/sysenter.S @@ -43,7 +43,7 @@ __kernel_vsyscall: .space 7,0x90 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - jmp .Lenter_kernel + int $0x80 /* 16: System call normal return point is here! */ VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ pop %ebp -- cgit v1.1 From f1c39625d63c9f8eba8f036429c10a9cb9e32920 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 24 Aug 2011 09:54:24 -0700 Subject: xen: use non-tracing preempt in xen_clocksource_read() The tracing code used sched_clock() to get tracing timestamps, which ends up calling xen_clocksource_read(). xen_clocksource_read() must disable preemption, but if preemption tracing is enabled, this results in infinite recursion. I've only noticed this when boot-time tracing tests are enabled, but it seems like a generic bug. It looks like it would also affect kvm_clocksource_read(). Reported-by: Konrad Rzeszutek Wilk Signed-off-by: Jeremy Fitzhardinge Cc: Avi Kivity Cc: Marcelo Tosatti --- arch/x86/xen/time.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 5158c50..163b467 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -168,9 +168,10 @@ cycle_t xen_clocksource_read(void) struct pvclock_vcpu_time_info *src; cycle_t ret; - src = &get_cpu_var(xen_vcpu)->time; + preempt_disable_notrace(); + src = &__get_cpu_var(xen_vcpu)->time; ret = pvclock_clocksource_read(src); - put_cpu_var(xen_vcpu); + preempt_enable_notrace(); return ret; } -- cgit v1.1 From 7e49b1c8c6d64f55ac83e1f5901b22fa9e51ab80 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 24 Aug 2011 10:19:44 -0400 Subject: x86-64, unistd: Remove bogus __IGNORE_getcpu The change: commit fce8dc06423d6fb2709469dc5c55b04e09c1d126 Author: Andy Lutomirski Date: Wed Aug 10 11:15:31 2011 -0400 x86-64: Wire up getcpu syscall added getcpu as a real syscall, so we shouldn't ignore it any more. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/b4cb60ef45db3a675a0e2b9d51bcb022b0a9ab9c.1314195481.git.luto@mit.edu Reported-by: H.J. Lu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/unistd_64.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index d92641c..940d720 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -624,7 +624,6 @@ __SYSCALL(__NR_vmsplice, sys_vmsplice) __SYSCALL(__NR_move_pages, sys_move_pages) #define __NR_utimensat 280 __SYSCALL(__NR_utimensat, sys_utimensat) -#define __IGNORE_getcpu /* implemented as a vsyscall */ #define __NR_epoll_pwait 281 __SYSCALL(__NR_epoll_pwait, sys_epoll_pwait) #define __NR_signalfd 282 -- cgit v1.1 From cbbfa38fcb95930babc5233cf6927ec430f38abc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 Aug 2011 19:46:56 +0200 Subject: mtrr: fix UP breakage caused during switch to stop_machine While removing custom rendezvous code and switching to stop_machine, commit 192d8857427d ("x86, mtrr: use stop_machine APIs for doing MTRR rendezvous") completely dropped mtrr setting code on !CONFIG_SMP breaking MTRR settting on UP. Fix it by removing the incorrect CONFIG_SMP. Signed-off-by: Tejun Heo Reported-by: Anders Eriksson Tested-and-acked-by: Suresh Siddha Acked-by: H. Peter Anvin Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mtrr/main.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 08119a3..6b96110 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -149,7 +149,6 @@ struct set_mtrr_data { */ static int mtrr_rendezvous_handler(void *info) { -#ifdef CONFIG_SMP struct set_mtrr_data *data = info; /* @@ -171,7 +170,6 @@ static int mtrr_rendezvous_handler(void *info) } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { mtrr_if->set_all(); } -#endif return 0; } -- cgit v1.1 From b4ca46e4e82a0a5976fe5eab85be585d75f8202f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Aug 2011 16:10:33 -0400 Subject: x86-32: Fix boot with CONFIG_X86_INVD_BUG entry_32.S contained a hardcoded alternative instruction entry, and the format changed in commit 59e97e4d6fbc ("x86: Make alternative instruction pointers relative"). Replace the hardcoded entry with the altinstruction_entry macro. This fixes the 32-bit boot with CONFIG_X86_INVD_BUG=y. Reported-and-tested-by: Arnaud Lacombe Signed-off-by: Andy Lutomirski Cc: Peter Anvin Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/kernel/entry_32.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 5c1a9197..f3f6f53 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -54,6 +54,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -873,12 +874,7 @@ ENTRY(simd_coprocessor_error) 661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" - .balign 4 - .long 661b - .long 663f - .word X86_FEATURE_XMM - .byte 662b-661b - .byte 664f-663f + altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f .previous .section .altinstr_replacement,"ax" 663: pushl $do_simd_coprocessor_error -- cgit v1.1 From 5289d3d160e8d5c63974502696ab5def71891f18 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Thu, 25 Aug 2011 09:49:01 -0700 Subject: Staging: hv: vmbus: Retry vmbus_post_msg() before giving up The function hv_post_msg() can fail because of transient resource conditions. It may be useful to retry the operation. Signed-off-by: K. Y. Srinivasan Signed-off-by: Haiyang Zhang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/hyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h index 5df477a..b80420b 100644 --- a/arch/x86/include/asm/hyperv.h +++ b/arch/x86/include/asm/hyperv.h @@ -189,5 +189,6 @@ #define HV_STATUS_INVALID_HYPERCALL_CODE 2 #define HV_STATUS_INVALID_HYPERCALL_INPUT 3 #define HV_STATUS_INVALID_ALIGNMENT 4 +#define HV_STATUS_INSUFFICIENT_BUFFERS 19 #endif -- cgit v1.1 From a94cc4e6c0a26a7c8f79a432ab2c89534aa674d5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 26 Aug 2011 12:20:59 +0100 Subject: sfi: table irq 0xFF means 'no interrupt' According to the SFI specification irq number 0xFF means device has no interrupt or interrupt attached via GPIO. Currently, we don't handle this special case and set irq field in *_board_info structs to 255. It leads to confusion in some drivers. Accelerometer driver tries to register interrupt 255, fails and prints "Cannot get IRQ" to dmesg. Signed-off-by: Kirill A. Shutemov Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- arch/x86/platform/mrst/mrst.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 7000e74b..58425ad 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -689,7 +689,9 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) irq_attr.trigger = 1; irq_attr.polarity = 1; io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr); - } + } else + pentry->irq = 0; /* No irq */ + switch (pentry->type) { case SFI_DEV_TYPE_IPC: /* ID as IRQ is a hack that will go away */ -- cgit v1.1 From efe3ed9837fd2f9679659673f1d2078f1597bf18 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 26 Aug 2011 11:25:14 +0100 Subject: x86/mrst: Add platform data for Max3110 devices Those info will be used when spi controller driver setup max3110 as a slave device Signed-off-by: Feng Tang Signed-off-by: Alan Cox Signed-off-by: Dirk Brandewie Signed-off-by: Greg Kroah-Hartman --- arch/x86/platform/mrst/mrst.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 7000e74b..b8da1b9 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -14,6 +14,8 @@ #include #include +#include +#include #include #include #include @@ -392,6 +394,7 @@ static void __init *max3111_platform_data(void *info) struct spi_board_info *spi_info = info; int intr = get_gpio_by_name("max3111_int"); + spi_info->mode = SPI_MODE_0; if (intr == -1) return NULL; spi_info->irq = intr + MRST_IRQ_OFFSET; -- cgit v1.1 From f5b940997397229975ea073679b03967932a541b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 26 Aug 2011 18:03:11 -0400 Subject: All Arch: remove linkage for sys_nfsservctl system call The nfsservctl system call is now gone, so we should remove all linkage for it. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/include/asm/unistd_64.h | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a0e866d..54edb207 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -672,7 +672,7 @@ ia32_sys_call_table: .quad sys32_vm86_warning /* vm86 */ .quad quiet_ni_syscall /* query_module */ .quad sys_poll - .quad compat_sys_nfsservctl + .quad quiet_ni_syscall /* old nfsservctl */ .quad sys_setresgid16 /* 170 */ .quad sys_getresgid16 .quad sys_prctl diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index d92641c..2010405 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -414,7 +414,7 @@ __SYSCALL(__NR_query_module, sys_ni_syscall) __SYSCALL(__NR_quotactl, sys_quotactl) #define __NR_nfsservctl 180 -__SYSCALL(__NR_nfsservctl, sys_nfsservctl) +__SYSCALL(__NR_nfsservctl, sys_ni_syscall) /* reserved for LiS/STREAMS */ #define __NR_getpmsg 181 diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index fbb0a04..bc19be3 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -168,7 +168,7 @@ ENTRY(sys_call_table) .long ptregs_vm86 .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* Old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl -- cgit v1.1 From 4009338d62e9d1c21b230bd7ebab6c43c482430e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 18 Aug 2011 11:33:39 -0700 Subject: x86, cmpxchg: has LOCK_PREFIX Not . Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4E5BCC40.3030501@goop.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cmpxchg_32.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 3deb725..024b694 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_CMPXCHG_32_H #define _ASM_X86_CMPXCHG_32_H -#include /* for LOCK_PREFIX */ +#include /* Provides LOCK_PREFIX */ /* * Note: if you use set64_bit(), __cmpxchg64(), or their variants, you -- cgit v1.1 From 416185bd5ac8a749ee43d1b0967b43782843d1a0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 18 Aug 2011 11:34:46 -0700 Subject: x86, cmpxchg: Move 32-bit __cmpxchg_wrong_size to match 64 bit. Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4E5BCC40.3030501@goop.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cmpxchg_32.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 024b694..59d8e36 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -9,6 +9,7 @@ */ extern void __xchg_wrong_size(void); +extern void __cmpxchg_wrong_size(void); /* * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. @@ -84,8 +85,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) : "memory"); } -extern void __cmpxchg_wrong_size(void); - /* * Atomic compare and exchange. Compare OLD with MEM, if identical, * store NEW in MEM. Return the initial value in MEM. Success is -- cgit v1.1 From 00a41546e8008b9944382eed1841c785f4fc8d9c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 18 Aug 2011 11:40:22 -0700 Subject: x86, cmpxchg: Move 64-bit set64_bit() to match 32-bit Reduce arbitrary differences between 32 and 64 bits. Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4E5BCC40.3030501@goop.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cmpxchg_64.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 7cf5c0a..5bfa560 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -3,11 +3,6 @@ #include /* Provides LOCK_PREFIX */ -static inline void set_64bit(volatile u64 *ptr, u64 val) -{ - *ptr = val; -} - extern void __xchg_wrong_size(void); extern void __cmpxchg_wrong_size(void); @@ -66,6 +61,11 @@ extern void __cmpxchg_wrong_size(void); #define xchg(ptr, v) \ __xchg((v), (ptr), sizeof(*ptr)) +static inline void set_64bit(volatile u64 *ptr, u64 val) +{ + *ptr = val; +} + #define __HAVE_ARCH_CMPXCHG 1 /* -- cgit v1.1 From e9826380d83d1bda3ee5663bf3fa4667a6fbe60a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 18 Aug 2011 11:48:06 -0700 Subject: x86, cmpxchg: Unify cmpxchg into cmpxchg.h Everything that's actually common between 32 and 64-bit is moved into cmpxchg.h. xchg/cmpxchg will fail with a link error if they're passed an unsupported size (which includes 64-bit args on 32-bit systems). Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4E5BCC40.3030501@goop.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cmpxchg.h | 155 ++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/cmpxchg_32.h | 113 --------------------------- arch/x86/include/asm/cmpxchg_64.h | 131 -------------------------------- 3 files changed, 155 insertions(+), 244 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index a460fa0..efe3ec7 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -1,5 +1,160 @@ +#ifndef ASM_X86_CMPXCHG_H +#define ASM_X86_CMPXCHG_H + +#include /* Provides LOCK_PREFIX */ + +/* Non-existant functions to indicate usage errors at link time. */ +extern void __xchg_wrong_size(void); +extern void __cmpxchg_wrong_size(void); + +/* + * Constants for operation sizes. On 32-bit, the 64-bit size it set to + * -1 because sizeof will never return -1, thereby making those switch + * case statements guaranteeed dead code which the compiler will + * eliminate, and allowing the "missing symbol in the default case" to + * indicate a usage error. + */ +#define __X86_CASE_B 1 +#define __X86_CASE_W 2 +#define __X86_CASE_L 4 +#ifdef CONFIG_64BIT +#define __X86_CASE_Q 8 +#else +#define __X86_CASE_Q -1 /* sizeof will never return -1 */ +#endif + +/* + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. + * Since this is generally used to protect other memory information, we + * use "asm volatile" and "memory" clobbers to prevent gcc from moving + * information around. + */ +#define __xchg(x, ptr, size) \ +({ \ + __typeof(*(ptr)) __x = (x); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile("xchgb %0,%1" \ + : "=q" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile("xchgw %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile("xchgl %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ + asm volatile("xchgq %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + default: \ + __xchg_wrong_size(); \ + } \ + __x; \ +}) + +#define xchg(ptr, v) \ + __xchg((v), (ptr), sizeof(*ptr)) + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. + */ +#define __raw_cmpxchg(ptr, old, new, size, lock) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "q" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ + asm volatile(lock "cmpxchgq %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + __ret; \ +}) + +#define __cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) + +#define __sync_cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") + +#define __cmpxchg_local(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "") + #ifdef CONFIG_X86_32 # include "cmpxchg_32.h" #else # include "cmpxchg_64.h" #endif + +#ifdef __HAVE_ARCH_CMPXCHG +#define cmpxchg(ptr, old, new) \ + __cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define sync_cmpxchg(ptr, old, new) \ + __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define cmpxchg_local(ptr, old, new) \ + __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) +#endif + +#endif /* ASM_X86_CMPXCHG_H */ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 59d8e36..fbebb07 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -1,62 +1,11 @@ #ifndef _ASM_X86_CMPXCHG_32_H #define _ASM_X86_CMPXCHG_32_H -#include /* Provides LOCK_PREFIX */ - /* * Note: if you use set64_bit(), __cmpxchg64(), or their variants, you * you need to test for the feature in boot_cpu_data. */ -extern void __xchg_wrong_size(void); -extern void __cmpxchg_wrong_size(void); - -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. - * Since this is generally used to protect other memory information, we - * use "asm volatile" and "memory" clobbers to prevent gcc from moving - * information around. - */ -#define __xchg(x, ptr, size) \ -({ \ - __typeof(*(ptr)) __x = (x); \ - switch (size) { \ - case 1: \ - { \ - volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile("xchgb %0,%1" \ - : "=q" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case 2: \ - { \ - volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile("xchgw %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case 4: \ - { \ - volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile("xchgl %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - default: \ - __xchg_wrong_size(); \ - } \ - __x; \ -}) - -#define xchg(ptr, v) \ - __xchg((v), (ptr), sizeof(*ptr)) - /* * CMPXCHG8B only writes to the target if we had the previous * value in registers, otherwise it acts as a read and gives us the @@ -85,70 +34,8 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) : "memory"); } -/* - * Atomic compare and exchange. Compare OLD with MEM, if identical, - * store NEW in MEM. Return the initial value in MEM. Success is - * indicated by comparing RETURN with OLD. - */ -#define __raw_cmpxchg(ptr, old, new, size, lock) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (old); \ - __typeof__(*(ptr)) __new = (new); \ - switch (size) { \ - case 1: \ - { \ - volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile(lock "cmpxchgb %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "q" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - case 2: \ - { \ - volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile(lock "cmpxchgw %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "r" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - case 4: \ - { \ - volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile(lock "cmpxchgl %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "r" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - default: \ - __cmpxchg_wrong_size(); \ - } \ - __ret; \ -}) - -#define __cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) - -#define __sync_cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") - -#define __cmpxchg_local(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "") - #ifdef CONFIG_X86_CMPXCHG #define __HAVE_ARCH_CMPXCHG 1 - -#define cmpxchg(ptr, old, new) \ - __cmpxchg((ptr), (old), (new), sizeof(*ptr)) - -#define sync_cmpxchg(ptr, old, new) \ - __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) - -#define cmpxchg_local(ptr, old, new) \ - __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) #endif #ifdef CONFIG_X86_CMPXCHG64 diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 5bfa560..285da02 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -1,66 +1,6 @@ #ifndef _ASM_X86_CMPXCHG_64_H #define _ASM_X86_CMPXCHG_64_H -#include /* Provides LOCK_PREFIX */ - -extern void __xchg_wrong_size(void); -extern void __cmpxchg_wrong_size(void); - -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. - * Since this is generally used to protect other memory information, we - * use "asm volatile" and "memory" clobbers to prevent gcc from moving - * information around. - */ -#define __xchg(x, ptr, size) \ -({ \ - __typeof(*(ptr)) __x = (x); \ - switch (size) { \ - case 1: \ - { \ - volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile("xchgb %0,%1" \ - : "=q" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case 2: \ - { \ - volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile("xchgw %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case 4: \ - { \ - volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile("xchgl %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case 8: \ - { \ - volatile u64 *__ptr = (volatile u64 *)(ptr); \ - asm volatile("xchgq %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - default: \ - __xchg_wrong_size(); \ - } \ - __x; \ -}) - -#define xchg(ptr, v) \ - __xchg((v), (ptr), sizeof(*ptr)) - static inline void set_64bit(volatile u64 *ptr, u64 val) { *ptr = val; @@ -68,77 +8,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) #define __HAVE_ARCH_CMPXCHG 1 -/* - * Atomic compare and exchange. Compare OLD with MEM, if identical, - * store NEW in MEM. Return the initial value in MEM. Success is - * indicated by comparing RETURN with OLD. - */ -#define __raw_cmpxchg(ptr, old, new, size, lock) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (old); \ - __typeof__(*(ptr)) __new = (new); \ - switch (size) { \ - case 1: \ - { \ - volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile(lock "cmpxchgb %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "q" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - case 2: \ - { \ - volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile(lock "cmpxchgw %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "r" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - case 4: \ - { \ - volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile(lock "cmpxchgl %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "r" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - case 8: \ - { \ - volatile u64 *__ptr = (volatile u64 *)(ptr); \ - asm volatile(lock "cmpxchgq %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "r" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - default: \ - __cmpxchg_wrong_size(); \ - } \ - __ret; \ -}) - -#define __cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) - -#define __sync_cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") - -#define __cmpxchg_local(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "") - -#define cmpxchg(ptr, old, new) \ - __cmpxchg((ptr), (old), (new), sizeof(*ptr)) - -#define sync_cmpxchg(ptr, old, new) \ - __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) - -#define cmpxchg_local(ptr, old, new) \ - __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) - #define cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ -- cgit v1.1 From 433b3520616be694e0aa777089346c8718c91a7b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 21 Jun 2011 12:00:55 -0700 Subject: x86: Add xadd helper macro Add a common xadd implementation. Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4E5BCC40.3030501@goop.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cmpxchg.h | 43 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index efe3ec7..0d0d9cd 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -6,6 +6,7 @@ /* Non-existant functions to indicate usage errors at link time. */ extern void __xchg_wrong_size(void); extern void __cmpxchg_wrong_size(void); +extern void __xadd_wrong_size(void); /* * Constants for operation sizes. On 32-bit, the 64-bit size it set to @@ -157,4 +158,46 @@ extern void __cmpxchg_wrong_size(void); __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) #endif +#define __xadd(ptr, inc, lock) \ + ({ \ + __typeof__ (*(ptr)) __ret = (inc); \ + switch (sizeof(*(ptr))) { \ + case __X86_CASE_B: \ + asm volatile (lock "xaddb %b0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_W: \ + asm volatile (lock "xaddw %w0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_L: \ + asm volatile (lock "xaddl %0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_Q: \ + asm volatile (lock "xaddq %q0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + default: \ + __xadd_wrong_size(); \ + } \ + __ret; \ + }) + +/* + * xadd() adds "inc" to "*ptr" and atomically returns the previous + * value of "*ptr". + * + * xadd() is locked when multiple CPUs are online + * xadd_sync() is always locked + * xadd_local() is never locked + */ +#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) +#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") +#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") + #endif /* ASM_X86_CMPXCHG_H */ -- cgit v1.1 From 8b8bc2f7311c3223213dbe346d9cc2e299fdb5eb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 23 Aug 2011 16:59:58 -0700 Subject: x86: Use xadd helper more widely This covers the trivial cases from open-coded xadd to the xadd macros. Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4E5BCC40.3030501@goop.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/atomic.h | 8 ++------ arch/x86/include/asm/atomic64_64.h | 6 +----- arch/x86/include/asm/rwsem.h | 8 +------- arch/x86/include/asm/uv/uv_bau.h | 6 +----- 4 files changed, 5 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 10572e3..58cb6d4 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -172,18 +172,14 @@ static inline int atomic_add_negative(int i, atomic_t *v) */ static inline int atomic_add_return(int i, atomic_t *v) { - int __i; #ifdef CONFIG_M386 + int __i; unsigned long flags; if (unlikely(boot_cpu_data.x86 <= 3)) goto no_xadd; #endif /* Modern 486+ processor */ - __i = i; - asm volatile(LOCK_PREFIX "xaddl %0, %1" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; + return i + xadd(&v->counter, i); #ifdef CONFIG_M386 no_xadd: /* Legacy 386 processor */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 017594d..0e1cbfc 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -170,11 +170,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v) */ static inline long atomic64_add_return(long i, atomic64_t *v) { - long __i = i; - asm volatile(LOCK_PREFIX "xaddq %0, %1;" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; + return i + xadd(&v->counter, i); } static inline long atomic64_sub_return(long i, atomic64_t *v) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index df4cd32..2dbe4a7 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -204,13 +204,7 @@ static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) */ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) { - long tmp = delta; - - asm volatile(LOCK_PREFIX "xadd %0,%1" - : "+r" (tmp), "+m" (sem->count) - : : "memory"); - - return tmp + delta; + return delta + xadd(&sem->count, delta); } #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 37d3698..c568ccc 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -656,11 +656,7 @@ static inline int atomic_read_short(const struct atomic_short *v) */ static inline int atom_asr(short i, struct atomic_short *v) { - short __i = i; - asm volatile(LOCK_PREFIX "xaddw %0, %1" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; + return i + xadd(&v->counter, i); } /* -- cgit v1.1 From 84eb950db13ca40a0572ce9957e14723500943d6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 2 Jul 2010 23:26:36 +0100 Subject: x86, ticketlock: Clean up types and accessors A few cleanups to the way spinlocks are defined and accessed: - define __ticket_t which is the size of a spinlock ticket (ie, enough bits to hold all the cpus) - Define struct arch_spinlock as a union containing plain slock and the head and tail tickets - Use head and tail to implement some of the spinlock predicates. - Make all ticket variables unsigned. - Use TICKET_SHIFT to form constants Most of this will be used in later patches. Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4E5BCC40.3030501@goop.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/spinlock.h | 24 ++++++++++-------------- arch/x86/include/asm/spinlock_types.h | 20 ++++++++++++++++++-- 2 files changed, 28 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index ee67edf..ea2a04f 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -55,11 +55,9 @@ * much between them in performance though, especially as locks are out of line. */ #if (NR_CPUS < 256) -#define TICKET_SHIFT 8 - static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { - short inc = 0x0100; + unsigned short inc = 1 << TICKET_SHIFT; asm volatile ( LOCK_PREFIX "xaddw %w0, %1\n" @@ -78,7 +76,7 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { - int tmp, new; + unsigned int tmp, new; asm volatile("movzwl %2, %0\n\t" "cmpb %h0,%b0\n\t" @@ -103,12 +101,10 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) : "memory", "cc"); } #else -#define TICKET_SHIFT 16 - static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { - int inc = 0x00010000; - int tmp; + unsigned inc = 1 << TICKET_SHIFT; + unsigned tmp; asm volatile(LOCK_PREFIX "xaddl %0, %1\n" "movzwl %w0, %2\n\t" @@ -128,8 +124,8 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { - int tmp; - int new; + unsigned tmp; + unsigned new; asm volatile("movl %2,%0\n\t" "movl %0,%1\n\t" @@ -159,16 +155,16 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { - int tmp = ACCESS_ONCE(lock->slock); + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); + return !!(tmp.tail ^ tmp.head); } static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) { - int tmp = ACCESS_ONCE(lock->slock); + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; + return ((tmp.tail - tmp.head) & TICKET_MASK) > 1; } #ifndef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 7c7a486..1c51bd2 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -5,11 +5,27 @@ # error "please don't include this file directly" #endif +#include + +#if (CONFIG_NR_CPUS < 256) +typedef u8 __ticket_t; +#else +typedef u16 __ticket_t; +#endif + +#define TICKET_SHIFT (sizeof(__ticket_t) * 8) +#define TICKET_MASK ((__ticket_t)((1 << TICKET_SHIFT) - 1)) + typedef struct arch_spinlock { - unsigned int slock; + union { + unsigned int slock; + struct __raw_tickets { + __ticket_t head, tail; + } tickets; + }; } arch_spinlock_t; -#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } +#define __ARCH_SPIN_LOCK_UNLOCKED { { .slock = 0 } } #include -- cgit v1.1 From c576a3ea905c25d50339503e0e5c7fef724e0147 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 3 Jul 2010 01:06:04 +0100 Subject: x86, ticketlock: Convert spin loop to C The inner loop of __ticket_spin_lock isn't doing anything very special, so reimplement it in C. For the 8 bit ticket lock variant, we use a register union to get direct access to the lower and upper bytes in the tickets, but unfortunately gcc won't generate a direct comparison between the two halves of the register, so the generated asm isn't quite as pretty as the hand-coded version. However benchmarking shows that this is actually a small improvement in runtime performance on some benchmarks, and never a slowdown. We also need to make sure there's a barrier at the end of the lock loop to make sure that the compiler doesn't move any instructions from within the locked region into the region where we don't yet own the lock. Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4E5BCC40.3030501@goop.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/spinlock.h | 60 ++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index ea2a04f..5240cde 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -57,21 +57,21 @@ #if (NR_CPUS < 256) static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { - unsigned short inc = 1 << TICKET_SHIFT; - - asm volatile ( - LOCK_PREFIX "xaddw %w0, %1\n" - "1:\t" - "cmpb %h0, %b0\n\t" - "je 2f\n\t" - "rep ; nop\n\t" - "movb %1, %b0\n\t" - /* don't need lfence here, because loads are in-order */ - "jmp 1b\n" - "2:" - : "+Q" (inc), "+m" (lock->slock) - : - : "memory", "cc"); + register union { + struct __raw_tickets tickets; + unsigned short slock; + } inc = { .slock = 1 << TICKET_SHIFT }; + + asm volatile (LOCK_PREFIX "xaddw %w0, %1\n" + : "+Q" (inc), "+m" (lock->slock) : : "memory", "cc"); + + for (;;) { + if (inc.tickets.head == inc.tickets.tail) + break; + cpu_relax(); + inc.tickets.head = ACCESS_ONCE(lock->tickets.head); + } + barrier(); /* make sure nothing creeps before the lock is taken */ } static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) @@ -104,22 +104,22 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { unsigned inc = 1 << TICKET_SHIFT; - unsigned tmp; + __ticket_t tmp; - asm volatile(LOCK_PREFIX "xaddl %0, %1\n" - "movzwl %w0, %2\n\t" - "shrl $16, %0\n\t" - "1:\t" - "cmpl %0, %2\n\t" - "je 2f\n\t" - "rep ; nop\n\t" - "movzwl %1, %2\n\t" - /* don't need lfence here, because loads are in-order */ - "jmp 1b\n" - "2:" - : "+r" (inc), "+m" (lock->slock), "=&r" (tmp) - : - : "memory", "cc"); + asm volatile(LOCK_PREFIX "xaddl %0, %1\n\t" + : "+r" (inc), "+m" (lock->slock) + : : "memory", "cc"); + + tmp = inc; + inc >>= TICKET_SHIFT; + + for (;;) { + if ((__ticket_t)inc == tmp) + break; + cpu_relax(); + tmp = ACCESS_ONCE(lock->tickets.head); + } + barrier(); /* make sure nothing creeps before the lock is taken */ } static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) -- cgit v1.1 From 2994488fe5bb721de1ded53af1a2fc41f47f6ddc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 13 Jul 2010 14:07:45 -0700 Subject: x86, ticketlock: Convert __ticket_spin_lock to use xadd() Convert the two variants of __ticket_spin_lock() to use xadd(), which has the effect of making them identical, so remove the duplicate function. Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4E5BCC40.3030501@goop.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/spinlock.h | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 5240cde..b69e0b4 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -54,26 +54,22 @@ * save some instructions and make the code more elegant. There really isn't * much between them in performance though, especially as locks are out of line. */ -#if (NR_CPUS < 256) static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { - register union { - struct __raw_tickets tickets; - unsigned short slock; - } inc = { .slock = 1 << TICKET_SHIFT }; + register struct __raw_tickets inc = { .tail = 1 }; - asm volatile (LOCK_PREFIX "xaddw %w0, %1\n" - : "+Q" (inc), "+m" (lock->slock) : : "memory", "cc"); + inc = xadd(&lock->tickets, inc); for (;;) { - if (inc.tickets.head == inc.tickets.tail) + if (inc.head == inc.tail) break; cpu_relax(); - inc.tickets.head = ACCESS_ONCE(lock->tickets.head); + inc.head = ACCESS_ONCE(lock->tickets.head); } barrier(); /* make sure nothing creeps before the lock is taken */ } +#if (NR_CPUS < 256) static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { unsigned int tmp, new; @@ -101,27 +97,6 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) : "memory", "cc"); } #else -static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) -{ - unsigned inc = 1 << TICKET_SHIFT; - __ticket_t tmp; - - asm volatile(LOCK_PREFIX "xaddl %0, %1\n\t" - : "+r" (inc), "+m" (lock->slock) - : : "memory", "cc"); - - tmp = inc; - inc >>= TICKET_SHIFT; - - for (;;) { - if ((__ticket_t)inc == tmp) - break; - cpu_relax(); - tmp = ACCESS_ONCE(lock->tickets.head); - } - barrier(); /* make sure nothing creeps before the lock is taken */ -} - static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { unsigned tmp; -- cgit v1.1 From 229855d6f3b40d01a903120c433d75e483a0b06d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 13 Jul 2010 15:14:26 -0700 Subject: x86, ticketlock: Make __ticket_spin_trylock common Make trylock code common regardless of ticket size. (Also, rename arch_spinlock.slock to head_tail.) Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4E5BCC40.3030501@goop.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/spinlock.h | 51 +++++++++-------------------------- arch/x86/include/asm/spinlock_types.h | 6 +++-- 2 files changed, 16 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b69e0b4..f5695ee 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -69,60 +69,33 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) barrier(); /* make sure nothing creeps before the lock is taken */ } -#if (NR_CPUS < 256) static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { - unsigned int tmp, new; - - asm volatile("movzwl %2, %0\n\t" - "cmpb %h0,%b0\n\t" - "leal 0x100(%" REG_PTR_MODE "0), %1\n\t" - "jne 1f\n\t" - LOCK_PREFIX "cmpxchgw %w1,%2\n\t" - "1:" - "sete %b1\n\t" - "movzbl %b1,%0\n\t" - : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) - : - : "memory", "cc"); + arch_spinlock_t old, new; + + old.tickets = ACCESS_ONCE(lock->tickets); + if (old.tickets.head != old.tickets.tail) + return 0; - return tmp; + new.head_tail = old.head_tail + (1 << TICKET_SHIFT); + + /* cmpxchg is a full barrier, so nothing can move before it */ + return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } +#if (NR_CPUS < 256) static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { asm volatile(UNLOCK_LOCK_PREFIX "incb %0" - : "+m" (lock->slock) + : "+m" (lock->head_tail) : : "memory", "cc"); } #else -static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) -{ - unsigned tmp; - unsigned new; - - asm volatile("movl %2,%0\n\t" - "movl %0,%1\n\t" - "roll $16, %0\n\t" - "cmpl %0,%1\n\t" - "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t" - "jne 1f\n\t" - LOCK_PREFIX "cmpxchgl %1,%2\n\t" - "1:" - "sete %b1\n\t" - "movzbl %b1,%0\n\t" - : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) - : - : "memory", "cc"); - - return tmp; -} - static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { asm volatile(UNLOCK_LOCK_PREFIX "incw %0" - : "+m" (lock->slock) + : "+m" (lock->head_tail) : : "memory", "cc"); } diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 1c51bd2..8ebd5df 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -9,8 +9,10 @@ #if (CONFIG_NR_CPUS < 256) typedef u8 __ticket_t; +typedef u16 __ticketpair_t; #else typedef u16 __ticket_t; +typedef u32 __ticketpair_t; #endif #define TICKET_SHIFT (sizeof(__ticket_t) * 8) @@ -18,14 +20,14 @@ typedef u16 __ticket_t; typedef struct arch_spinlock { union { - unsigned int slock; + __ticketpair_t head_tail; struct __raw_tickets { __ticket_t head, tail; } tickets; }; } arch_spinlock_t; -#define __ARCH_SPIN_LOCK_UNLOCKED { { .slock = 0 } } +#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } #include -- cgit v1.1 From 61e2cd0acc248c14793cefd7e23e209be9e0b70d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 29 Aug 2011 14:47:58 -0700 Subject: x86, cmpxchg: Use __compiletime_error() to make usage messages a bit nicer Use __compiletime_error() to produce a compile-time error rather than link-time, where available. Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4E5BCC40.3030501@goop.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cmpxchg.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 0d0d9cd..5d3acdf 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -1,12 +1,19 @@ #ifndef ASM_X86_CMPXCHG_H #define ASM_X86_CMPXCHG_H +#include #include /* Provides LOCK_PREFIX */ -/* Non-existant functions to indicate usage errors at link time. */ -extern void __xchg_wrong_size(void); -extern void __cmpxchg_wrong_size(void); -extern void __xadd_wrong_size(void); +/* + * Non-existant functions to indicate usage errors at link time + * (or compile-time if the compiler implements __compiletime_error(). + */ +extern void __xchg_wrong_size(void) + __compiletime_error("Bad argument size for xchg"); +extern void __cmpxchg_wrong_size(void) + __compiletime_error("Bad argument size for cmpxchg"); +extern void __xadd_wrong_size(void) + __compiletime_error("Bad argument size for xadd"); /* * Constants for operation sizes. On 32-bit, the 64-bit size it set to -- cgit v1.1 From 3b217116edaac634bf31e85c35708298059a8171 Mon Sep 17 00:00:00 2001 From: Duncan Sands Date: Tue, 30 Aug 2011 10:58:22 +0200 Subject: KVM: Fix instruction size issue in pvclock scaling Commit de2d1a524e94 ("KVM: Fix register corruption in pvclock_scale_delta") introduced a mul instruction that may have only a memory operand; the assembler therefore cannot select the correct size: pvclock.s:229: Error: no instruction mnemonic suffix given and no register operands; can't size instruction In this example the assembler is: #APP mul -48(%rbp) ; shrd $32, %rdx, %rax #NO_APP A simple solution is to use mulq. Signed-off-by: Duncan Sands Signed-off-by: Avi Kivity --- arch/x86/include/asm/pvclock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index a518c0a..c59cc97 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -44,7 +44,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); #elif defined(__x86_64__) __asm__ ( - "mul %[mul_frac] ; shrd $32, %[hi], %[lo]" + "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]" : [lo]"=a"(product), [hi]"=d"(tmp) : "0"(delta), -- cgit v1.1 From 20afc60f892d285fde179ead4b24e6a7938c2f1b Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 30 Aug 2011 12:32:36 +0400 Subject: x86, perf: Check that current->mm is alive before getting user callchain An event may occur when an mm is already released. I added an event in dequeue_entity() and caught a panic with the following backtrace: [ 434.421110] BUG: unable to handle kernel NULL pointer dereference at 0000000000000050 [ 434.421258] IP: [] __get_user_pages_fast+0x9c/0x120 ... [ 434.421258] Call Trace: [ 434.421258] [] copy_from_user_nmi+0x51/0xf0 [ 434.421258] [] ? sched_clock_local+0x25/0x90 [ 434.421258] [] perf_callchain_user+0x128/0x170 [ 434.421258] [] ? __perf_event_header__init_id+0xed/0x100 [ 434.421258] [] perf_prepare_sample+0x200/0x280 [ 434.421258] [] __perf_event_overflow+0x1b8/0x290 [ 434.421258] [] ? tg_shares_up+0x0/0x670 [ 434.421258] [] ? walk_tg_tree+0x6a/0xb0 [ 434.421258] [] perf_swevent_overflow+0xc4/0xf0 [ 434.421258] [] do_perf_sw_event+0x1e0/0x250 [ 434.421258] [] perf_tp_event+0x44/0x70 [ 434.421258] [] ftrace_profile_sched_block+0xdf/0x110 [ 434.421258] [] dequeue_entity+0x2ad/0x2d0 [ 434.421258] [] dequeue_task_fair+0x1c/0x60 [ 434.421258] [] dequeue_task+0x9a/0xb0 [ 434.421258] [] deactivate_task+0x42/0xe0 [ 434.421258] [] thread_return+0x191/0x808 [ 434.421258] [] ? switch_task_namespaces+0x24/0x60 [ 434.421258] [] do_exit+0x464/0x910 [ 434.421258] [] do_group_exit+0x58/0xd0 [ 434.421258] [] sys_exit_group+0x17/0x20 [ 434.421258] [] system_call_fastpath+0x16/0x1b Signed-off-by: Andrey Vagin Signed-off-by: Peter Zijlstra Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1314693156-24131-1-git-send-email-avagin@openvz.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4ee3abf..cfa62ec 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1900,6 +1900,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); + if (!current->mm) + return; + if (perf_callchain_user32(regs, entry)) return; -- cgit v1.1 From d312ae878b6aed3912e1acaaf5d0b2a9d08a4f11 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 19 Aug 2011 15:57:16 +0100 Subject: xen: use maximum reservation to limit amount of usable RAM Use the domain's maximum reservation to limit the amount of extra RAM for the memory balloon. This reduces the size of the pages tables and the amount of reserved low memory (which defaults to about 1/32 of the total RAM). On a system with 8 GiB of RAM with the domain limited to 1 GiB the kernel reports: Before: Memory: 627792k/4472000k available After: Memory: 549740k/11132224k available A increase of about 76 MiB (~1.5% of the unused 7 GiB). The reserved low memory is also reduced from 253 MiB to 32 MiB. The total additional usable RAM is 329 MiB. For dom0, this requires at patch to Xen ('x86: use 'dom0_mem' to limit the number of pages for dom0') (c/s 23790) CC: stable@kernel.org Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 02ffd9e..ff3dfa1 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -183,6 +183,19 @@ static unsigned long __init xen_set_identity(const struct e820entry *list, PFN_UP(start_pci), PFN_DOWN(last)); return identity; } + +static unsigned long __init xen_get_max_pages(void) +{ + unsigned long max_pages = MAX_DOMAIN_PAGES; + domid_t domid = DOMID_SELF; + int ret; + + ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); + if (ret > 0) + max_pages = ret; + return min(max_pages, MAX_DOMAIN_PAGES); +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ @@ -291,6 +304,12 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + extra_limit = xen_get_max_pages(); + if (extra_limit >= max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); /* -- cgit v1.1 From d198d499148a0c64a41b3aba9e7dd43772832b91 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 1 Sep 2011 13:46:55 +0200 Subject: xen: x86_32: do not enable iterrupts when returning from exception in interrupt context If vmalloc page_fault happens inside of interrupt handler with interrupts disabled then on exit path from exception handler when there is no pending interrupts, the following code (arch/x86/xen/xen-asm_32.S:112): cmpw $0x0001, XEN_vcpu_info_pending(%eax) sete XEN_vcpu_info_mask(%eax) will enable interrupts even if they has been previously disabled according to eflags from the bounce frame (arch/x86/xen/xen-asm_32.S:99) testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) setz XEN_vcpu_info_mask(%eax) Solution is in setting XEN_vcpu_info_mask only when it should be set according to cmpw $0x0001, XEN_vcpu_info_pending(%eax) but not clearing it if there isn't any pending events. Reproducer for bug is attached to RHBZ 707552 CC: stable@kernel.org Signed-off-by: Igor Mammedov Acked-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/xen-asm_32.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 22a2093..b040b0e 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -113,11 +113,13 @@ xen_iret_start_crit: /* * If there's something pending, mask events again so we can - * jump back into xen_hypervisor_callback + * jump back into xen_hypervisor_callback. Otherwise do not + * touch XEN_vcpu_info_mask. */ - sete XEN_vcpu_info_mask(%eax) + jne 1f + movb $1, XEN_vcpu_info_mask(%eax) - popl %eax +1: popl %eax /* * From this point on the registers are restored and the stack -- cgit v1.1 From ed467e69f16e6b480e2face7bc5963834d025f91 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 1 Sep 2011 09:48:27 -0400 Subject: xen/smp: Warn user why they keel over - nosmp or noapic and what to use instead. We have hit a couple of customer bugs where they would like to use those parameters to run an UP kernel - but both of those options turn of important sources of interrupt information so we end up not being able to boot. The correct way is to pass in 'dom0_max_vcpus=1' on the Xen hypervisor line and the kernel will patch itself to be a UP kernel. Fixes bug: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=637308 CC: stable@kernel.org Acked-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index e79dbb9..d4fc6d4 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -32,6 +32,7 @@ #include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -207,6 +208,15 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) unsigned cpu; unsigned int i; + if (skip_ioapic_setup) { + char *m = (max_cpus == 0) ? + "The nosmp parameter is incompatible with Xen; " \ + "use Xen dom0_max_vcpus=1 parameter" : + "The noapic parameter is incompatible with Xen"; + + xen_raw_printk(m); + panic(m); + } xen_init_lock_cpu(0); smp_store_cpu_info(0); -- cgit v1.1 From 6f4151c89b7d036c755d8cf74729e09b76fa6676 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 7 Sep 2011 15:25:10 -0700 Subject: x86: Hyper-V: Integrate the clocksource with Hyper-V detection code The Hyper-V clocksource driver is best integrated with Hyper-V detection code since: (a) Linux guests running on Hyper-V require it (b) Integration into that code significanly reduces code size Signed-off-by: K. Y. Srinivasan Signed-off-by: Haiyang Zhang Cc: gregkh@suse.de Cc: devel@linuxdriverproject.org Cc: virtualization@lists.osdl.org Link: http://lkml.kernel.org/r/1315434310-4827-1-git-send-email-kys@microsoft.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mshyperv.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index d944bf6..0a630dd 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -11,6 +11,8 @@ */ #include +#include +#include #include #include #include @@ -36,6 +38,25 @@ static bool __init ms_hyperv_platform(void) !memcmp("Microsoft Hv", hyp_signature, 12); } +static cycle_t read_hv_clock(struct clocksource *arg) +{ + cycle_t current_tick; + /* + * Read the partition counter to get the current tick count. This count + * is set to 0 when the partition is created and is incremented in + * 100 nanosecond units. + */ + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; +} + +static struct clocksource hyperv_cs = { + .name = "hyperv_clocksource", + .rating = 400, /* use this when running on Hyperv*/ + .read = read_hv_clock, + .mask = CLOCKSOURCE_MASK(64), +}; + static void __init ms_hyperv_init_platform(void) { /* @@ -46,6 +67,8 @@ static void __init ms_hyperv_init_platform(void) printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + + clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { -- cgit v1.1 From d1748302f70be7469809809283fe164156a34231 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 23 Aug 2011 15:29:42 +0200 Subject: clockevents: Make minimum delay adjustments configurable The automatic increase of the min_delta_ns of a clockevents device should be done in the clockevents code as the minimum delay is an attribute of the clockevents device. In addition not all architectures want the automatic adjustment, on a massively virtualized system it can happen that the programming of a clock event fails several times in a row because the virtual cpu has been rescheduled quickly enough. In that case the minimum delay will erroneously be increased with no way back. The new config symbol GENERIC_CLOCKEVENTS_MIN_ADJUST is used to enable the automatic adjustment. The config option is selected only for x86. Signed-off-by: Martin Schwidefsky Cc: john stultz Link: http://lkml.kernel.org/r/20110823133142.494157493@de.ibm.com Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a47bb2..a1609cd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -68,6 +68,7 @@ config X86 select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW + select GENERIC_CLOCKEVENTS_MIN_ADJUST select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if (X86_64 && NET) -- cgit v1.1 From f10cd522c5fbfec9ae3cc01967868c9c2401ed23 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 6 Sep 2011 17:41:47 +0100 Subject: xen: disable PV spinlocks on HVM PV spinlocks cannot possibly work with the current code because they are enabled after pvops patching has already been done, and because PV spinlocks use a different data structure than native spinlocks so we cannot switch between them dynamically. A spinlock that has been taken once by the native code (__ticket_spin_lock) cannot be taken by __xen_spin_lock even after it has been released. Reported-and-Tested-by: Stefan Bader Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index d4fc6d4..041d4fe 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -532,7 +532,6 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) WARN_ON(xen_smp_intr_init(0)); xen_init_lock_cpu(0); - xen_init_spinlocks(); } static int __cpuinit xen_hvm_cpu_up(unsigned int cpu) -- cgit v1.1 From 5307f6d5fb12fd01f9f321bc4a8fd77e74858647 Mon Sep 17 00:00:00 2001 From: Shyam Iyer Date: Thu, 8 Sep 2011 16:41:17 -0500 Subject: Fix pointer dereference before call to pcie_bus_configure_settings Commit b03e7495a862 ("PCI: Set PCI-E Max Payload Size on fabric") introduced a potential NULL pointer dereference in calls to pcie_bus_configure_settings due to attempts to access pci_bus self variables when the self pointer is NULL. To correct this, verify that the self pointer in pci_bus is non-NULL before dereferencing it. Reported-by: Stanislaw Gruszka Signed-off-by: Shyam Iyer Signed-off-by: Jon Mason Acked-by: Jesse Barnes Signed-off-by: Linus Torvalds --- arch/x86/pci/acpi.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index c953302..039d913 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -365,8 +365,13 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) */ if (bus) { struct pci_bus *child; - list_for_each_entry(child, &bus->children, node) - pcie_bus_configure_settings(child, child->self->pcie_mpss); + list_for_each_entry(child, &bus->children, node) { + struct pci_dev *self = child->self; + if (!self) + continue; + + pcie_bus_configure_settings(child, self->pcie_mpss); + } } if (!bus) -- cgit v1.1 From 05b217b021e003d60471eb419d0ceed84d06c5db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 24 Jul 2011 09:46:08 +0000 Subject: x86: cache_info: Remove bogus free of amd_l3_cache data free_cache_attributes() kfree's: per_cpu(ici_cpuid4_info, cpu)->l3 which is a pointer to memory which was allocated as a block in amd_init_l3_cache(). l3 of a particular cpu points to a part of this memory blob. The part and the rest of the blob are still referenced by other cpus. As far as I can tell from the git history this is a leftover from the conversion from per cpu to node data with commit ba06edb63(x86, cacheinfo: Make L3 cache info per node) and the following commit f658bcfb2(x86, cacheinfo: Cleanup L3 cache index disable support) Signed-off-by: Thomas Gleixner Cc: Hans Rosenfeld Cc: Borislav Petkov Cc: Andreas Herrmann Cc: Mike Travis Link: http://lkml.kernel.org/r/20110723212626.550539989@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c105c53..aa27c38 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -820,7 +820,6 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) for (i = 0; i < num_cache_leaves; i++) cache_remove_shared_cpu_map(cpu, i); - kfree(per_cpu(ici_cpuid4_info, cpu)->l3); kfree(per_cpu(ici_cpuid4_info, cpu)); per_cpu(ici_cpuid4_info, cpu) = NULL; } -- cgit v1.1 From b7d11a768b061c307aaaa6242f83da2d2388c756 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 24 Jul 2011 09:46:08 +0000 Subject: x86: cache_info: Kill the moronic shadow struct Commit f9b90566c ("x86: reduce stack usage in init_intel_cacheinfo") introduced a shadow structure to reduce the stack usage on large machines instead of making the smaller structure embedded into the large one. That's definitely a candidate for the bad taste award. Move the small struct into the large one and get rid of the ugly type casts. Signed-off-by: Thomas Gleixner Cc: Hans Rosenfeld Cc: Borislav Petkov Cc: Andreas Herrmann Cc: Mike Travis Link: http://lkml.kernel.org/r/20110723212626.625651773@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 60 +++++++++++++---------------------- 1 file changed, 22 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index aa27c38..311322b 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -157,22 +157,17 @@ struct amd_l3_cache { u8 subcaches[4]; }; -struct _cpuid4_info { +struct _cpuid4_info_regs { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; struct amd_l3_cache *l3; - DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; -/* subset of above _cpuid4_info w/o shared_cpu_map */ -struct _cpuid4_info_regs { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned long size; - struct amd_l3_cache *l3; +struct _cpuid4_info { + struct _cpuid4_info_regs base; + DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; unsigned short num_cache_leaves; @@ -387,11 +382,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, { int index; - if (!this_leaf->l3 || - !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + if (!this_leaf->base.l3 || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; - index = amd_get_l3_disable_slot(this_leaf->l3, slot); + index = amd_get_l3_disable_slot(this_leaf->base.l3, slot); if (index >= 0) return sprintf(buf, "%d\n", index); @@ -480,8 +474,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || - !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + if (!this_leaf->base.l3 || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -489,7 +482,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + err = amd_set_l3_disable_slot(this_leaf->base.l3, cpu, slot, val); if (err) { if (err == -EEXIST) printk(KERN_WARNING "L3 disable slot %d in use!\n", @@ -518,7 +511,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, static ssize_t show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + if (!this_leaf->base.l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); @@ -533,7 +526,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + if (!this_leaf->base.l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; if (strict_strtoul(buf, 16, &val) < 0) @@ -769,7 +762,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) return; } this_leaf = CPUID4_INFO_IDX(cpu, index); - num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; + num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; if (num_threads_sharing == 1) cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); @@ -824,24 +817,15 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) per_cpu(ici_cpuid4_info, cpu) = NULL; } -static int -__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) -{ - struct _cpuid4_info_regs *leaf_regs = - (struct _cpuid4_info_regs *)this_leaf; - - return cpuid4_cache_lookup_regs(index, leaf_regs); -} - static void __cpuinit get_cpu_leaves(void *_retval) { int j, *retval = _retval, cpu = smp_processor_id(); /* Do cpuid and store the results */ for (j = 0; j < num_cache_leaves; j++) { - struct _cpuid4_info *this_leaf; - this_leaf = CPUID4_INFO_IDX(cpu, j); - *retval = cpuid4_cache_lookup(j, this_leaf); + struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j); + + *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); if (unlikely(*retval < 0)) { int i; @@ -899,16 +883,16 @@ static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } -show_one_plus(level, eax.split.level, 0); -show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1); -show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1); -show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); -show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); +show_one_plus(level, base.eax.split.level, 0); +show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1); +show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1); +show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1); +show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1); static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - return sprintf(buf, "%luK\n", this_leaf->size / 1024); + return sprintf(buf, "%luK\n", this_leaf->base.size / 1024); } static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, @@ -945,7 +929,7 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - switch (this_leaf->eax.split.type) { + switch (this_leaf->base.eax.split.type) { case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); case CACHE_TYPE_INST: @@ -1134,7 +1118,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) ktype_cache.default_attrs = default_attrs; #ifdef CONFIG_AMD_NB - if (this_leaf->l3) + if (this_leaf->base.l3) ktype_cache.default_attrs = amd_l3_attrs(); #endif retval = kobject_init_and_add(&(this_object->kobj), -- cgit v1.1 From d2946041ff3cbeb0e59db601044025093579bc23 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 24 Jul 2011 09:46:09 +0000 Subject: x86: cache_info: Kill the atomic allocation in amd_init_l3_cache() It's not a good reason to allocate memory in the smp function call just because someone thought it's the most conveniant place. The AMD L3 data is coupled to the northbridge info by a pointer to the corresponding north bridge data. So allocating it with the northbridge data and referencing the northbridge in the cache_info code instead uses less memory and gets rid of that atomic allocation hack in the smp function call. Signed-off-by: Thomas Gleixner Tested-by: Borislav Petkov Cc: Hans Rosenfeld Cc: Andreas Herrmann Cc: Mike Travis Link: http://lkml.kernel.org/r/20110723212626.688229918@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/amd_nb.h | 6 +++ arch/x86/kernel/cpu/intel_cacheinfo.c | 74 ++++++++++++----------------------- 2 files changed, 32 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 67f87f2..8e41071 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -19,9 +19,15 @@ extern int amd_numa_init(void); extern int amd_get_subcaches(int); extern int amd_set_subcaches(int, int); +struct amd_l3_cache { + unsigned indices; + u8 subcaches[4]; +}; + struct amd_northbridge { struct pci_dev *misc; struct pci_dev *link; + struct amd_l3_cache l3_cache; }; struct amd_northbridge_info { diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 311322b..951820f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -151,18 +151,12 @@ union _cpuid4_leaf_ecx { u32 full; }; -struct amd_l3_cache { - struct amd_northbridge *nb; - unsigned indices; - u8 subcaches[4]; -}; - struct _cpuid4_info_regs { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - struct amd_l3_cache *l3; + struct amd_northbridge *nb; }; struct _cpuid4_info { @@ -309,12 +303,13 @@ struct _cache_attr { /* * L3 cache descriptors */ -static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) +static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) { + struct amd_l3_cache *l3 = &nb->l3_cache; unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(l3->nb->misc, 0x1C4, &val); + pci_read_config_dword(nb->misc, 0x1C4, &val); /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); @@ -328,33 +323,16 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) { - static struct amd_l3_cache *__cpuinitdata l3_caches; int node; /* only for L3, and not in virtualized environments */ - if (index < 3 || amd_nb_num() == 0) + if (index < 3) return; - /* - * Strictly speaking, the amount in @size below is leaked since it is - * never freed but this is done only on shutdown so it doesn't matter. - */ - if (!l3_caches) { - int size = amd_nb_num() * sizeof(struct amd_l3_cache); - - l3_caches = kzalloc(size, GFP_ATOMIC); - if (!l3_caches) - return; - } - node = amd_get_nb_id(smp_processor_id()); - - if (!l3_caches[node].nb) { - l3_caches[node].nb = node_to_amd_nb(node); - amd_calc_l3_indices(&l3_caches[node]); - } - - this_leaf->l3 = &l3_caches[node]; + this_leaf->nb = node_to_amd_nb(node); + if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) + amd_calc_l3_indices(this_leaf->nb); } /* @@ -364,11 +342,11 @@ static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, * * @returns: the disabled index if used or negative value if slot free. */ -int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) +int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) { unsigned int reg = 0; - pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®); + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); /* check whether this slot is activated already */ if (reg & (3UL << 30)) @@ -382,10 +360,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, { int index; - if (!this_leaf->base.l3 || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; - index = amd_get_l3_disable_slot(this_leaf->base.l3, slot); + index = amd_get_l3_disable_slot(this_leaf->base.nb, slot); if (index >= 0) return sprintf(buf, "%d\n", index); @@ -402,7 +380,7 @@ show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ SHOW_CACHE_DISABLE(0) SHOW_CACHE_DISABLE(1) -static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, unsigned slot, unsigned long idx) { int i; @@ -415,10 +393,10 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, for (i = 0; i < 4; i++) { u32 reg = idx | (i << 20); - if (!l3->subcaches[i]) + if (!nb->l3_cache.subcaches[i]) continue; - pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); /* * We need to WBINVD on a core on the node containing the L3 @@ -428,7 +406,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, wbinvd_on_cpu(cpu); reg |= BIT(31); - pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); } } @@ -442,24 +420,24 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, * * @return: 0 on success, error status on failure */ -int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, +int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, unsigned long index) { int ret = 0; /* check if @slot is already used or the index is already disabled */ - ret = amd_get_l3_disable_slot(l3, slot); + ret = amd_get_l3_disable_slot(nb, slot); if (ret >= 0) return -EINVAL; - if (index > l3->indices) + if (index > nb->l3_cache.indices) return -EINVAL; /* check whether the other slot has disabled the same index already */ - if (index == amd_get_l3_disable_slot(l3, !slot)) + if (index == amd_get_l3_disable_slot(nb, !slot)) return -EINVAL; - amd_l3_disable_index(l3, cpu, slot, index); + amd_l3_disable_index(nb, cpu, slot, index); return 0; } @@ -474,7 +452,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->base.l3 || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -482,7 +460,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - err = amd_set_l3_disable_slot(this_leaf->base.l3, cpu, slot, val); + err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); if (err) { if (err == -EEXIST) printk(KERN_WARNING "L3 disable slot %d in use!\n", @@ -511,7 +489,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, static ssize_t show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - if (!this_leaf->base.l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); @@ -526,7 +504,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->base.l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; if (strict_strtoul(buf, 16, &val) < 0) @@ -1118,7 +1096,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) ktype_cache.default_attrs = default_attrs; #ifdef CONFIG_AMD_NB - if (this_leaf->base.l3) + if (this_leaf->base.nb) ktype_cache.default_attrs = amd_l3_attrs(); #endif retval = kobject_init_and_add(&(this_object->kobj), -- cgit v1.1 From 77e75fc764baf65394f0f1a934ae1cb4e575d48d Mon Sep 17 00:00:00 2001 From: Frank Arnold Date: Wed, 18 May 2011 11:32:10 +0200 Subject: x86: cache_info: Update calculation of AMD L3 cache indices L3 subcaches 0 and 1 of AMD Family 15h CPUs can have a size of 2MB. Update the calculation routine for the number of L3 indices to reflect that. Signed-off-by: Frank Arnold Cc: Borislav Petkov Cc: Rosenfeld Hans Cc: Herrmann3 Andreas Cc: Mike Travis Cc: Frank Arnold Link: http://lkml.kernel.org/r/20110726170449.GB32536@aftab Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 951820f..a3b0811 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -314,6 +314,12 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); -- cgit v1.1 From 2d21a29fb62f142b8a62496700d8d82a6a8fd783 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 16:18:34 +0200 Subject: locking, oprofile: Annotate oprofilefs lock as raw The oprofilefs_lock can be taken in atomic context (in profiling interrupts) and therefore cannot cannot be preempted on -rt - annotate it. In mainline this change documents the low level nature of the lock - otherwise there's no functional difference. Lockdep and Sparse checking will work as usual. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 68894fd..96646b3 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -355,10 +355,10 @@ static void nmi_cpu_setup(void *dummy) int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); nmi_cpu_save_registers(msrs); - spin_lock(&oprofilefs_lock); + raw_spin_lock(&oprofilefs_lock); model->setup_ctrs(model, msrs); nmi_cpu_setup_mux(cpu, msrs); - spin_unlock(&oprofilefs_lock); + raw_spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); } -- cgit v1.1 From 59d958d2c7de20409a0dc202adc87d3973ada13d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jul 2010 14:28:02 +0200 Subject: locking, x86: mce: Annotate cmci_discover_lock as raw The cmci_discover_lock can be taken in atomic context (cpu bring up sequence) and therefore cannot be preempted on -rt. In mainline this change documents the low level nature of the lock - otherwise there's no functional difference. Lockdep and Sparse checking will work as usual. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 8694ef56..38e49bc9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -28,7 +28,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ -static DEFINE_SPINLOCK(cmci_discover_lock); +static DEFINE_RAW_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 @@ -85,7 +85,7 @@ static void cmci_discover(int banks, int boot) int hdr = 0; int i; - spin_lock_irqsave(&cmci_discover_lock, flags); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; @@ -116,7 +116,7 @@ static void cmci_discover(int banks, int boot) WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); if (hdr) printk(KERN_CONT "\n"); } @@ -150,7 +150,7 @@ void cmci_clear(void) if (!cmci_supported(&banks)) return; - spin_lock_irqsave(&cmci_discover_lock, flags); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { if (!test_bit(i, __get_cpu_var(mce_banks_owned))) continue; @@ -160,7 +160,7 @@ void cmci_clear(void) wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } /* -- cgit v1.1 From e3b73c4a25e9a5705b4ef28b91676caf01f9bc9f Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 13 Sep 2011 10:17:32 -0400 Subject: xen/e820: if there is no dom0_mem=, don't tweak extra_pages. The patch "xen: use maximum reservation to limit amount of usable RAM" (d312ae878b6aed3912e1acaaf5d0b2a9d08a4f11) breaks machines that do not use 'dom0_mem=' argument with: reserve RAM buffer: 000000133f2e2000 - 000000133fffffff (XEN) mm.c:4976:d0 Global bit is set to kernel page fffff8117e (XEN) domain_crash_sync called from entry.S (XEN) Domain 0 (vcpu#0) crashed on cpu#0: ... The reason being that the last E820 entry is created using the 'extra_pages' (which is based on how many pages have been freed). The mentioned git commit sets the initial value of 'extra_pages' using a hypercall which returns the number of pages (if dom0_mem has been used) or -1 otherwise. If the later we return with MAX_DOMAIN_PAGES as basis for calculation: return min(max_pages, MAX_DOMAIN_PAGES); and use it: extra_limit = xen_get_max_pages(); if (extra_limit >= max_pfn) extra_pages = extra_limit - max_pfn; else extra_pages = 0; which means we end up with extra_pages = 128GB in PFNs (33554432) - 8GB in PFNs (2097152, on this specific box, can be larger or smaller), and then we add that value to the E820 making it: Xen: 00000000ff000000 - 0000000100000000 (reserved) Xen: 0000000100000000 - 000000133f2e2000 (usable) which is clearly wrong. It should look as so: Xen: 00000000ff000000 - 0000000100000000 (reserved) Xen: 0000000100000000 - 000000027fbda000 (usable) Naturally this problem does not present itself if dom0_mem=max:X is used. CC: stable@kernel.org Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ff3dfa1..09688eb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -305,10 +305,12 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); extra_limit = xen_get_max_pages(); - if (extra_limit >= max_pfn) - extra_pages = extra_limit - max_pfn; - else - extra_pages = 0; + if (max_pfn + extra_pages > extra_limit) { + if (extra_limit > max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + } extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); -- cgit v1.1 From 9aaef96f61d93062556d34e15731f7d5869dd82e Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Fri, 17 Jun 2011 04:40:36 -0400 Subject: x86, mce: Do not call del_timer_sync() in IRQ context del_timer_sync() can cause a deadlock when called in interrupt context. It is used with on_each_cpu() in some parts for sysfs files like bank*, check_interval, cmci_disabled and ignore_ce. However, use of on_each_cpu() results in calling the function passed as the argument in interrupt context. This causes a flood of nested warnings from del_timer_sync() (it runs on each CPU) caused even by a simple file access like: $ echo 300 > /sys/devices/system/machinecheck/machinecheck0/check_interval Fortunately, these MCE-specific files are rarely used and AFAIK only few MCE geeks experience this warning. To remove the warning, move timer deletion outside of the interrupt context. Signed-off-by: Hidetoshi Seto Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 08363b0..5b5ccee 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1140,6 +1140,15 @@ static void mce_start_timer(unsigned long data) add_timer_on(t, smp_processor_id()); } +/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +static void mce_timer_delete_all(void) +{ + int cpu; + + for_each_online_cpu(cpu) + del_timer_sync(&per_cpu(mce_timer, cpu)); +} + static void mce_do_trigger(struct work_struct *work) { call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); @@ -1750,7 +1759,6 @@ static struct syscore_ops mce_syscore_ops = { static void mce_cpu_restart(void *data) { - del_timer_sync(&__get_cpu_var(mce_timer)); if (!mce_available(__this_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); @@ -1760,16 +1768,15 @@ static void mce_cpu_restart(void *data) /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { + mce_timer_delete_all(); on_each_cpu(mce_cpu_restart, NULL, 1); } /* Toggle features for corrected errors */ -static void mce_disable_ce(void *all) +static void mce_disable_cmci(void *data) { if (!mce_available(__this_cpu_ptr(&cpu_info))) return; - if (all) - del_timer_sync(&__get_cpu_var(mce_timer)); cmci_clear(); } @@ -1852,7 +1859,8 @@ static ssize_t set_ignore_ce(struct sys_device *s, if (mce_ignore_ce ^ !!new) { if (new) { /* disable ce features */ - on_each_cpu(mce_disable_ce, (void *)1, 1); + mce_timer_delete_all(); + on_each_cpu(mce_disable_cmci, NULL, 1); mce_ignore_ce = 1; } else { /* enable ce features */ @@ -1875,7 +1883,7 @@ static ssize_t set_cmci_disabled(struct sys_device *s, if (mce_cmci_disabled ^ !!new) { if (new) { /* disable cmci */ - on_each_cpu(mce_disable_ce, NULL, 1); + on_each_cpu(mce_disable_cmci, NULL, 1); mce_cmci_disabled = 1; } else { /* enable cmci */ -- cgit v1.1 From 61cca2fab7ecba18f9b9680cd736ef5fa82ad3b1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 15 Sep 2011 08:52:40 +0100 Subject: xen/i386: follow-up to "replace order-based range checking of M2P table by linear one" The numbers obtained from the hypervisor really can't ever lead to an overflow here, only the original calculation going through the order of the range could have. This avoids the (as Jeremy points outs) somewhat ugly NULL-based calculation here. Signed-off-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 24abc1f..a3872f7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1721,10 +1721,8 @@ void __init xen_setup_machphys_mapping(void) machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; } #ifdef CONFIG_X86_32 - if ((machine_to_phys_mapping + machine_to_phys_nr) - < machine_to_phys_mapping) - machine_to_phys_nr = (unsigned long *)NULL - - machine_to_phys_mapping; + WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) + < machine_to_phys_mapping); #endif } -- cgit v1.1 From ea70ef3d9db1cbfd439f3e3f124c28ef4fe5835d Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 28 Apr 2011 14:32:08 +0530 Subject: sched: x86_32 Fix typo in switch_to() description This patch fixes the typo in parameters passed to x86_32 switch_to() description. Signed-off-by: Kamalesh Babulal Signed-off-by: Jiri Kosina --- arch/x86/kernel/process_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a3d0dc5..a069c0c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(start_thread); /* - * switch_to(x,yn) should switch tasks from x to y. + * switch_to(x,y) should switch tasks from x to y. * * We fsave/fwait so that an exception goes off at the right time * (as a call from the fsave or fwait in effect) rather than to -- cgit v1.1 From 469bfb4a50954c7c1ec506e5399fdb1dbf230650 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 1 Aug 2011 23:07:51 +0200 Subject: Remove unneeded version.h include from arch/x86/ It was pointed out by 'make versioncheck' that the include of linux/version.h is not needed in arch/x86/mm/mmio-mod.c . This patch removes it. Signed-off-by: Jesper Juhl Signed-off-by: Jiri Kosina --- arch/x86/mm/mmio-mod.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 3adff7d..c83c3d0 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include -- cgit v1.1 From a7f934d4f16144cb9521b62e9b8c9ac0118097da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 15 Sep 2011 13:28:33 -0700 Subject: asm alternatives: remove incorrect alignment notes On x86-64, they were just wasteful: with the explicitly added (now unnecessary) padding, the size of the alternatives structure was 16 bytes, and an alignment of 8 bytes didn't hurt much. However, it was still silly, since the natural size and alignment for the structure is actually just 12 bytes, 4-byte aligned since commit 59e97e4d6fbc ("x86: Make alternative instruction pointers relative"). So removing the padding, and removing the extra alignment is just a good idea. On x86-32, the alignment of 4 bytes was correct, but was incorrectly hardcoded as 8 bytes in . That header file had used to be an x86-64 only header file, but various unification efforts have made it be used for x86-32 too (ie the unification of rwlock and rwsem). That in turn caused x86-32 boot failures, because the extra alignment would result in random zero-filled words in the altinstructions section, causing oopses early at boot when doing alternative instruction replacement. So just remove all the alignment noise entirely. It's wrong, and it's unnecessary. The section itself is already properly aligned by the linker scripts, and all additions to the section had better be of the proper 12-byte format, keeping it aligned. So if the align directive were to ever make a difference, that would be an indication of a serious bug to begin with. Reported-by: Werner Landgraf Acked-by: Andrew Lutomirski Signed-off-by: Linus Torvalds --- arch/x86/include/asm/alternative-asm.h | 1 - arch/x86/include/asm/alternative.h | 4 ---- arch/x86/include/asm/cpufeature.h | 2 -- 3 files changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 4554cc6..091508b 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -16,7 +16,6 @@ #endif .macro altinstruction_entry orig alt feature orig_len alt_len - .align 8 .long \orig - . .long \alt - . .word \feature diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 23fb6d7..37ad100 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -48,9 +48,6 @@ struct alt_instr { u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction, <= instrlen */ -#ifdef CONFIG_X86_64 - u32 pad2; -#endif }; extern void alternative_instructions(void); @@ -83,7 +80,6 @@ static inline int alternatives_text_reserved(void *start, void *end) \ "661:\n\t" oldinstr "\n662:\n" \ ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ " .long 661b - .\n" /* label */ \ " .long 663f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 4258aac..88b23a4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -332,7 +332,6 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) asm goto("1: jmp %l[t_no]\n" "2:\n" ".section .altinstructions,\"a\"\n" - _ASM_ALIGN "\n" " .long 1b - .\n" " .long 0\n" /* no replacement */ " .word %P0\n" /* feature bit */ @@ -350,7 +349,6 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) asm volatile("1: movb $0,%0\n" "2:\n" ".section .altinstructions,\"a\"\n" - _ASM_ALIGN "\n" " .long 1b - .\n" " .long 3f - .\n" " .word %P1\n" /* feature bit */ -- cgit v1.1 From 41750d31fc9599fd81763e685a6b7b42d298c4f8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Aug 2011 17:05:18 -0700 Subject: x86, x2apic: Enable the bios request for x2apic optout On the platforms which are x2apic and interrupt-remapping capable, Linux kernel is enabling x2apic even if the BIOS doesn't. This is to take advantage of the features that x2apic brings in. Some of the OEM platforms are running into issues because of this, as their bios is not x2apic aware. For example, this was resulting in interrupt migration issues on one of the platforms. Also if the BIOS SMI handling uses APIC interface to send SMI's, then the BIOS need to be aware of x2apic mode that OS has enabled. On some of these platforms, BIOS doesn't have a HW mechanism to turnoff the x2apic feature to prevent OS from enabling it. To resolve this mess, recent changes to the VT-d2 specification: http://download.intel.com/technology/computing/vptech/Intel(r)_VT_for_Direct_IO.pdf includes a mechanism that provides BIOS a way to request system software to opt out of enabling x2apic mode. Look at the x2apic optout flag in the DMAR tables before enabling the x2apic mode in the platform. Also print a warning that we have disabled x2apic based on the BIOS request. Kernel boot parameter "intremap=no_x2apic_optout" can be used to override the BIOS x2apic optout request. Signed-off-by: Youquan Song Signed-off-by: Suresh Siddha Cc: yinghai@kernel.org Cc: joerg.roedel@amd.com Cc: tony.luck@intel.com Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/20110824001456.171766616@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 52fa563..6b9874a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1440,24 +1440,18 @@ int __init enable_IR(void) #ifdef CONFIG_INTR_REMAP if (!intr_remapping_supported()) { pr_debug("intr-remapping not supported\n"); - return 0; + return -1; } if (!x2apic_preenabled && skip_ioapic_setup) { pr_info("Skipped enabling intr-remap because of skipping " "io-apic setup\n"); - return 0; + return -1; } - if (enable_intr_remapping(x2apic_supported())) - return 0; - - pr_info("Enabled Interrupt-remapping\n"); - - return 1; - + return enable_intr_remapping(); #endif - return 0; + return -1; } void __init enable_IR_x2apic(void) @@ -1481,11 +1475,11 @@ void __init enable_IR_x2apic(void) mask_ioapic_entries(); if (dmar_table_init_ret) - ret = 0; + ret = -1; else ret = enable_IR(); - if (!ret) { + if (ret < 0) { /* IR is required if there is APIC ID > 255 even when running * under KVM */ @@ -1499,6 +1493,9 @@ void __init enable_IR_x2apic(void) x2apic_force_phys(); } + if (ret == IRQ_REMAP_XAPIC_MODE) + goto nox2apic; + x2apic_enabled = 1; if (x2apic_supported() && !x2apic_mode) { @@ -1508,19 +1505,21 @@ void __init enable_IR_x2apic(void) } nox2apic: - if (!ret) /* IR enabling failed */ + if (ret < 0) /* IR enabling failed */ restore_ioapic_entries(); legacy_pic->restore_mask(); local_irq_restore(flags); out: - if (x2apic_enabled) + if (x2apic_enabled || !x2apic_supported()) return; if (x2apic_preenabled) panic("x2apic: enabled by BIOS but kernel init failed."); - else if (cpu_has_x2apic) - pr_info("Not enabling x2apic, Intr-remapping init failed.\n"); + else if (ret == IRQ_REMAP_XAPIC_MODE) + pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); + else if (ret < 0) + pr_info("x2apic not enabled, IRQ remapping init failed\n"); } #ifdef CONFIG_X86_64 -- cgit v1.1 From 13ea20f7a29aec1ed776de05f86bd892dc9ac395 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Aug 2011 17:05:23 -0700 Subject: x86, msi, intr-remap: Use the ioapic set affinity routine IRQ set affinity routine is same for the IO-APIC IRQ's aswell as the MSI IRQ's in the presence of interrupt-remapping. This is because we modify the interrupt-remapping table entry and doesn't touch the IO-APIC RTE or the MSI entry. So remove the ir_msi_set_affinity() and re-use the ir_ioapic_set_affinity() Signed-off-by: Suresh Siddha Cc: yinghai@kernel.org Cc: youquan.song@intel.com Cc: joerg.roedel@amd.com Cc: tony.luck@intel.com Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/20110824001456.452760446@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 54 ++++++++++-------------------------------- 1 file changed, 12 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8eb863e..f88af6b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2267,6 +2267,9 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, * updated vector information), by using a virtual vector (io-apic pin number). * Real vector that is used for interrupting cpu will be coming from * the interrupt-remapping table entry. + * + * As the migration is a simple atomic update of IRTE, the same mechanism + * is used to migrate MSI irq's in the presence of interrupt-remapping. */ static int ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, @@ -2291,10 +2294,16 @@ ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, irte.dest_id = IRTE_DEST(dest); /* - * Modified the IRTE and flushes the Interrupt entry cache. + * Atomically updates the IRTE with the new destination, vector + * and flushes the interrupt entry cache. */ modify_irte(irq, &irte); + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ if (cfg->move_in_progress) send_cleanup_vector(cfg); @@ -3144,45 +3153,6 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) return 0; } -#ifdef CONFIG_INTR_REMAP -/* - * Migrate the MSI irq to another cpumask. This migration is - * done in the process context using interrupt-remapping hardware. - */ -static int -ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int dest, irq = data->irq; - struct irte irte; - - if (get_irte(irq, &irte)) - return -1; - - if (__ioapic_set_affinity(data, mask, &dest)) - return -1; - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * atomically update the IRTE with the new destination and vector. - */ - modify_irte(irq, &irte); - - /* - * After this point, all the interrupts will start arriving - * at the new destination. So, time to cleanup the previous - * vector allocation. - */ - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return 0; -} - -#endif #endif /* CONFIG_SMP */ /* @@ -3207,7 +3177,7 @@ static struct irq_chip msi_ir_chip = { #ifdef CONFIG_INTR_REMAP .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP - .irq_set_affinity = ir_msi_set_affinity, + .irq_set_affinity = ir_ioapic_set_affinity, #endif #endif .irq_retrigger = ioapic_retrigger_irq, @@ -3416,7 +3386,7 @@ static struct irq_chip ir_hpet_msi_type = { #ifdef CONFIG_INTR_REMAP .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP - .irq_set_affinity = ir_msi_set_affinity, + .irq_set_affinity = ir_ioapic_set_affinity, #endif #endif .irq_retrigger = ioapic_retrigger_irq, -- cgit v1.1 From c39d77ffa28c6e72702193df4fa53928c1b6f3e6 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Aug 2011 17:05:24 -0700 Subject: x86, ioapic: Define irq_remap_modify_chip_defaults() Define irq_remap_modify_chip_defaults() and remove the duplicate code, cleanup the unnecessary ifdefs. Signed-off-by: Suresh Siddha Cc: yinghai@kernel.org Cc: youquan.song@intel.com Cc: joerg.roedel@amd.com Cc: tony.luck@intel.com Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/20110824001456.499225692@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_remapping.h | 4 +++ arch/x86/kernel/apic/io_apic.c | 64 +++++++++++------------------------- 2 files changed, 23 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 1c23360..7000f0f 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -4,6 +4,7 @@ #define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) #ifdef CONFIG_INTR_REMAP +static void irq_remap_modify_chip_defaults(struct irq_chip *chip); static inline void prepare_irte(struct irte *irte, int vector, unsigned int dest) { @@ -36,6 +37,9 @@ static inline bool irq_remapped(struct irq_cfg *cfg) { return false; } +static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) +{ +} #endif #endif /* _ASM_X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f88af6b..e75d7e2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1202,7 +1202,6 @@ void __setup_vector_irq(int cpu) } static struct irq_chip ioapic_chip; -static struct irq_chip ir_ioapic_chip; #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) @@ -1246,7 +1245,7 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, if (irq_remapped(cfg)) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - chip = &ir_ioapic_chip; + irq_remap_modify_chip_defaults(chip); fasteoi = trigger != 0; } @@ -2572,6 +2571,22 @@ static void ir_ack_apic_level(struct irq_data *data) ack_APIC_irq(); eoi_ioapic_irq(data->irq, data->chip_data); } + +static void ir_print_prefix(struct irq_data *data, struct seq_file *p) +{ + seq_printf(p, " IR-%s", data->chip->name); +} + +static void irq_remap_modify_chip_defaults(struct irq_chip *chip) +{ + chip->irq_print_chip = ir_print_prefix; + chip->irq_ack = ir_ack_apic_edge; + chip->irq_eoi = ir_ack_apic_level; + +#ifdef CONFIG_SMP + chip->irq_set_affinity = ir_ioapic_set_affinity; +#endif +} #endif /* CONFIG_INTR_REMAP */ static struct irq_chip ioapic_chip __read_mostly = { @@ -2587,21 +2602,6 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_retrigger = ioapic_retrigger_irq, }; -static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .irq_startup = startup_ioapic_irq, - .irq_mask = mask_ioapic_irq, - .irq_unmask = unmask_ioapic_irq, -#ifdef CONFIG_INTR_REMAP - .irq_ack = ir_ack_apic_edge, - .irq_eoi = ir_ack_apic_level, -#ifdef CONFIG_SMP - .irq_set_affinity = ir_ioapic_set_affinity, -#endif -#endif - .irq_retrigger = ioapic_retrigger_irq, -}; - static inline void init_IO_APIC_traps(void) { struct irq_cfg *cfg; @@ -3170,19 +3170,6 @@ static struct irq_chip msi_chip = { .irq_retrigger = ioapic_retrigger_irq, }; -static struct irq_chip msi_ir_chip = { - .name = "IR-PCI-MSI", - .irq_unmask = unmask_msi_irq, - .irq_mask = mask_msi_irq, -#ifdef CONFIG_INTR_REMAP - .irq_ack = ir_ack_apic_edge, -#ifdef CONFIG_SMP - .irq_set_affinity = ir_ioapic_set_affinity, -#endif -#endif - .irq_retrigger = ioapic_retrigger_irq, -}; - /* * Map the PCI dev to the corresponding remapping hardware unit * and allocate 'nvec' consecutive interrupt-remapping table entries @@ -3225,7 +3212,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) if (irq_remapped(irq_get_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - chip = &msi_ir_chip; + irq_remap_modify_chip_defaults(chip); } irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); @@ -3379,19 +3366,6 @@ static int hpet_msi_set_affinity(struct irq_data *data, #endif /* CONFIG_SMP */ -static struct irq_chip ir_hpet_msi_type = { - .name = "IR-HPET_MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, -#ifdef CONFIG_INTR_REMAP - .irq_ack = ir_ack_apic_edge, -#ifdef CONFIG_SMP - .irq_set_affinity = ir_ioapic_set_affinity, -#endif -#endif - .irq_retrigger = ioapic_retrigger_irq, -}; - static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .irq_unmask = hpet_msi_unmask, @@ -3428,7 +3402,7 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) hpet_msi_write(irq_get_handler_data(irq), &msg); irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); if (irq_remapped(irq_get_chip_data(irq))) - chip = &ir_hpet_msi_type; + irq_remap_modify_chip_defaults(chip); irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; -- cgit v1.1 From d3f138106b4b40640dc667f0222fd9f137387b32 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Aug 2011 17:05:25 -0700 Subject: iommu: Rename the DMAR and INTR_REMAP config options Change the CONFIG_DMAR to CONFIG_INTEL_IOMMU to be consistent with the other IOMMU options. Rename the CONFIG_INTR_REMAP to CONFIG_IRQ_REMAP to match the irq subsystem name. And define the CONFIG_DMAR_TABLE for the common ACPI DMAR routines shared by both CONFIG_INTEL_IOMMU and CONFIG_IRQ_REMAP. Signed-off-by: Suresh Siddha Cc: yinghai@kernel.org Cc: youquan.song@intel.com Cc: joerg.roedel@amd.com Cc: tony.luck@intel.com Cc: dwmw2@infradead.org Link: http://lkml.kernel.org/r/20110824001456.558630224@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 6 +++--- arch/x86/configs/x86_64_defconfig | 4 ++-- arch/x86/include/asm/device.h | 2 +- arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/include/asm/irq_remapping.h | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/io_apic.c | 8 ++++---- 7 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a47bb2..b8cd544 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -130,7 +130,7 @@ config SBUS bool config NEED_DMA_MAP_STATE - def_bool (X86_64 || DMAR || DMA_API_DEBUG) + def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG) config NEED_SG_DMA_LENGTH def_bool y @@ -220,7 +220,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC config HAVE_INTEL_TXT def_bool y - depends on EXPERIMENTAL && DMAR && ACPI + depends on EXPERIMENTAL && INTEL_IOMMU && ACPI config X86_32_SMP def_bool y @@ -287,7 +287,7 @@ config SMP config X86_X2APIC bool "Support x2apic" - depends on X86_LOCAL_APIC && X86_64 && INTR_REMAP + depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP ---help--- This enables x2apic support on CPUs that have this feature. diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 22a0dc8..058a35b 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -67,8 +67,8 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_X86_ACPI_CPUFREQ=y CONFIG_PCI_MMCONFIG=y -CONFIG_DMAR=y -# CONFIG_DMAR_DEFAULT_ON is not set +CONFIG_INTEL_IOMMU=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set CONFIG_PCIEPORTBUS=y CONFIG_PCCARD=y CONFIG_YENTA=y diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 029f230..63a2a03 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -8,7 +8,7 @@ struct dev_archdata { #ifdef CONFIG_X86_64 struct dma_map_ops *dma_ops; #endif -#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU) +#if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU) void *iommu; /* hook for IOMMU specific extension */ #endif }; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 0919905..eb92a6e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -119,7 +119,7 @@ struct irq_cfg { cpumask_var_t old_domain; u8 vector; u8 move_in_progress : 1; -#ifdef CONFIG_INTR_REMAP +#ifdef CONFIG_IRQ_REMAP struct irq_2_iommu irq_2_iommu; #endif }; diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 7000f0f..47d9993 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -3,7 +3,7 @@ #define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) -#ifdef CONFIG_INTR_REMAP +#ifdef CONFIG_IRQ_REMAP static void irq_remap_modify_chip_defaults(struct irq_chip *chip); static inline void prepare_irte(struct irte *irte, int vector, unsigned int dest) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6b9874a..a2fd72e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1437,7 +1437,7 @@ void enable_x2apic(void) int __init enable_IR(void) { -#ifdef CONFIG_INTR_REMAP +#ifdef CONFIG_IRQ_REMAP if (!intr_remapping_supported()) { pr_debug("intr-remapping not supported\n"); return -1; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e75d7e2..620da6f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2254,7 +2254,7 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } -#ifdef CONFIG_INTR_REMAP +#ifdef CONFIG_IRQ_REMAP /* * Migrate the IO-APIC irq in the presence of intr-remapping. @@ -2560,7 +2560,7 @@ static void ack_apic_level(struct irq_data *data) } } -#ifdef CONFIG_INTR_REMAP +#ifdef CONFIG_IRQ_REMAP static void ir_ack_apic_edge(struct irq_data *data) { ack_APIC_irq(); @@ -2587,7 +2587,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip) chip->irq_set_affinity = ir_ioapic_set_affinity; #endif } -#endif /* CONFIG_INTR_REMAP */ +#endif /* CONFIG_IRQ_REMAP */ static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", @@ -3285,7 +3285,7 @@ void native_teardown_msi_irq(unsigned int irq) destroy_irq(irq); } -#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) +#ifdef CONFIG_DMAR_TABLE #ifdef CONFIG_SMP static int dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, -- cgit v1.1 From 1e75b31d638d5242ca8e9771dfdcbd28a5f041df Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 25 Aug 2011 12:01:11 -0700 Subject: x86, kdump, ioapic: Reset remote-IRR in clear_IO_APIC In the kdump scenario mentioned below, we can have a case where the device using level triggered interrupt will not generate any interrupts in the kdump kernel. 1. IO-APIC sends a level triggered interrupt to the CPU's local APIC. 2. Kernel crashed before the CPU services this interrupt, leaving the remote-IRR in the IO-APIC set. 3. kdump kernel boot sequence does clear_IO_APIC() as part of IO-APIC initialization. But this fails to reset remote-IRR bit of the IO-APIC RTE as the remote-IRR bit is read-only. 4. Device using that level triggered entry can't generate any more interrupts because of the remote-IRR bit. In clear_IO_APIC_pin(), check if the remote-IRR bit is set and if so do an explicit attempt to clear it (by doing EOI write on modern io-apic's and changing trigger mode to edge/level on older io-apic's). Also before doing the explicit EOI to the io-apic, ensure that the trigger mode is indeed set to level. This will enable the explicit EOI to the io-apic to reset the remote-IRR bit. Tested-by: Leonardo Chiquitto Signed-off-by: Suresh Siddha Fixes: https://bugzilla.novell.com/show_bug.cgi?id=701686 Cc: Rafael Wysocki Cc: Maciej W. Rozycki Cc: Thomas Renninger Cc: jbeulich@novell.com Cc: yinghai@kernel.org Link: http://lkml.kernel.org/r/20110825190657.157502602@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 48 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 620da6f..913d4bd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -593,10 +593,56 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; + + /* + * Make sure the entry is masked and re-read the contents to check + * if it is a level triggered pin and if the remote-IRR is set. + */ + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + entry = ioapic_read_entry(apic, pin); + } + + if (entry.irr) { + /* + * Make sure the trigger mode is set to level. Explicit EOI + * doesn't clear the remote-IRR if the trigger mode is not + * set to level. + */ + if (!entry.trigger) { + entry.trigger = IOAPIC_LEVEL; + ioapic_write_entry(apic, pin, entry); + } + + if (mpc_ioapic_ver(apic) >= 0x20) { + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_eoi(apic, entry.vector); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + } else { + /* + * Mechanism by which we clear remote-IRR in this + * case is by changing the trigger mode to edge and + * back to level. + */ + entry.trigger = IOAPIC_EDGE; + ioapic_write_entry(apic, pin, entry); + entry.trigger = IOAPIC_LEVEL; + ioapic_write_entry(apic, pin, entry); + } + } + /* - * Disable it in the IO-APIC irq-routing table: + * Clear the rest of the bits in the IO-APIC RTE except for the mask + * bit. */ ioapic_mask_entry(apic, pin); + entry = ioapic_read_entry(apic, pin); + if (entry.irr) + printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", + mpc_ioapic_id(apic), pin); } static void clear_IO_APIC (void) -- cgit v1.1 From e57253a81d9cc7049e9e43bd806ce6cdd297ec1c Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 25 Aug 2011 12:01:12 -0700 Subject: x86, ioapic: Restore the mask bit correctly in eoi_ioapic_irq() For older IO-APIC's, we were clearing the remote-IRR by changing the RTE trigger mode to edge and then back to level. We wanted to mask the RTE during this process, so we were essentially doing mask+edge and then to unmask+level. As part of the commit ca64c47cecd0321b2e0dcbd7aaff44b68ce20654, we moved this EOI process earlier where the IO-APIC RTE is masked. So we were wrongly unmasking it in the eoi_ioapic_irq(). So change the remote-IRR clear sequence in eoi_ioapic_irq() to mask + edge and then restore the previous RTE entry which will restore the mask status as well as the level trigger. Signed-off-by: Suresh Siddha Cc: Maciej W. Rozycki Cc: Thomas Renninger Cc: Rafael Wysocki Cc: lchiquitto@novell.com Cc: jbeulich@novell.com Cc: yinghai@kernel.org Link: http://lkml.kernel.org/r/20110825190657.210286410@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 43 ++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 913d4bd..85050c9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -394,13 +394,21 @@ union entry_union { struct IO_APIC_route_entry entry; }; +static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + return eu.entry; +} + static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { union entry_union eu; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + eu.entry = __ioapic_read_entry(apic, pin); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return eu.entry; } @@ -529,18 +537,6 @@ static void io_apic_modify_irq(struct irq_cfg *cfg, __io_apic_modify_irq(entry, mask_and, mask_or, final); } -static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry) -{ - __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED, NULL); -} - -static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry) -{ - __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER, NULL); -} - static void io_apic_sync(struct irq_pin_list *entry) { /* @@ -2496,8 +2492,23 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) else io_apic_eoi(entry->apic, cfg->vector); } else { - __mask_and_edge_IO_APIC_irq(entry); - __unmask_and_level_IO_APIC_irq(entry); + struct IO_APIC_route_entry rte, rte1; + + rte = rte1 = + __ioapic_read_entry(entry->apic, entry->pin); + + /* + * Mask the entry and change the trigger mode to edge. + */ + rte1.mask = 1; + rte1.trigger = IOAPIC_EDGE; + + __ioapic_write_entry(apic, pin, rte1); + + /* + * Restore the previous level triggered entry. + */ + __ioapic_write_entry(apic, pin, rte); } } raw_spin_unlock_irqrestore(&ioapic_lock, flags); -- cgit v1.1 From c020570138f5d9cb1fc0a853f9cf9e641178b5c5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 25 Aug 2011 12:01:13 -0700 Subject: x86, ioapic: Consolidate the explicit EOI code Consolidate the io-apic EOI code in clear_IO_APIC_pin() and eoi_ioapic_irq(). Signed-off-by: Suresh Siddha Cc: Thomas Renninger Cc: Rafael Wysocki Cc: Maciej W. Rozycki Cc: lchiquitto@novell.com Cc: jbeulich@novell.com Cc: yinghai@kernel.org Link: http://lkml.kernel.org/r/20110825190657.259696697@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 139 +++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 85050c9..229e19f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -581,6 +581,66 @@ static void unmask_ioapic_irq(struct irq_data *data) unmask_ioapic(data->chip_data); } +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. + * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. + */ +static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg) +{ + if (mpc_ioapic_ver(apic) >= 0x20) { + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + if (cfg && irq_remapped(cfg)) + io_apic_eoi(apic, pin); + else + io_apic_eoi(apic, vector); + } else { + struct IO_APIC_route_entry entry, entry1; + + entry = entry1 = __ioapic_read_entry(apic, pin); + + /* + * Mask the entry and change the trigger mode to edge. + */ + entry1.mask = 1; + entry1.trigger = IOAPIC_EDGE; + + __ioapic_write_entry(apic, pin, entry1); + + /* + * Restore the previous level triggered entry. + */ + __ioapic_write_entry(apic, pin, entry); + } +} + +static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) + __eoi_ioapic_pin(entry->apic, entry->pin, cfg->vector, cfg); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -601,6 +661,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) } if (entry.irr) { + unsigned long flags; + /* * Make sure the trigger mode is set to level. Explicit EOI * doesn't clear the remote-IRR if the trigger mode is not @@ -611,23 +673,9 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ioapic_write_entry(apic, pin, entry); } - if (mpc_ioapic_ver(apic) >= 0x20) { - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_eoi(apic, entry.vector); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - } else { - /* - * Mechanism by which we clear remote-IRR in this - * case is by changing the trigger mode to edge and - * back to level. - */ - entry.trigger = IOAPIC_EDGE; - ioapic_write_entry(apic, pin, entry); - entry.trigger = IOAPIC_LEVEL; - ioapic_write_entry(apic, pin, entry); - } + raw_spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_pin(apic, pin, entry.vector, NULL); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -2457,63 +2505,6 @@ static void ack_apic_edge(struct irq_data *data) atomic_t irq_mis_count; -/* - * IO-APIC versions below 0x20 don't support EOI register. - * For the record, here is the information about various versions: - * 0Xh 82489DX - * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant - * 2Xh I/O(x)APIC which is PCI 2.2 Compliant - * 30h-FFh Reserved - * - * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic - * version as 0x2. This is an error with documentation and these ICH chips - * use io-apic's of version 0x20. - * - * For IO-APIC's with EOI register, we use that to do an explicit EOI. - * Otherwise, we simulate the EOI message manually by changing the trigger - * mode to edge and then back to level, with RTE being masked during this. -*/ -static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) { - if (mpc_ioapic_ver(entry->apic) >= 0x20) { - /* - * Intr-remapping uses pin number as the virtual vector - * in the RTE. Actual vector is programmed in - * intr-remapping table entry. Hence for the io-apic - * EOI we use the pin number. - */ - if (irq_remapped(cfg)) - io_apic_eoi(entry->apic, entry->pin); - else - io_apic_eoi(entry->apic, cfg->vector); - } else { - struct IO_APIC_route_entry rte, rte1; - - rte = rte1 = - __ioapic_read_entry(entry->apic, entry->pin); - - /* - * Mask the entry and change the trigger mode to edge. - */ - rte1.mask = 1; - rte1.trigger = IOAPIC_EDGE; - - __ioapic_write_entry(apic, pin, rte1); - - /* - * Restore the previous level triggered entry. - */ - __ioapic_write_entry(apic, pin, rte); - } - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); -} - static void ack_apic_level(struct irq_data *data) { struct irq_cfg *cfg = data->chip_data; -- cgit v1.1 From d4f3e350172a1dc769ed5e7f5bd540feb0c475d8 Mon Sep 17 00:00:00 2001 From: Ed Wildgoose Date: Tue, 20 Sep 2011 14:00:12 -0700 Subject: x86: geode: New PCEngines Alix system driver This new driver replaces the old PCEngines Alix 2/3 LED driver with a new driver that controls the LEDs through the leds-gpio driver. The old driver accessed GPIOs directly, which created a conflict and prevented also loading the cs5535-gpio driver to read other GPIOs on the Alix board. With this new driver, we hook into leds-gpio which in turn uses GPIO to control the LEDs and therefore it's possible to control both the LEDs and access onboard GPIOs Driver is moved to platform/geode as requested by Grant and any other geode initialisation modules should move here also This driver is inspired by leds-net5501.c by Alessandro Zummo. Ideally, leds-net5501.c should also be moved to platform/geode. Additionally the driver relies on parts of the patch: 7f131cf3ed ("leds: leds-alix2c - take port address from MSR) by Daniel Mack to perform detection of the Alix board. [akpm@linux-foundation.org: include module.h] Signed-off-by: Ed Wildgoose Cc: git@wildgooses.com Cc: Alessandro Zummo Cc: Daniel Mack Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Richard Purdie Reviewed-by: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 14 ++++ arch/x86/platform/Makefile | 1 + arch/x86/platform/geode/Makefile | 1 + arch/x86/platform/geode/alix.c | 142 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 158 insertions(+) create mode 100644 arch/x86/platform/geode/Makefile create mode 100644 arch/x86/platform/geode/alix.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a47bb2..8c4e5bc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2064,6 +2064,20 @@ config OLPC_XO15_SCI - AC adapter status updates - Battery status updates +config ALIX + bool "PCEngines ALIX System Support (LED setup)" + select GPIOLIB + ---help--- + This option enables system support for the PCEngines ALIX. + At present this just sets up LEDs for GPIO control on + ALIX2/3/6 boards. However, other system specific setup should + get added here. + + Note: You must still enable the drivers for GPIO and LED support + (GPIO_CS5535 & LEDS_GPIO) to actually use the LEDs + + Note: You have to set alix.force=1 for boards with Award BIOS. + endif # X86_32 config AMD_NB diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 021eee9..8d87439 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,6 +1,7 @@ # Platform specific code goes here obj-y += ce4100/ obj-y += efi/ +obj-y += geode/ obj-y += iris/ obj-y += mrst/ obj-y += olpc/ diff --git a/arch/x86/platform/geode/Makefile b/arch/x86/platform/geode/Makefile new file mode 100644 index 0000000..07c9cd0 --- /dev/null +++ b/arch/x86/platform/geode/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_ALIX) += alix.o diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c new file mode 100644 index 0000000..ca19736 --- /dev/null +++ b/arch/x86/platform/geode/alix.c @@ -0,0 +1,142 @@ +/* + * System Specific setup for PCEngines ALIX. + * At the moment this means setup of GPIO control of LEDs + * on Alix.2/3/6 boards. + * + * + * Copyright (C) 2008 Constantin Baranov + * Copyright (C) 2011 Ed Wildgoose + * + * TODO: There are large similarities with leds-net5501.c + * by Alessandro Zummo + * In the future leds-net5501.c should be migrated over to platform + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int force = 0; +module_param(force, bool, 0444); +/* FIXME: Award bios is not automatically detected as Alix platform */ +MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform"); + +static struct gpio_led alix_leds[] = { + { + .name = "alix:1", + .gpio = 6, + .default_trigger = "default-on", + .active_low = 1, + }, + { + .name = "alix:2", + .gpio = 25, + .default_trigger = "default-off", + .active_low = 1, + }, + { + .name = "alix:3", + .gpio = 27, + .default_trigger = "default-off", + .active_low = 1, + }, +}; + +static struct gpio_led_platform_data alix_leds_data = { + .num_leds = ARRAY_SIZE(alix_leds), + .leds = alix_leds, +}; + +static struct platform_device alix_leds_dev = { + .name = "leds-gpio", + .id = -1, + .dev.platform_data = &alix_leds_data, +}; + +static void __init register_alix(void) +{ + /* Setup LED control through leds-gpio driver */ + platform_device_register(&alix_leds_dev); +} + +static int __init alix_present(unsigned long bios_phys, + const char *alix_sig, + size_t alix_sig_len) +{ + const size_t bios_len = 0x00010000; + const char *bios_virt; + const char *scan_end; + const char *p; + char name[64]; + + if (force) { + printk(KERN_NOTICE "%s: forced to skip BIOS test, " + "assume system is ALIX.2/ALIX.3\n", + KBUILD_MODNAME); + return 1; + } + + bios_virt = phys_to_virt(bios_phys); + scan_end = bios_virt + bios_len - (alix_sig_len + 2); + for (p = bios_virt; p < scan_end; p++) { + const char *tail; + char *a; + + if (memcmp(p, alix_sig, alix_sig_len) != 0) + continue; + + memcpy(name, p, sizeof(name)); + + /* remove the first \0 character from string */ + a = strchr(name, '\0'); + if (a) + *a = ' '; + + /* cut the string at a newline */ + a = strchr(name, '\r'); + if (a) + *a = '\0'; + + tail = p + alix_sig_len; + if ((tail[0] == '2' || tail[0] == '3')) { + printk(KERN_INFO + "%s: system is recognized as \"%s\"\n", + KBUILD_MODNAME, name); + return 1; + } + } + + return 0; +} + +static int __init alix_init(void) +{ + const char tinybios_sig[] = "PC Engines ALIX."; + const char coreboot_sig[] = "PC Engines\0ALIX."; + + if (!is_geode()) + return 0; + + if (alix_present(0xf0000, tinybios_sig, sizeof(tinybios_sig) - 1) || + alix_present(0x500, coreboot_sig, sizeof(coreboot_sig) - 1)) + register_alix(); + + return 0; +} + +module_init(alix_init); + +MODULE_AUTHOR("Ed Wildgoose "); +MODULE_DESCRIPTION("PCEngines ALIX System Setup"); +MODULE_LICENSE("GPL"); -- cgit v1.1 From 6a469e4665bc158599de55d64388861d0a9f10f4 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 20 Sep 2011 13:55:04 -0700 Subject: x86: uv2: Workaround for UV2 Hub bug (system global address format) This is a workaround for a UV2 hub bug that affects the format of system global addresses. The GRU API for UV2 was inadvertently broken by a hardware change. The format of the physical address used for TLB dropins and for addresses used with instructions running in unmapped mode has changed. This change was not documented and became apparent only when diags failed running on system simulators. For UV1, TLB and GRU instruction physical addresses are identical to socket physical addresses (although high NASID bits must be OR'ed into the address). For UV2, socket physical addresses need to be converted. The NODE portion of the physical address needs to be shifted so that the low bit is in bit 39 or bit 40, depending on an MMR value. It is not yet clear if this bug will be fixed in a silicon respin. If it is fixed, the hub revision will be incremented & the workaround disabled. Signed-off-by: Jack Steiner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/uv/uv_bau.h | 1 + arch/x86/include/asm/uv/uv_hub.h | 37 ++++++++++++++++++++++++++++++++++--- arch/x86/kernel/apic/x2apic_uv_x.c | 7 +++++-- arch/x86/platform/uv/tlb_uv.c | 17 ++++++----------- 4 files changed, 46 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 37d3698..0c767a8 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -55,6 +55,7 @@ #define UV_BAU_TUNABLES_DIR "sgi_uv" #define UV_BAU_TUNABLES_FILE "bau_tunables" #define WHITESPACE " \t\n" +#define uv_mmask ((1UL << uv_hub_info->m_val) - 1) #define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) #define cpubit_isset(cpu, bau_local_cpumask) \ test_bit((cpu), (bau_local_cpumask).bits) diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index f26544a..54a13aae 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -46,6 +46,13 @@ * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant * of the nasid for socket usage. * + * GPA - (global physical address) a socket physical address converted + * so that it can be used by the GRU as a global address. Socket + * physical addresses 1) need additional NASID (node) bits added + * to the high end of the address, and 2) unaliased if the + * partition does not have a physical address 0. In addition, on + * UV2 rev 1, GPAs need the gnode left shifted to bits 39 or 40. + * * * NumaLink Global Physical Address Format: * +--------------------------------+---------------------+ @@ -141,6 +148,8 @@ struct uv_hub_info_s { unsigned int gnode_extra; unsigned char hub_revision; unsigned char apic_pnode_shift; + unsigned char m_shift; + unsigned char n_lshift; unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; @@ -177,6 +186,16 @@ static inline int is_uv2_hub(void) return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE; } +static inline int is_uv2_1_hub(void) +{ + return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE; +} + +static inline int is_uv2_2_hub(void) +{ + return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE + 1; +} + union uvh_apicid { unsigned long v; struct uvh_apicid_s { @@ -276,7 +295,10 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) { if (paddr < uv_hub_info->lowmem_remap_top) paddr |= uv_hub_info->lowmem_remap_base; - return paddr | uv_hub_info->gnode_upper; + paddr |= uv_hub_info->gnode_upper; + paddr = ((paddr << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | + ((paddr >> uv_hub_info->m_val) << uv_hub_info->n_lshift); + return paddr; } @@ -300,16 +322,19 @@ static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) unsigned long remap_base = uv_hub_info->lowmem_remap_base; unsigned long remap_top = uv_hub_info->lowmem_remap_top; + gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | + ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); + gpa = gpa & uv_hub_info->gpa_mask; if (paddr >= remap_base && paddr < remap_base + remap_top) paddr -= remap_base; return paddr; } -/* gnode -> pnode */ +/* gpa -> pnode */ static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) { - return gpa >> uv_hub_info->m_val; + return gpa >> uv_hub_info->n_lshift; } /* gpa -> pnode */ @@ -320,6 +345,12 @@ static inline int uv_gpa_to_pnode(unsigned long gpa) return uv_gpa_to_gnode(gpa) & n_mask; } +/* gpa -> node offset*/ +static inline unsigned long uv_gpa_to_offset(unsigned long gpa) +{ + return (gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift; +} + /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 34b1859..cfeb978 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -832,6 +832,10 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; + uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; + uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ? + (m_val == 40 ? 40 : 39) : m_val; + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; @@ -862,8 +866,7 @@ void __init uv_system_init(void) if (uv_node_to_blade[nid] >= 0) continue; paddr = node_start_pfn(nid) << PAGE_SHIFT; - paddr = uv_soc_phys_ram_to_gpa(paddr); - pnode = (paddr >> m_val) & pnode_mask; + pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index db8b915..5b55219 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -115,9 +115,6 @@ early_param("nobau", setup_nobau); /* base pnode in this partition */ static int uv_base_pnode __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; -static unsigned long uv_mmask __read_mostly; static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); @@ -1435,7 +1432,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) { int i; int cpu; - unsigned long pa; + unsigned long gpa; unsigned long m; unsigned long n; size_t dsize; @@ -1451,9 +1448,9 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) bau_desc = kmalloc_node(dsize, GFP_KERNEL, node); BUG_ON(!bau_desc); - pa = uv_gpa(bau_desc); /* need the real nasid*/ - n = pa >> uv_nshift; - m = pa & uv_mmask; + gpa = uv_gpa(bau_desc); + n = uv_gpa_to_gnode(gpa); + m = uv_gpa_to_offset(gpa); /* the 14-bit pnode */ write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); @@ -1525,9 +1522,9 @@ static void pq_init(int node, int pnode) bcp->queue_last = pqp + (DEST_Q_SIZE - 1); } /* - * need the pnode of where the memory was really allocated + * need the gnode of where the memory was really allocated */ - pn = uv_gpa(pqp) >> uv_nshift; + pn = uv_gpa_to_gnode(uv_gpa(pqp)); first = uv_physnodeaddr(pqp); pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first; last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)); @@ -1837,8 +1834,6 @@ static int __init uv_bau_init(void) zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); } - uv_nshift = uv_hub_info->m_val; - uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); spin_lock_init(&disable_lock); congested_cycles = usec_2_cycles(congested_respns_us); -- cgit v1.1 From 47997d756aa2a84ab577e1b0383cc12d582fc69c Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 21 Sep 2011 16:08:03 +0200 Subject: x86/rtc: Don't recursively acquire rtc_lock A deadlock was introduced on x86 in commit ef68c8f87ed1 ("x86: Serialize EFI time accesses on rtc_lock") because efi_get_time() and friends can be called with rtc_lock already held by read_persistent_time(), e.g.: timekeeping_init() read_persistent_clock() <-- acquire rtc_lock efi_get_time() phys_efi_get_time() <-- acquire rtc_lock To fix this let's push the locking down into the get_wallclock() and set_wallclock() implementations. Only the clock implementations that access the x86 RTC directly need to acquire rtc_lock, so it makes sense to push the locking down into the rtc, vrtc and efi code. The virtualization implementations don't require rtc_lock to be held because they provide their own serialization. Signed-off-by: Matt Fleming Acked-by: Jan Beulich Acked-by: Avi Kivity [for the virtualization aspect] Cc: "H. Peter Anvin" Cc: Zhang Rui Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/rtc.c | 23 ++++++++++++----------- arch/x86/platform/mrst/vrtc.c | 9 +++++++++ 2 files changed, 21 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 3f2ad26..ccdbc16 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -42,8 +42,11 @@ int mach_set_rtc_mmss(unsigned long nowtime) { int real_seconds, real_minutes, cmos_minutes; unsigned char save_control, save_freq_select; + unsigned long flags; int retval = 0; + spin_lock_irqsave(&rtc_lock, flags); + /* tell the clock it's being set */ save_control = CMOS_READ(RTC_CONTROL); CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); @@ -93,12 +96,17 @@ int mach_set_rtc_mmss(unsigned long nowtime) CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + spin_unlock_irqrestore(&rtc_lock, flags); + return retval; } unsigned long mach_get_cmos_time(void) { unsigned int status, year, mon, day, hour, min, sec, century = 0; + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); /* * If UIP is clear, then we have >= 244 microseconds before @@ -125,6 +133,8 @@ unsigned long mach_get_cmos_time(void) status = CMOS_READ(RTC_CONTROL); WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); + spin_unlock_irqrestore(&rtc_lock, flags); + if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { sec = bcd2bin(sec); min = bcd2bin(min); @@ -169,24 +179,15 @@ EXPORT_SYMBOL(rtc_cmos_write); int update_persistent_clock(struct timespec now) { - unsigned long flags; - int retval; - - spin_lock_irqsave(&rtc_lock, flags); - retval = x86_platform.set_wallclock(now.tv_sec); - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; + return x86_platform.set_wallclock(now.tv_sec); } /* not static: needed by APM */ void read_persistent_clock(struct timespec *ts) { - unsigned long retval, flags; + unsigned long retval; - spin_lock_irqsave(&rtc_lock, flags); retval = x86_platform.get_wallclock(); - spin_unlock_irqrestore(&rtc_lock, flags); ts->tv_sec = retval; ts->tv_nsec = 0; diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index 73d70d6..6d5dbcd 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -58,8 +58,11 @@ EXPORT_SYMBOL_GPL(vrtc_cmos_write); unsigned long vrtc_get_time(void) { u8 sec, min, hour, mday, mon; + unsigned long flags; u32 year; + spin_lock_irqsave(&rtc_lock, flags); + while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP)) cpu_relax(); @@ -70,6 +73,8 @@ unsigned long vrtc_get_time(void) mon = vrtc_cmos_read(RTC_MONTH); year = vrtc_cmos_read(RTC_YEAR); + spin_unlock_irqrestore(&rtc_lock, flags); + /* vRTC YEAR reg contains the offset to 1960 */ year += 1960; @@ -83,8 +88,10 @@ unsigned long vrtc_get_time(void) int vrtc_set_mmss(unsigned long nowtime) { int real_sec, real_min; + unsigned long flags; int vrtc_min; + spin_lock_irqsave(&rtc_lock, flags); vrtc_min = vrtc_cmos_read(RTC_MINUTES); real_sec = nowtime % 60; @@ -95,6 +102,8 @@ int vrtc_set_mmss(unsigned long nowtime) vrtc_cmos_write(real_sec, RTC_SECONDS); vrtc_cmos_write(real_min, RTC_MINUTES); + spin_unlock_irqrestore(&rtc_lock, flags); + return 0; } -- cgit v1.1 From 64b94ceae8c16cd1b2800cac83112d3815be5250 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Fri, 2 Sep 2011 01:45:22 +0300 Subject: crypto: blowfish - add x86_64 assembly implementation Patch adds x86_64 assembly implementation of blowfish. Two set of assembler functions are provided. First set is regular 'one-block at time' encrypt/decrypt functions. Second is 'four-block at time' functions that gain performance increase on out-of-order CPUs. Performance of 4-way functions should be equal to 1-way functions with in-order CPUs. Summary of the tcrypt benchmarks: Blowfish assembler vs blowfish C (256bit 8kb block ECB) encrypt: 2.2x speed decrypt: 2.3x speed Blowfish assembler vs blowfish C (256bit 8kb block CBC) encrypt: 1.12x speed decrypt: 2.5x speed Blowfish assembler vs blowfish C (256bit 8kb block CTR) encrypt: 2.5x speed Full output: http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-blowfish-asm-x86_64.txt http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-blowfish-c-x86_64.txt Tests were run on: vendor_id : AuthenticAMD cpu family : 16 model : 10 model name : AMD Phenom(tm) II X6 1055T Processor stepping : 0 Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/blowfish-x86_64-asm_64.S | 392 +++++++++++++++++++++++++ arch/x86/crypto/blowfish_glue.c | 487 +++++++++++++++++++++++++++++++ 3 files changed, 881 insertions(+) create mode 100644 arch/x86/crypto/blowfish-x86_64-asm_64.S create mode 100644 arch/x86/crypto/blowfish_glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 57c7f7b..725addf 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o +obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o @@ -20,6 +21,7 @@ twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o +blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S new file mode 100644 index 0000000..44eb23a --- /dev/null +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -0,0 +1,392 @@ +/* + * Blowfish Cipher Algorithm (x86_64) + * + * Copyright (C) 2011 Jussi Kivilinna + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "blowfish-x86_64-asm.S" +.text + +/* structure of crypto context */ +#define p 0 +#define s0 ((16 + 2) * 4) +#define s1 ((16 + 2 + (1 * 256)) * 4) +#define s2 ((16 + 2 + (2 * 256)) * 4) +#define s3 ((16 + 2 + (3 * 256)) * 4) + +/* register macros */ +#define CTX %rdi +#define RIO %rsi + +#define RX0 %rax +#define RX1 %rbx +#define RX2 %rcx +#define RX3 %rdx + +#define RX0d %eax +#define RX1d %ebx +#define RX2d %ecx +#define RX3d %edx + +#define RX0bl %al +#define RX1bl %bl +#define RX2bl %cl +#define RX3bl %dl + +#define RX0bh %ah +#define RX1bh %bh +#define RX2bh %ch +#define RX3bh %dh + +#define RT0 %rbp +#define RT1 %rsi + +#define RT0d %ebp +#define RT1d %esi + +#define RK0 %r8 +#define RK1 %r9 +#define RK2 %r10 +#define RK3 %r11 + +#define RK0d %r8d +#define RK1d %r9d +#define RK2d %r10d +#define RK3d %r11d + +#define RKEY %r12 + +/*********************************************************************** + * 1-way blowfish + ***********************************************************************/ +#define F(x, k) \ + rorq $16, x; \ + movzbl x ## bh, RT0d; \ + movzbl x ## bl, RT1d; \ + rolq $16, x; \ + movl s0(CTX,RT0,4), k ## d; \ + addl s1(CTX,RT1,4), k ## d; \ + movzbl x ## bh, RT0d; \ + movzbl x ## bl, RT1d; \ + rolq $32, x; \ + xorl s2(CTX,RT0,4), k ## d; \ + addl s3(CTX,RT1,4), k ## d; \ + xorq k, x; + +#define add_roundkey_enc(n) \ + xorq p+4*(n)(CTX), RX0; + +#define round_enc(n) \ + add_roundkey_enc(n); \ + \ + F(RX0, RK0); \ + F(RX0, RK0); + +#define round_final_enc(n) \ + xorq p+4*(n)(CTX), RX0; + +#define add_roundkey_dec(n) \ + movq p+4*(n-1)(CTX), RT0; \ + rorq $32, RT0; \ + xorq RT0, RX0; + +#define round_dec(n) \ + add_roundkey_dec(n); \ + \ + F(RX0, RK0); \ + F(RX0, RK0); \ + +#define read_block() \ + movq (RIO), RX0; \ + rorq $32, RX0; \ + bswapq RX0; + +#define write_block() \ + bswapq RX0; \ + movq RX0, (RIO); + +#define xor_block() \ + bswapq RX0; \ + xorq RX0, (RIO); + +.align 8 +.global __blowfish_enc_blk +.type __blowfish_enc_blk,@function; + +__blowfish_enc_blk: + // input: + // %rdi: ctx, CTX + // %rsi: dst + // %rdx: src + // %rcx: bool xor + pushq %rbp; + pushq %rbx; + + pushq %rsi; + pushq %rcx; + movq %rdx, RIO; + + read_block(); + + round_enc(0); + round_enc(2); + round_enc(4); + round_enc(6); + round_enc(8); + round_enc(10); + round_enc(12); + round_enc(14); + add_roundkey_enc(16); + + popq %rbp; + popq RIO; + + test %bpl, %bpl; + jnz __enc_xor; + + write_block(); + +__enc_ret: + popq %rbx; + popq %rbp; + + ret; + +__enc_xor: + xor_block(); + + jmp __enc_ret; + +.align 8 +.global blowfish_dec_blk +.type blowfish_dec_blk,@function; + +blowfish_dec_blk: + // input: + // %rdi: ctx, CTX + // %rsi: dst + // %rdx: src + pushq %rbp; + pushq %rbx; + + pushq %rsi; + movq %rdx, RIO; + + read_block(); + + round_dec(17); + round_dec(15); + round_dec(13); + round_dec(11); + round_dec(9); + round_dec(7); + round_dec(5); + round_dec(3); + add_roundkey_dec(1); + + popq RIO; + write_block(); + + popq %rbx; + popq %rbp; + + ret; + +/********************************************************************** + 4-way blowfish, four blocks parallel + **********************************************************************/ +#define add_preloaded_roundkey4() \ + xorq RKEY, RX0; \ + xorq RKEY, RX1; \ + xorq RKEY, RX2; \ + xorq RKEY, RX3; + +#define preload_roundkey_enc(n) \ + movq p+4*(n)(CTX), RKEY; + +#define add_roundkey_enc4(n) \ + add_preloaded_roundkey4(); \ + preload_roundkey_enc(n + 2); + +#define round_enc4(n) \ + add_roundkey_enc4(n); \ + \ + F(RX0, RK0); \ + F(RX1, RK1); \ + F(RX2, RK2); \ + F(RX3, RK3); \ + \ + F(RX0, RK0); \ + F(RX1, RK1); \ + F(RX2, RK2); \ + F(RX3, RK3); + +#define preload_roundkey_dec(n) \ + movq p+4*((n)-1)(CTX), RKEY; \ + rorq $32, RKEY; + +#define add_roundkey_dec4(n) \ + add_preloaded_roundkey4(); \ + preload_roundkey_dec(n - 2); + +#define round_dec4(n) \ + add_roundkey_dec4(n); \ + \ + F(RX0, RK0); \ + F(RX1, RK1); \ + F(RX2, RK2); \ + F(RX3, RK3); \ + \ + F(RX0, RK0); \ + F(RX1, RK1); \ + F(RX2, RK2); \ + F(RX3, RK3); + +#define read_block4() \ + movq (RIO), RX0; \ + rorq $32, RX0; \ + bswapq RX0; \ + \ + movq 8(RIO), RX1; \ + rorq $32, RX1; \ + bswapq RX1; \ + \ + movq 16(RIO), RX2; \ + rorq $32, RX2; \ + bswapq RX2; \ + \ + movq 24(RIO), RX3; \ + rorq $32, RX3; \ + bswapq RX3; + +#define write_block4() \ + bswapq RX0; \ + movq RX0, (RIO); \ + \ + bswapq RX1; \ + movq RX1, 8(RIO); \ + \ + bswapq RX2; \ + movq RX2, 16(RIO); \ + \ + bswapq RX3; \ + movq RX3, 24(RIO); + +#define xor_block4() \ + bswapq RX0; \ + xorq RX0, (RIO); \ + \ + bswapq RX1; \ + xorq RX1, 8(RIO); \ + \ + bswapq RX2; \ + xorq RX2, 16(RIO); \ + \ + bswapq RX3; \ + xorq RX3, 24(RIO); + +.align 8 +.global __blowfish_enc_blk_4way +.type __blowfish_enc_blk_4way,@function; + +__blowfish_enc_blk_4way: + // input: + // %rdi: ctx, CTX + // %rsi: dst + // %rdx: src + // %rcx: bool xor + pushq %rbp; + pushq %rbx; + pushq RKEY; + preload_roundkey_enc(0); + + pushq %rsi; + pushq %rcx; + movq %rdx, RIO; + + read_block4(); + + round_enc4(0); + round_enc4(2); + round_enc4(4); + round_enc4(6); + round_enc4(8); + round_enc4(10); + round_enc4(12); + round_enc4(14); + add_preloaded_roundkey4(); + + popq %rbp; + popq RIO; + + test %bpl, %bpl; + jnz __enc_xor4; + + write_block4(); + +__enc_ret4: + popq RKEY; + popq %rbx; + popq %rbp; + + ret; + +__enc_xor4: + xor_block4(); + + jmp __enc_ret4; + +.align 8 +.global blowfish_dec_blk_4way +.type blowfish_dec_blk_4way,@function; + +blowfish_dec_blk_4way: + // input: + // %rdi: ctx, CTX + // %rsi: dst + // %rdx: src + pushq %rbp; + pushq %rbx; + pushq RKEY; + preload_roundkey_dec(17); + + pushq %rsi; + movq %rdx, RIO; + + read_block4(); + + round_dec4(17); + round_dec4(15); + round_dec4(13); + round_dec4(11); + round_dec4(9); + round_dec4(7); + round_dec4(5); + round_dec4(3); + add_preloaded_roundkey4(); + + popq RIO; + write_block4(); + + popq RKEY; + popq %rbx; + popq %rbp; + + ret; + diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c new file mode 100644 index 0000000..40911ab --- /dev/null +++ b/arch/x86/crypto/blowfish_glue.c @@ -0,0 +1,487 @@ +/* + * Glue Code for assembler optimized version of Blowfish + * + * Copyright (c) 2011 Jussi Kivilinna + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include + +/* regular block cipher functions */ +asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); + +/* 4-way parallel cipher functions */ +asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, true); +} + +static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, true); +} + +static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + blowfish_enc_blk(crypto_tfm_ctx(tfm), dst, src); +} + +static void blowfish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + blowfish_dec_blk(crypto_tfm_ctx(tfm), dst, src); +} + +static struct crypto_alg bf_alg = { + .cra_name = "blowfish", + .cra_driver_name = "blowfish-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = BF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct bf_ctx), + .cra_alignmask = 3, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(bf_alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = BF_MIN_KEY_SIZE, + .cia_max_keysize = BF_MAX_KEY_SIZE, + .cia_setkey = blowfish_setkey, + .cia_encrypt = blowfish_encrypt, + .cia_decrypt = blowfish_decrypt, + } + } +}; + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + void (*fn)(struct bf_ctx *, u8 *, const u8 *), + void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *)) +{ + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = BF_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + /* Process four block batch */ + if (nbytes >= bsize * 4) { + do { + fn_4way(ctx, wdst, wsrc); + + wsrc += bsize * 4; + wdst += bsize * 4; + nbytes -= bsize * 4; + } while (nbytes >= bsize * 4); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + fn(ctx, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, blowfish_enc_blk, blowfish_enc_blk_4way); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, blowfish_dec_blk, blowfish_dec_blk_4way); +} + +static struct crypto_alg blk_ecb_alg = { + .cra_name = "ecb(blowfish)", + .cra_driver_name = "ecb-blowfish-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = BF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct bf_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = BF_MIN_KEY_SIZE, + .max_keysize = BF_MAX_KEY_SIZE, + .setkey = blowfish_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}; + +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = BF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 *iv = (u64 *)walk->iv; + + do { + *dst = *src ^ *iv; + blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + *(u64 *)walk->iv = *iv; + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = BF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 ivs[4 - 1]; + u64 last_iv; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process four block batch */ + if (nbytes >= bsize * 4) { + do { + nbytes -= bsize * 4 - bsize; + src -= 4 - 1; + dst -= 4 - 1; + + ivs[0] = src[0]; + ivs[1] = src[1]; + ivs[2] = src[2]; + + blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src); + + dst[1] ^= ivs[0]; + dst[2] ^= ivs[1]; + dst[3] ^= ivs[2]; + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + *dst ^= *(src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * 4); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + for (;;) { + blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + *dst ^= *(src - 1); + src -= 1; + dst -= 1; + } + +done: + *dst ^= *(u64 *)walk->iv; + *(u64 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_decrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static struct crypto_alg blk_cbc_alg = { + .cra_name = "cbc(blowfish)", + .cra_driver_name = "cbc-blowfish-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = BF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct bf_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = BF_MIN_KEY_SIZE, + .max_keysize = BF_MAX_KEY_SIZE, + .ivsize = BF_BLOCK_SIZE, + .setkey = blowfish_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}; + +static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk) +{ + u8 *ctrblk = walk->iv; + u8 keystream[BF_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + blowfish_enc_blk(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + + crypto_inc(ctrblk, BF_BLOCK_SIZE); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = BF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); + __be64 ctrblocks[4]; + + /* Process four block batch */ + if (nbytes >= bsize * 4) { + do { + if (dst != src) { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + } + + /* create ctrblks for parallel encrypt */ + ctrblocks[0] = cpu_to_be64(ctrblk++); + ctrblocks[1] = cpu_to_be64(ctrblk++); + ctrblocks[2] = cpu_to_be64(ctrblk++); + ctrblocks[3] = cpu_to_be64(ctrblk++); + + blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, + (u8 *)ctrblocks); + + src += 4; + dst += 4; + } while ((nbytes -= bsize * 4) >= bsize * 4); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (dst != src) + *dst = *src; + + ctrblocks[0] = cpu_to_be64(ctrblk++); + + blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks); + + src += 1; + dst += 1; + } while ((nbytes -= bsize) >= bsize); + +done: + *(__be64 *)walk->iv = cpu_to_be64(ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE); + + while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + if (walk.nbytes) { + ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +static struct crypto_alg blk_ctr_alg = { + .cra_name = "ctr(blowfish)", + .cra_driver_name = "ctr-blowfish-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = BF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct bf_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = BF_MIN_KEY_SIZE, + .max_keysize = BF_MAX_KEY_SIZE, + .ivsize = BF_BLOCK_SIZE, + .setkey = blowfish_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}; + +static int __init init(void) +{ + int err; + + err = crypto_register_alg(&bf_alg); + if (err) + goto bf_err; + err = crypto_register_alg(&blk_ecb_alg); + if (err) + goto ecb_err; + err = crypto_register_alg(&blk_cbc_alg); + if (err) + goto cbc_err; + err = crypto_register_alg(&blk_ctr_alg); + if (err) + goto ctr_err; + + return 0; + +ctr_err: + crypto_unregister_alg(&blk_cbc_alg); +cbc_err: + crypto_unregister_alg(&blk_ecb_alg); +ecb_err: + crypto_unregister_alg(&bf_alg); +bf_err: + return err; +} + +static void __exit fini(void) +{ + crypto_unregister_alg(&blk_ctr_alg); + crypto_unregister_alg(&blk_cbc_alg); + crypto_unregister_alg(&blk_ecb_alg); + crypto_unregister_alg(&bf_alg); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized"); +MODULE_ALIAS("blowfish"); +MODULE_ALIAS("blowfish-asm"); -- cgit v1.1 From 4a4cc2b6bf475c183443278808b758c702c01404 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Thu, 22 Sep 2011 21:33:01 +1000 Subject: crypto: aes-x86 - quiet sparse noise about symbol not declared Include to pick up the declarations for crypto_aes_encrypt_x86 and crypto_aes_decrypt_x86 to quiet the sparse noise: warning: symbol 'crypto_aes_encrypt_x86' was not declared. Should it be static? warning: symbol 'crypto_aes_decrypt_x86' was not declared. Should it be static? Signed-off-by: H Hartley Sweeten Acked-by: Mandeep Singh Baines Signed-off-by: Herbert Xu --- arch/x86/crypto/aes_glue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index 49ae9fe..b0b6950 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -4,6 +4,7 @@ */ #include +#include asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); -- cgit v1.1 From 55e901fc1f03dd8437f877813c68b6014cdbeefd Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 22 Sep 2011 09:17:57 +0100 Subject: xen/pci: support multi-segment systems Now that the hypercall interface changes are in -unstable, make the kernel side code not ignore the segment (aka domain) number anymore (which results in pretty odd behavior on such systems). Rather, if only the old interfaces are available, don't call them for devices on non-zero segments at all. Signed-off-by: Jan Beulich [v1: Edited git description] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index f567965..265fa88 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -185,6 +185,8 @@ static void xen_teardown_msi_irq(unsigned int irq) } #ifdef CONFIG_XEN_DOM0 +static bool __read_mostly pci_seg_supported = true; + static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { int ret = 0; @@ -202,10 +204,11 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) memset(&map_irq, 0, sizeof(map_irq)); map_irq.domid = domid; - map_irq.type = MAP_PIRQ_TYPE_MSI; + map_irq.type = MAP_PIRQ_TYPE_MSI_SEG; map_irq.index = -1; map_irq.pirq = -1; - map_irq.bus = dev->bus->number; + map_irq.bus = dev->bus->number | + (pci_domain_nr(dev->bus) << 16); map_irq.devfn = dev->devfn; if (type == PCI_CAP_ID_MSIX) { @@ -222,7 +225,20 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) map_irq.entry_nr = msidesc->msi_attrib.entry_nr; } - ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + ret = -EINVAL; + if (pci_seg_supported) + ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, + &map_irq); + if (ret == -EINVAL && !pci_domain_nr(dev->bus)) { + map_irq.type = MAP_PIRQ_TYPE_MSI; + map_irq.index = -1; + map_irq.pirq = -1; + map_irq.bus = dev->bus->number; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, + &map_irq); + if (ret != -EINVAL) + pci_seg_supported = false; + } if (ret) { dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n", ret, domid); -- cgit v1.1 From a867db10e89e12a3d97dedafdd411aa1527a6540 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 23 Sep 2011 16:32:47 -0400 Subject: xen/p2m: Make debug/xen/mmu/p2m visible again. We dropped a lot of the MMU debugfs in favour of using tracing API - but there is one which just provides mostly static information that was made invisible by this change. Bring it back. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 3 --- arch/x86/xen/mmu.c | 14 -------------- arch/x86/xen/p2m.c | 35 ++++++++++++++++++++++++++++++++--- 3 files changed, 32 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 64a619d..bc12c12 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -53,9 +53,6 @@ extern int m2p_remove_override(struct page *page, bool clear_pte); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); -#ifdef CONFIG_XEN_DEBUG_FS -extern int p2m_dump_show(struct seq_file *m, void *v); -#endif static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3c9aecd..4df0444 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2362,17 +2362,3 @@ out: return err; } EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); - -#ifdef CONFIG_XEN_DEBUG_FS -static int p2m_dump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, p2m_dump_show, NULL); -} - -static const struct file_operations p2m_dump_fops = { - .open = p2m_dump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 58efeb9..cc2f8dc 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -782,8 +782,9 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) EXPORT_SYMBOL_GPL(m2p_find_override_pfn); #ifdef CONFIG_XEN_DEBUG_FS - -int p2m_dump_show(struct seq_file *m, void *v) +#include +#include "debugfs.h" +static int p2m_dump_show(struct seq_file *m, void *v) { static const char * const level_name[] = { "top", "middle", "entry", "abnormal" }; @@ -856,4 +857,32 @@ int p2m_dump_show(struct seq_file *m, void *v) #undef TYPE_PFN #undef TYPE_UNKNOWN } -#endif + +static int p2m_dump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, p2m_dump_show, NULL); +} + +static const struct file_operations p2m_dump_fops = { + .open = p2m_dump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *d_mmu_debug; + +static int __init xen_p2m_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mmu_debug = debugfs_create_dir("mmu", d_xen); + + debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops); + return 0; +} +fs_initcall(xen_p2m_debugfs); +#endif /* CONFIG_XEN_DEBUG_FS */ -- cgit v1.1 From 0f4b49eaf25e661fbe63a5370b7781166b34d616 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 23 Sep 2011 17:36:07 -0400 Subject: xen/p2m: Use SetPagePrivate and its friends for M2P overrides. We use the page->private field and hence should use the proper macros and set proper bits. Also WARN_ON in case somebody tries to overwrite our data. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index cc2f8dc..6e56b65 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -692,8 +692,9 @@ int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; } - - page->private = mfn; + WARN_ON(PagePrivate(page)); + SetPagePrivate(page); + set_page_private(page, mfn); page->index = pfn_to_mfn(pfn); if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) @@ -736,7 +737,8 @@ int m2p_remove_override(struct page *page, bool clear_pte) list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); set_phys_to_machine(pfn, page->index); - + WARN_ON(!PagePrivate(page)); + ClearPagePrivate(page); if (clear_pte && !PageHighMem(page)) set_pte_at(&init_mm, address, ptep, pfn_pte(pfn, PAGE_KERNEL)); @@ -758,7 +760,7 @@ struct page *m2p_find_override(unsigned long mfn) spin_lock_irqsave(&m2p_override_lock, flags); list_for_each_entry(p, bucket, lru) { - if (p->private == mfn) { + if (page_private(p) == mfn) { ret = p; break; } -- cgit v1.1 From 41bc3186b3c92a4ca05e2aa14bb6272fb491e679 Mon Sep 17 00:00:00 2001 From: Zhao Jin Date: Mon, 19 Sep 2011 12:19:51 +0800 Subject: KVM: MMU: fix incorrect return of spte __update_clear_spte_slow should return original spte while the current code returns low half of original spte combined with high half of new spte. Signed-off-by: Zhao Jin Reviewed-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1c5b693..8e8da79 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -400,7 +400,8 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) /* xchg acts as a barrier before the setting of the high bits */ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); - orig.spte_high = ssptep->spte_high = sspte.spte_high; + orig.spte_high = ssptep->spte_high; + ssptep->spte_high = sspte.spte_high; count_spte_clear(sptep, spte); return orig.spte; -- cgit v1.1 From 9be3be1f153e90ea4e1e5b6ed1d72a73d44318d1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:38 +0300 Subject: KVM: x86 emulator: fix Src2CL decode Src2CL decode (used for double width shifts) erronously decodes only bit 3 of %rcx, instead of bits 7:0. Fix by decoding %cl in its entirety. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6f08bc9..8b4cc5f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3603,7 +3603,7 @@ done_prefixes: break; case Src2CL: ctxt->src2.bytes = 1; - ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8; + ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0xff; break; case Src2ImmByte: rc = decode_imm(ctxt, &ctxt->src2, 1, true); -- cgit v1.1 From ca7d58f375c650cf36900cb1da1ca2cc99b13393 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 Jul 2011 14:31:08 +0800 Subject: KVM: x86: fix broken read emulation spans a page boundary If the range spans a page boundary, the mmio access can be broke, fix it as write emulation. And we already get the guest physical address, so use it to read guest data directly to avoid walking guest page table again Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84a28ea..1453248 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4045,13 +4045,12 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 0; } -static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *exception) +static int emulator_read_emulated_onepage(unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *exception, + struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; int handled, ret; @@ -4071,8 +4070,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, if (ret) goto mmio; - if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) - == X86EMUL_CONTINUE) + if (!kvm_read_guest(vcpu->kvm, gpa, val, bytes)) return X86EMUL_CONTINUE; mmio: @@ -4101,6 +4099,31 @@ mmio: return X86EMUL_IO_NEEDED; } +static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *exception) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + /* Crossing a page boundary? */ + if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { + int rc, now; + + now = -addr & ~PAGE_MASK; + rc = emulator_read_emulated_onepage(addr, val, now, exception, + vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + addr += now; + val += now; + bytes -= now; + } + return emulator_read_emulated_onepage(addr, val, bytes, exception, + vcpu); +} + int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes) { -- cgit v1.1 From 77d197b2ca37b33b0461ab1e2dbe40cbe4a6fd6a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 Jul 2011 14:31:50 +0800 Subject: KVM: x86: abstract the operation for read/write emulation The operations of read emulation and write emulation are very similar, so we can abstract the operation of them, in larter patch, it is used to cleanup the same code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1453248..f5c60a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4136,6 +4136,78 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, return 1; } +struct read_write_emulator_ops { + int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val, + int bytes); + int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes); + int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, + int bytes, void *val); + int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes); + bool write; +}; + +static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) +{ + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, + vcpu->mmio_phys_addr, *(u64 *)val); + vcpu->mmio_read_completed = 0; + return 1; + } + + return 0; +} + +static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + return !kvm_read_guest(vcpu->kvm, gpa, val, bytes); +} + +static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + return emulator_write_phys(vcpu, gpa, val, bytes); +} + +static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) +{ + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + return vcpu_mmio_write(vcpu, gpa, bytes, val); +} + +static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); + return X86EMUL_IO_NEEDED; +} + +static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + memcpy(vcpu->mmio_data, val, bytes); + memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + return X86EMUL_CONTINUE; +} + +static struct read_write_emulator_ops read_emultor = { + .read_write_prepare = read_prepare, + .read_write_emulate = read_emulate, + .read_write_mmio = vcpu_mmio_read, + .read_write_exit_mmio = read_exit_mmio, +}; + +static struct read_write_emulator_ops write_emultor = { + .read_write_emulate = write_emulate, + .read_write_mmio = write_mmio, + .read_write_exit_mmio = write_exit_mmio, + .write = true, +}; + static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, -- cgit v1.1 From 22388a3c8ce2a2a004ce764194cce8a2f9b13d66 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 Jul 2011 14:32:31 +0800 Subject: KVM: x86: cleanup the code of read/write emulation Using the read/write operation to remove the same code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 146 +++++++++++++++++------------------------------------ 1 file changed, 45 insertions(+), 101 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f5c60a8..2b76ae3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4045,85 +4045,6 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 0; } -static int emulator_read_emulated_onepage(unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) -{ - gpa_t gpa; - int handled, ret; - - if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); - trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_phys_addr, *(u64 *)val); - vcpu->mmio_read_completed = 0; - return X86EMUL_CONTINUE; - } - - ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false); - - if (ret < 0) - return X86EMUL_PROPAGATE_FAULT; - - if (ret) - goto mmio; - - if (!kvm_read_guest(vcpu->kvm, gpa, val, bytes)) - return X86EMUL_CONTINUE; - -mmio: - /* - * Is this MMIO handled locally? - */ - handled = vcpu_mmio_read(vcpu, gpa, bytes, val); - - if (handled == bytes) - return X86EMUL_CONTINUE; - - gpa += handled; - bytes -= handled; - val += handled; - - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); - - vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; - vcpu->mmio_index = 0; - - return X86EMUL_IO_NEEDED; -} - -static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *exception) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - /* Crossing a page boundary? */ - if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { - int rc, now; - - now = -addr & ~PAGE_MASK; - rc = emulator_read_emulated_onepage(addr, val, now, exception, - vcpu); - if (rc != X86EMUL_CONTINUE) - return rc; - addr += now; - val += now; - bytes -= now; - } - return emulator_read_emulated_onepage(addr, val, bytes, exception, - vcpu); -} - int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes) { @@ -4208,16 +4129,21 @@ static struct read_write_emulator_ops write_emultor = { .write = true, }; -static int emulator_write_emulated_onepage(unsigned long addr, - const void *val, - unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) +static int emulator_read_write_onepage(unsigned long addr, void *val, + unsigned int bytes, + struct x86_exception *exception, + struct kvm_vcpu *vcpu, + struct read_write_emulator_ops *ops) { gpa_t gpa; int handled, ret; + bool write = ops->write; + + if (ops->read_write_prepare && + ops->read_write_prepare(vcpu, val, bytes)) + return X86EMUL_CONTINUE; - ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true); + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); if (ret < 0) return X86EMUL_PROPAGATE_FAULT; @@ -4226,15 +4152,14 @@ static int emulator_write_emulated_onepage(unsigned long addr, if (ret) goto mmio; - if (emulator_write_phys(vcpu, gpa, val, bytes)) + if (ops->read_write_emulate(vcpu, gpa, val, bytes)) return X86EMUL_CONTINUE; mmio: - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); /* * Is this MMIO handled locally? */ - handled = vcpu_mmio_write(vcpu, gpa, bytes, val); + handled = ops->read_write_mmio(vcpu, gpa, bytes, val); if (handled == bytes) return X86EMUL_CONTINUE; @@ -4243,23 +4168,20 @@ mmio: val += handled; vcpu->mmio_needed = 1; - memcpy(vcpu->mmio_data, val, bytes); vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; vcpu->mmio_size = bytes; vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; - memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + vcpu->run->mmio.is_write = vcpu->mmio_is_write = write; vcpu->mmio_index = 0; - return X86EMUL_CONTINUE; + return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); } -int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - const void *val, - unsigned int bytes, - struct x86_exception *exception) +int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes, + struct x86_exception *exception, + struct read_write_emulator_ops *ops) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); @@ -4268,16 +4190,38 @@ int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, exception, - vcpu); + rc = emulator_read_write_onepage(addr, val, now, exception, + vcpu, ops); + if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, exception, - vcpu); + + return emulator_read_write_onepage(addr, val, bytes, exception, + vcpu, ops); +} + +static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *exception) +{ + return emulator_read_write(ctxt, addr, val, bytes, + exception, &read_emultor); +} + +int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, + const void *val, + unsigned int bytes, + struct x86_exception *exception) +{ + return emulator_read_write(ctxt, addr, (void *)val, bytes, + exception, &write_emultor); } #define CMPXCHG_TYPE(t, ptr, old, new) \ -- cgit v1.1 From 8c3ba334f8588e1d5099f8602cf01897720e0eca Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 18 Jul 2011 17:17:15 +0300 Subject: KVM: x86: Raise the hard VCPU count limit The patch raises the hard limit of VCPU count to 254. This will allow developers to easily work on scalability and will allow users to test high VCPU setups easily without patching the kernel. To prevent possible issues with current setups, KVM_CAP_NR_VCPUS now returns the recommended VCPU limit (which is still 64) - this should be a safe value for everybody, while a new KVM_CAP_MAX_VCPUS returns the hard limit which is now 254. Cc: Avi Kivity Cc: Ingo Molnar Cc: Marcelo Tosatti Cc: Pekka Enberg Suggested-by: Pekka Enberg Signed-off-by: Sasha Levin Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/x86.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd51c83..c00ec28 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -26,7 +26,8 @@ #include #include -#define KVM_MAX_VCPUS 64 +#define KVM_MAX_VCPUS 254 +#define KVM_SOFT_MAX_VCPUS 64 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b76ae3..41dfebe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2086,6 +2086,9 @@ int kvm_dev_ioctl_check_extension(long ext) r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; case KVM_CAP_NR_VCPUS: + r = KVM_SOFT_MAX_VCPUS; + break; + case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; case KVM_CAP_NR_MEMSLOTS: -- cgit v1.1 From 14fa67ee95d4f7313fbf149fe37faccb903857c8 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Thu, 21 Jul 2011 15:38:10 -0700 Subject: KVM: x86: get_msr support for HV_X64_MSR_APIC_ASSIST_PAGE "get" support for the HV_X64_MSR_APIC_ASSIST_PAGE msr was missing, even though it is explicitly enumerated as something the vmm should save in msrs_to_save and reported to userland via the KVM_GET_MSR_INDEX_LIST ioctl. Add "get" support for HV_X64_MSR_APIC_ASSIST_PAGE. We simply return the guest visible value of this register, which seems to be correct as a set on the register is validated for us already. Signed-off-by: Mike Waychison Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41dfebe..e80f0d7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1825,6 +1825,8 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + case HV_X64_MSR_APIC_ASSIST_PAGE: + return vcpu->arch.hv_vapic; default: pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; -- cgit v1.1 From d1613ad5d0018a009bd4865b0fa5930abb5ed259 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Sat, 23 Jul 2011 00:31:45 -0700 Subject: KVM: Really fix HV_X64_MSR_APIC_ASSIST_PAGE Commit 0945d4b228 tried to fix the get_msr path for the HV_X64_MSR_APIC_ASSIST_PAGE msr, but was poorly tested. We should be returning 0 if the read succeeded, and passing the value back to the caller via the pdata out argument, not returning the value directly. Signed-off-by: Mike Waychison Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e80f0d7..6cb353c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1826,7 +1826,8 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); case HV_X64_MSR_APIC_ASSIST_PAGE: - return vcpu->arch.hv_vapic; + data = vcpu->arch.hv_vapic; + break; default: pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; -- cgit v1.1 From e097e5ffd69cbd7be61466e2d54c145468d48073 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 22 Jul 2011 12:46:52 +0100 Subject: KVM: Record instruction set in all vmexit tracepoints The kvm_exit tracepoint recently added the isa argument to aid decoding exit_reason. The semantics of exit_reason depend on the instruction set (vmx or svm) and the isa argument allows traces to be analyzed on other machines. Add the isa argument to kvm_nested_vmexit and kvm_nested_vmexit_inject so these tracepoints can also be self-describing. Signed-off-by: Stefan Hajnoczi Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 6 ++++-- arch/x86/kvm/trace.h | 12 ++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 475d1c9..6adb7ba 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2182,7 +2182,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) vmcb->control.exit_info_1, vmcb->control.exit_info_2, vmcb->control.exit_int_info, - vmcb->control.exit_int_info_err); + vmcb->control.exit_int_info_err, + KVM_ISA_SVM); nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); if (!nested_vmcb) @@ -3335,7 +3336,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm->vmcb->control.exit_info_1, svm->vmcb->control.exit_info_2, svm->vmcb->control.exit_int_info, - svm->vmcb->control.exit_int_info_err); + svm->vmcb->control.exit_int_info_err, + KVM_ISA_SVM); vmexit = nested_svm_exit_special(svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 3ff898c..4e1716b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -486,9 +486,9 @@ TRACE_EVENT(kvm_nested_intercepts, TRACE_EVENT(kvm_nested_vmexit, TP_PROTO(__u64 rip, __u32 exit_code, __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err), + __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(rip, exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err), + exit_int_info, exit_int_info_err, isa), TP_STRUCT__entry( __field( __u64, rip ) @@ -497,6 +497,7 @@ TRACE_EVENT(kvm_nested_vmexit, __field( __u64, exit_info2 ) __field( __u32, exit_int_info ) __field( __u32, exit_int_info_err ) + __field( __u32, isa ) ), TP_fast_assign( @@ -506,6 +507,7 @@ TRACE_EVENT(kvm_nested_vmexit, __entry->exit_info2 = exit_info2; __entry->exit_int_info = exit_int_info; __entry->exit_int_info_err = exit_int_info_err; + __entry->isa = isa; ), TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", @@ -522,9 +524,9 @@ TRACE_EVENT(kvm_nested_vmexit, TRACE_EVENT(kvm_nested_vmexit_inject, TP_PROTO(__u32 exit_code, __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err), + __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err), + exit_int_info, exit_int_info_err, isa), TP_STRUCT__entry( __field( __u32, exit_code ) @@ -532,6 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject, __field( __u64, exit_info2 ) __field( __u32, exit_int_info ) __field( __u32, exit_int_info_err ) + __field( __u32, isa ) ), TP_fast_assign( @@ -540,6 +543,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject, __entry->exit_info2 = exit_info2; __entry->exit_int_info = exit_int_info; __entry->exit_int_info_err = exit_int_info_err; + __entry->isa = isa; ), TP_printk("reason: %s ext_inf1: 0x%016llx " -- cgit v1.1 From 0d460ffc0956d2dbe12ca9f5f6aa0f8701ea9d73 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 22 Jul 2011 12:46:53 +0100 Subject: KVM: Use __print_symbolic() for vmexit tracepoints The vmexit tracepoints format the exit_reason to make it human-readable. Since the exit_reason depends on the instruction set (vmx or svm), formatting is handled with ftrace_print_symbols_seq() by referring to the appropriate exit reason table. However, the ftrace_print_symbols_seq() function is not meant to be used directly in tracepoints since it does not export the formatting table which userspace tools like trace-cmd and perf use to format traces. In practice perf dies when formatting vmexit-related events and trace-cmd falls back to printing the numeric value (with extra formatting code in the kvm plugin to paper over this limitation). Other userspace consumers of vmexit-related tracepoints would be in similar trouble. To avoid significant changes to the kvm_exit tracepoint, this patch moves the vmx and svm exit reason tables into arch/x86/kvm/trace.h and selects the right table with __print_symbolic() depending on the instruction set. Note that __print_symbolic() is designed for exporting the formatting table to userspace and allows trace-cmd and perf to work. Signed-off-by: Stefan Hajnoczi Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 - arch/x86/kvm/svm.c | 55 --------------------- arch/x86/kvm/trace.h | 106 +++++++++++++++++++++++++++++++++++++--- arch/x86/kvm/vmx.c | 44 ----------------- 4 files changed, 100 insertions(+), 107 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c00ec28..307e3cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -635,8 +635,6 @@ struct kvm_x86_ops { int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage); - - const struct trace_print_flags *exit_reasons_str; }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6adb7ba..2b24a88 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3899,60 +3899,6 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) } } -static const struct trace_print_flags svm_exit_reasons_str[] = { - { SVM_EXIT_READ_CR0, "read_cr0" }, - { SVM_EXIT_READ_CR3, "read_cr3" }, - { SVM_EXIT_READ_CR4, "read_cr4" }, - { SVM_EXIT_READ_CR8, "read_cr8" }, - { SVM_EXIT_WRITE_CR0, "write_cr0" }, - { SVM_EXIT_WRITE_CR3, "write_cr3" }, - { SVM_EXIT_WRITE_CR4, "write_cr4" }, - { SVM_EXIT_WRITE_CR8, "write_cr8" }, - { SVM_EXIT_READ_DR0, "read_dr0" }, - { SVM_EXIT_READ_DR1, "read_dr1" }, - { SVM_EXIT_READ_DR2, "read_dr2" }, - { SVM_EXIT_READ_DR3, "read_dr3" }, - { SVM_EXIT_WRITE_DR0, "write_dr0" }, - { SVM_EXIT_WRITE_DR1, "write_dr1" }, - { SVM_EXIT_WRITE_DR2, "write_dr2" }, - { SVM_EXIT_WRITE_DR3, "write_dr3" }, - { SVM_EXIT_WRITE_DR5, "write_dr5" }, - { SVM_EXIT_WRITE_DR7, "write_dr7" }, - { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, - { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, - { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, - { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, - { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, - { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, - { SVM_EXIT_INTR, "interrupt" }, - { SVM_EXIT_NMI, "nmi" }, - { SVM_EXIT_SMI, "smi" }, - { SVM_EXIT_INIT, "init" }, - { SVM_EXIT_VINTR, "vintr" }, - { SVM_EXIT_CPUID, "cpuid" }, - { SVM_EXIT_INVD, "invd" }, - { SVM_EXIT_HLT, "hlt" }, - { SVM_EXIT_INVLPG, "invlpg" }, - { SVM_EXIT_INVLPGA, "invlpga" }, - { SVM_EXIT_IOIO, "io" }, - { SVM_EXIT_MSR, "msr" }, - { SVM_EXIT_TASK_SWITCH, "task_switch" }, - { SVM_EXIT_SHUTDOWN, "shutdown" }, - { SVM_EXIT_VMRUN, "vmrun" }, - { SVM_EXIT_VMMCALL, "hypercall" }, - { SVM_EXIT_VMLOAD, "vmload" }, - { SVM_EXIT_VMSAVE, "vmsave" }, - { SVM_EXIT_STGI, "stgi" }, - { SVM_EXIT_CLGI, "clgi" }, - { SVM_EXIT_SKINIT, "skinit" }, - { SVM_EXIT_WBINVD, "wbinvd" }, - { SVM_EXIT_MONITOR, "monitor" }, - { SVM_EXIT_MWAIT, "mwait" }, - { SVM_EXIT_XSETBV, "xsetbv" }, - { SVM_EXIT_NPF, "npf" }, - { -1, NULL } -}; - static int svm_get_lpage_level(void) { return PT_PDPE_LEVEL; @@ -4225,7 +4171,6 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, - .exit_reasons_str = svm_exit_reasons_str, .get_lpage_level = svm_get_lpage_level, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4e1716b..911d264 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -2,6 +2,8 @@ #define _TRACE_KVM_H #include +#include +#include #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -181,6 +183,95 @@ TRACE_EVENT(kvm_apic, #define KVM_ISA_VMX 1 #define KVM_ISA_SVM 2 +#define VMX_EXIT_REASONS \ + { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ + { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ + { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ + { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ + { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ + { EXIT_REASON_CPUID, "CPUID" }, \ + { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVLPG, "INVLPG" }, \ + { EXIT_REASON_RDPMC, "RDPMC" }, \ + { EXIT_REASON_RDTSC, "RDTSC" }, \ + { EXIT_REASON_VMCALL, "VMCALL" }, \ + { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ + { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ + { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ + { EXIT_REASON_VMPTRST, "VMPTRST" }, \ + { EXIT_REASON_VMREAD, "VMREAD" }, \ + { EXIT_REASON_VMRESUME, "VMRESUME" }, \ + { EXIT_REASON_VMWRITE, "VMWRITE" }, \ + { EXIT_REASON_VMOFF, "VMOFF" }, \ + { EXIT_REASON_VMON, "VMON" }, \ + { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ + { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ + { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ + { EXIT_REASON_MSR_READ, "MSR_READ" }, \ + { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ + { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ + { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ + { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ + { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ + { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_WBINVD, "WBINVD" } + +#define SVM_EXIT_REASONS \ + { SVM_EXIT_READ_CR0, "read_cr0" }, \ + { SVM_EXIT_READ_CR3, "read_cr3" }, \ + { SVM_EXIT_READ_CR4, "read_cr4" }, \ + { SVM_EXIT_READ_CR8, "read_cr8" }, \ + { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ + { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ + { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ + { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ + { SVM_EXIT_READ_DR0, "read_dr0" }, \ + { SVM_EXIT_READ_DR1, "read_dr1" }, \ + { SVM_EXIT_READ_DR2, "read_dr2" }, \ + { SVM_EXIT_READ_DR3, "read_dr3" }, \ + { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ + { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ + { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ + { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ + { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ + { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ + { SVM_EXIT_INTR, "interrupt" }, \ + { SVM_EXIT_NMI, "nmi" }, \ + { SVM_EXIT_SMI, "smi" }, \ + { SVM_EXIT_INIT, "init" }, \ + { SVM_EXIT_VINTR, "vintr" }, \ + { SVM_EXIT_CPUID, "cpuid" }, \ + { SVM_EXIT_INVD, "invd" }, \ + { SVM_EXIT_HLT, "hlt" }, \ + { SVM_EXIT_INVLPG, "invlpg" }, \ + { SVM_EXIT_INVLPGA, "invlpga" }, \ + { SVM_EXIT_IOIO, "io" }, \ + { SVM_EXIT_MSR, "msr" }, \ + { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ + { SVM_EXIT_SHUTDOWN, "shutdown" }, \ + { SVM_EXIT_VMRUN, "vmrun" }, \ + { SVM_EXIT_VMMCALL, "hypercall" }, \ + { SVM_EXIT_VMLOAD, "vmload" }, \ + { SVM_EXIT_VMSAVE, "vmsave" }, \ + { SVM_EXIT_STGI, "stgi" }, \ + { SVM_EXIT_CLGI, "clgi" }, \ + { SVM_EXIT_SKINIT, "skinit" }, \ + { SVM_EXIT_WBINVD, "wbinvd" }, \ + { SVM_EXIT_MONITOR, "monitor" }, \ + { SVM_EXIT_MWAIT, "mwait" }, \ + { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_NPF, "npf" } + /* * Tracepoint for kvm guest exit: */ @@ -205,8 +296,9 @@ TRACE_EVENT(kvm_exit, ), TP_printk("reason %s rip 0x%lx info %llx %llx", - ftrace_print_symbols_seq(p, __entry->exit_reason, - kvm_x86_ops->exit_reasons_str), + (__entry->isa == KVM_ISA_VMX) ? + __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : + __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), __entry->guest_rip, __entry->info1, __entry->info2) ); @@ -512,8 +604,9 @@ TRACE_EVENT(kvm_nested_vmexit, TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", __entry->rip, - ftrace_print_symbols_seq(p, __entry->exit_code, - kvm_x86_ops->exit_reasons_str), + (__entry->isa == KVM_ISA_VMX) ? + __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : + __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), __entry->exit_info1, __entry->exit_info2, __entry->exit_int_info, __entry->exit_int_info_err) ); @@ -548,8 +641,9 @@ TRACE_EVENT(kvm_nested_vmexit_inject, TP_printk("reason: %s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - ftrace_print_symbols_seq(p, __entry->exit_code, - kvm_x86_ops->exit_reasons_str), + (__entry->isa == KVM_ISA_VMX) ? + __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : + __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), __entry->exit_info1, __entry->exit_info2, __entry->exit_int_info, __entry->exit_int_info_err) ); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e65a158..e26629f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6241,49 +6241,6 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return ret; } -#define _ER(x) { EXIT_REASON_##x, #x } - -static const struct trace_print_flags vmx_exit_reasons_str[] = { - _ER(EXCEPTION_NMI), - _ER(EXTERNAL_INTERRUPT), - _ER(TRIPLE_FAULT), - _ER(PENDING_INTERRUPT), - _ER(NMI_WINDOW), - _ER(TASK_SWITCH), - _ER(CPUID), - _ER(HLT), - _ER(INVLPG), - _ER(RDPMC), - _ER(RDTSC), - _ER(VMCALL), - _ER(VMCLEAR), - _ER(VMLAUNCH), - _ER(VMPTRLD), - _ER(VMPTRST), - _ER(VMREAD), - _ER(VMRESUME), - _ER(VMWRITE), - _ER(VMOFF), - _ER(VMON), - _ER(CR_ACCESS), - _ER(DR_ACCESS), - _ER(IO_INSTRUCTION), - _ER(MSR_READ), - _ER(MSR_WRITE), - _ER(MWAIT_INSTRUCTION), - _ER(MONITOR_INSTRUCTION), - _ER(PAUSE_INSTRUCTION), - _ER(MCE_DURING_VMENTRY), - _ER(TPR_BELOW_THRESHOLD), - _ER(APIC_ACCESS), - _ER(EPT_VIOLATION), - _ER(EPT_MISCONFIG), - _ER(WBINVD), - { -1, NULL } -}; - -#undef _ER - static int vmx_get_lpage_level(void) { if (enable_ept && !cpu_has_vmx_ept_1g_page()) @@ -7039,7 +6996,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vmx_get_exit_info, - .exit_reasons_str = vmx_exit_reasons_str, .get_lpage_level = vmx_get_lpage_level, -- cgit v1.1 From 743eeb0b01d2fbf4154bf87bff1ebb6fb18aeb7a Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 27 Jul 2011 16:00:48 +0300 Subject: KVM: Intelligent device lookup on I/O bus Currently the method of dealing with an IO operation on a bus (PIO/MMIO) is to call the read or write callback for each device registered on the bus until we find a device which handles it. Since the number of devices on a bus can be significant due to ioeventfds and coalesced MMIO zones, this leads to a lot of overhead on each IO operation. Instead of registering devices, we now register ranges which points to a device. Lookup is done using an efficient bsearch instead of a linear search. Performance test was conducted by comparing exit count per second with 200 ioeventfds created on one byte and the guest is trying to access a different byte continuously (triggering usermode exits). Before the patch the guest has achieved 259k exits per second, after the patch the guest does 274k exits per second. Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: Sasha Levin Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 6 ++- arch/x86/kvm/i8259.c | 108 ++++++++++++++++++++++++++++++++++++++++++--------- arch/x86/kvm/irq.h | 4 +- arch/x86/kvm/x86.c | 6 ++- 4 files changed, 101 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index efad723..76e3f1c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -713,14 +713,16 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, + KVM_PIT_MEM_LENGTH, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, - &pit->speaker_dev); + KVM_SPEAKER_BASE_ADDRESS, 4, + &pit->speaker_dev); if (ret < 0) goto fail_unregister; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 19fe855..6b869ce 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -459,15 +459,9 @@ static int picdev_in_range(gpa_t addr) } } -static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_pic, dev); -} - -static int picdev_write(struct kvm_io_device *this, +static int picdev_write(struct kvm_pic *s, gpa_t addr, int len, const void *val) { - struct kvm_pic *s = to_pic(this); unsigned char data = *(unsigned char *)val; if (!picdev_in_range(addr)) return -EOPNOTSUPP; @@ -494,10 +488,9 @@ static int picdev_write(struct kvm_io_device *this, return 0; } -static int picdev_read(struct kvm_io_device *this, +static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { - struct kvm_pic *s = to_pic(this); unsigned char data = 0; if (!picdev_in_range(addr)) return -EOPNOTSUPP; @@ -525,6 +518,48 @@ static int picdev_read(struct kvm_io_device *this, return 0; } +static int picdev_master_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_master), + addr, len, val); +} + +static int picdev_master_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return picdev_read(container_of(dev, struct kvm_pic, dev_master), + addr, len, val); +} + +static int picdev_slave_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_slave), + addr, len, val); +} + +static int picdev_slave_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return picdev_read(container_of(dev, struct kvm_pic, dev_slave), + addr, len, val); +} + +static int picdev_eclr_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), + addr, len, val); +} + +static int picdev_eclr_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), + addr, len, val); +} + /* * callback when PIC0 irq status changed */ @@ -537,9 +572,19 @@ static void pic_irq_request(struct kvm *kvm, int level) s->output = level; } -static const struct kvm_io_device_ops picdev_ops = { - .read = picdev_read, - .write = picdev_write, +static const struct kvm_io_device_ops picdev_master_ops = { + .read = picdev_master_read, + .write = picdev_master_write, +}; + +static const struct kvm_io_device_ops picdev_slave_ops = { + .read = picdev_slave_read, + .write = picdev_slave_write, +}; + +static const struct kvm_io_device_ops picdev_eclr_ops = { + .read = picdev_eclr_read, + .write = picdev_eclr_write, }; struct kvm_pic *kvm_create_pic(struct kvm *kvm) @@ -560,16 +605,39 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) /* * Initialize PIO device */ - kvm_iodevice_init(&s->dev, &picdev_ops); + kvm_iodevice_init(&s->dev_master, &picdev_master_ops); + kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops); + kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops); mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2, + &s->dev_master); + if (ret < 0) + goto fail_unlock; + + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave); + if (ret < 0) + goto fail_unreg_2; + + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); + if (ret < 0) + goto fail_unreg_1; + mutex_unlock(&kvm->slots_lock); - if (ret < 0) { - kfree(s); - return NULL; - } return s; + +fail_unreg_1: + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); + +fail_unreg_2: + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master); + +fail_unlock: + mutex_unlock(&kvm->slots_lock); + + kfree(s); + + return NULL; } void kvm_destroy_pic(struct kvm *kvm) @@ -577,7 +645,9 @@ void kvm_destroy_pic(struct kvm *kvm) struct kvm_pic *vpic = kvm->arch.vpic; if (vpic) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr); kvm->arch.vpic = NULL; kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 53e2d08..2086f2b 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -66,7 +66,9 @@ struct kvm_pic { struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ int output; /* intr from master PIC */ - struct kvm_io_device dev; + struct kvm_io_device dev_master; + struct kvm_io_device dev_slave; + struct kvm_io_device dev_eclr; void (*ack_notifier)(void *opaque, int irq); unsigned long irq_states[16]; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6cb353c..d28dff7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3562,7 +3562,11 @@ long kvm_arch_vm_ioctl(struct file *filp, if (r) { mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev); + &vpic->dev_master); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev_slave); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev_eclr); mutex_unlock(&kvm->slots_lock); kfree(vpic); goto create_irqchip_unlock; -- cgit v1.1 From 807941b121cf77e70eec8db308b8c1f496cc79e9 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 30 Jul 2011 18:00:17 +0900 Subject: KVM: x86 emulator: Use ctxt->_eip directly in do_insn_fetch_byte() Instead of passing ctxt->_eip from insn_fetch() call sites, get it from ctxt in do_insn_fetch_byte(). This is done by replacing the argument _eip of insn_fetch() with _ctxt, which should be better than letting the macro use ctxt silently in its body. Though this changes the place where ctxt->_eip is incremented from insn_fetch() to do_insn_fetch_byte(), this does not have any real effect. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 79 +++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8b4cc5f..ce48dc4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -651,18 +651,26 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); } -static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, - unsigned long eip, u8 *dest) +/* + * Fetch the next byte of the instruction being emulated which is pointed to + * by ctxt->_eip, then increment ctxt->_eip. + * + * Also prefetch the remaining bytes of the instruction without crossing page + * boundary if they are not in fetch_cache yet. + */ +static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest) { struct fetch_cache *fc = &ctxt->fetch; int rc; int size, cur_size; - if (eip == fc->end) { + if (ctxt->_eip == fc->end) { unsigned long linear; - struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip}; + struct segmented_address addr = { .seg = VCPU_SREG_CS, + .ea = ctxt->_eip }; cur_size = fc->end - fc->start; - size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); + size = min(15UL - cur_size, + PAGE_SIZE - offset_in_page(ctxt->_eip)); rc = __linearize(ctxt, addr, size, false, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; @@ -672,20 +680,21 @@ static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, return rc; fc->end += size; } - *dest = fc->data[eip - fc->start]; + *dest = fc->data[ctxt->_eip - fc->start]; + ctxt->_eip++; return X86EMUL_CONTINUE; } static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - unsigned long eip, void *dest, unsigned size) + void *dest, unsigned size) { int rc; /* x86 instructions are limited to 15 bytes. */ - if (eip + size - ctxt->eip > 15) + if (ctxt->_eip + size - ctxt->eip > 15) return X86EMUL_UNHANDLEABLE; while (size--) { - rc = do_insn_fetch_byte(ctxt, eip++, dest++); + rc = do_insn_fetch_byte(ctxt, dest++); if (rc != X86EMUL_CONTINUE) return rc; } @@ -693,20 +702,18 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, } /* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _eip) \ +#define insn_fetch(_type, _size, _ctxt) \ ({ unsigned long _x; \ - rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \ + rc = do_insn_fetch(_ctxt, &_x, (_size)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - (_eip) += (_size); \ (_type)_x; \ }) -#define insn_fetch_arr(_arr, _size, _eip) \ -({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \ +#define insn_fetch_arr(_arr, _size, _ctxt) \ +({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - (_eip) += (_size); \ }) /* @@ -894,7 +901,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ } - ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + ctxt->modrm = insn_fetch(u8, 1, ctxt); ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ctxt->modrm_rm |= (ctxt->modrm & 0x07); @@ -928,13 +935,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 6) - modrm_ea += insn_fetch(u16, 2, ctxt->_eip); + modrm_ea += insn_fetch(u16, 2, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, 1, ctxt->_eip); + modrm_ea += insn_fetch(s8, 1, ctxt); break; case 2: - modrm_ea += insn_fetch(u16, 2, ctxt->_eip); + modrm_ea += insn_fetch(u16, 2, ctxt); break; } switch (ctxt->modrm_rm) { @@ -971,13 +978,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } else { /* 32/64-bit ModR/M decode. */ if ((ctxt->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, 1, ctxt->_eip); + sib = insn_fetch(u8, 1, ctxt); index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) - modrm_ea += insn_fetch(s32, 4, ctxt->_eip); + modrm_ea += insn_fetch(s32, 4, ctxt); else modrm_ea += ctxt->regs[base_reg]; if (index_reg != 4) @@ -990,13 +997,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 5) - modrm_ea += insn_fetch(s32, 4, ctxt->_eip); + modrm_ea += insn_fetch(s32, 4, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, 1, ctxt->_eip); + modrm_ea += insn_fetch(s8, 1, ctxt); break; case 2: - modrm_ea += insn_fetch(s32, 4, ctxt->_eip); + modrm_ea += insn_fetch(s32, 4, ctxt); break; } } @@ -1013,13 +1020,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; switch (ctxt->ad_bytes) { case 2: - op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip); + op->addr.mem.ea = insn_fetch(u16, 2, ctxt); break; case 4: - op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip); + op->addr.mem.ea = insn_fetch(u32, 4, ctxt); break; case 8: - op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip); + op->addr.mem.ea = insn_fetch(u64, 8, ctxt); break; } done: @@ -3309,13 +3316,13 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, /* NB. Immediates are sign-extended as necessary. */ switch (op->bytes) { case 1: - op->val = insn_fetch(s8, 1, ctxt->_eip); + op->val = insn_fetch(s8, 1, ctxt); break; case 2: - op->val = insn_fetch(s16, 2, ctxt->_eip); + op->val = insn_fetch(s16, 2, ctxt); break; case 4: - op->val = insn_fetch(s32, 4, ctxt->_eip); + op->val = insn_fetch(s32, 4, ctxt); break; } if (!sign_extension) { @@ -3374,7 +3381,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) /* Legacy prefixes. */ for (;;) { - switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) { + switch (ctxt->b = insn_fetch(u8, 1, ctxt)) { case 0x66: /* operand-size override */ op_prefix = true; /* switch between 2/4 bytes */ @@ -3430,7 +3437,7 @@ done_prefixes: /* Two-byte opcode? */ if (ctxt->b == 0x0f) { ctxt->twobyte = 1; - ctxt->b = insn_fetch(u8, 1, ctxt->_eip); + ctxt->b = insn_fetch(u8, 1, ctxt); opcode = twobyte_table[ctxt->b]; } ctxt->d = opcode.flags; @@ -3438,13 +3445,13 @@ done_prefixes: while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: - ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + ctxt->modrm = insn_fetch(u8, 1, ctxt); --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: - ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + ctxt->modrm = insn_fetch(u8, 1, ctxt); --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; if ((ctxt->modrm >> 6) == 3) @@ -3577,7 +3584,7 @@ done_prefixes: ctxt->src.type = OP_IMM; ctxt->src.addr.mem.ea = ctxt->_eip; ctxt->src.bytes = ctxt->op_bytes + 2; - insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip); + insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt); break; case SrcMemFAddr: memop.bytes = ctxt->op_bytes + 2; @@ -3630,7 +3637,7 @@ done_prefixes: ctxt->dst.type = OP_IMM; ctxt->dst.addr.mem.ea = ctxt->_eip; ctxt->dst.bytes = 1; - ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip); + ctxt->dst.val = insn_fetch(u8, 1, ctxt); break; case DstMem: case DstMem64: -- cgit v1.1 From e85a10852c26d7d509ad17bac1a0d5264224b2d2 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 30 Jul 2011 18:01:26 +0900 Subject: KVM: x86 emulator: Drop _size argument from insn_fetch() _type is enough to know the size. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ce48dc4..d4cc8af 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -702,9 +702,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, } /* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _ctxt) \ +#define insn_fetch(_type, _ctxt) \ ({ unsigned long _x; \ - rc = do_insn_fetch(_ctxt, &_x, (_size)); \ + rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ (_type)_x; \ @@ -901,7 +901,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ } - ctxt->modrm = insn_fetch(u8, 1, ctxt); + ctxt->modrm = insn_fetch(u8, ctxt); ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ctxt->modrm_rm |= (ctxt->modrm & 0x07); @@ -935,13 +935,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 6) - modrm_ea += insn_fetch(u16, 2, ctxt); + modrm_ea += insn_fetch(u16, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, 1, ctxt); + modrm_ea += insn_fetch(s8, ctxt); break; case 2: - modrm_ea += insn_fetch(u16, 2, ctxt); + modrm_ea += insn_fetch(u16, ctxt); break; } switch (ctxt->modrm_rm) { @@ -978,13 +978,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } else { /* 32/64-bit ModR/M decode. */ if ((ctxt->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, 1, ctxt); + sib = insn_fetch(u8, ctxt); index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) - modrm_ea += insn_fetch(s32, 4, ctxt); + modrm_ea += insn_fetch(s32, ctxt); else modrm_ea += ctxt->regs[base_reg]; if (index_reg != 4) @@ -997,13 +997,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 5) - modrm_ea += insn_fetch(s32, 4, ctxt); + modrm_ea += insn_fetch(s32, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, 1, ctxt); + modrm_ea += insn_fetch(s8, ctxt); break; case 2: - modrm_ea += insn_fetch(s32, 4, ctxt); + modrm_ea += insn_fetch(s32, ctxt); break; } } @@ -1020,13 +1020,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; switch (ctxt->ad_bytes) { case 2: - op->addr.mem.ea = insn_fetch(u16, 2, ctxt); + op->addr.mem.ea = insn_fetch(u16, ctxt); break; case 4: - op->addr.mem.ea = insn_fetch(u32, 4, ctxt); + op->addr.mem.ea = insn_fetch(u32, ctxt); break; case 8: - op->addr.mem.ea = insn_fetch(u64, 8, ctxt); + op->addr.mem.ea = insn_fetch(u64, ctxt); break; } done: @@ -3316,13 +3316,13 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, /* NB. Immediates are sign-extended as necessary. */ switch (op->bytes) { case 1: - op->val = insn_fetch(s8, 1, ctxt); + op->val = insn_fetch(s8, ctxt); break; case 2: - op->val = insn_fetch(s16, 2, ctxt); + op->val = insn_fetch(s16, ctxt); break; case 4: - op->val = insn_fetch(s32, 4, ctxt); + op->val = insn_fetch(s32, ctxt); break; } if (!sign_extension) { @@ -3381,7 +3381,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) /* Legacy prefixes. */ for (;;) { - switch (ctxt->b = insn_fetch(u8, 1, ctxt)) { + switch (ctxt->b = insn_fetch(u8, ctxt)) { case 0x66: /* operand-size override */ op_prefix = true; /* switch between 2/4 bytes */ @@ -3437,7 +3437,7 @@ done_prefixes: /* Two-byte opcode? */ if (ctxt->b == 0x0f) { ctxt->twobyte = 1; - ctxt->b = insn_fetch(u8, 1, ctxt); + ctxt->b = insn_fetch(u8, ctxt); opcode = twobyte_table[ctxt->b]; } ctxt->d = opcode.flags; @@ -3445,13 +3445,13 @@ done_prefixes: while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: - ctxt->modrm = insn_fetch(u8, 1, ctxt); + ctxt->modrm = insn_fetch(u8, ctxt); --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: - ctxt->modrm = insn_fetch(u8, 1, ctxt); + ctxt->modrm = insn_fetch(u8, ctxt); --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; if ((ctxt->modrm >> 6) == 3) @@ -3637,7 +3637,7 @@ done_prefixes: ctxt->dst.type = OP_IMM; ctxt->dst.addr.mem.ea = ctxt->_eip; ctxt->dst.bytes = 1; - ctxt->dst.val = insn_fetch(u8, 1, ctxt); + ctxt->dst.val = insn_fetch(u8, ctxt); break; case DstMem: case DstMem64: -- cgit v1.1 From 7d88bb4803d62f6056b079ade6333a026fd11684 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 30 Jul 2011 18:02:29 +0900 Subject: KVM: x86 emulator: Let compiler know insn_fetch() rarely fails Fetching the instruction which was to be executed by the guest cannot fail normally. So compiler should always predict that it will succeed. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d4cc8af..191bc9b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -672,11 +672,11 @@ static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest) size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(ctxt->_eip)); rc = __linearize(ctxt, addr, size, false, true, &linear); - if (rc != X86EMUL_CONTINUE) + if (unlikely(rc != X86EMUL_CONTINUE)) return rc; rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, size, &ctxt->exception); - if (rc != X86EMUL_CONTINUE) + if (unlikely(rc != X86EMUL_CONTINUE)) return rc; fc->end += size; } @@ -691,7 +691,7 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, int rc; /* x86 instructions are limited to 15 bytes. */ - if (ctxt->_eip + size - ctxt->eip > 15) + if (unlikely(ctxt->_eip + size - ctxt->eip > 15)) return X86EMUL_UNHANDLEABLE; while (size--) { rc = do_insn_fetch_byte(ctxt, dest++); -- cgit v1.1 From 1d2887e2d849969f58ce79203f9785ebe065d494 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 30 Jul 2011 18:03:34 +0900 Subject: KVM: x86 emulator: Make x86_decode_insn() return proper macros Return EMULATION_OK/FAILED consistently. Also treat instruction fetch errors, not restricted to X86EMUL_UNHANDLEABLE, as EMULATION_FAILED; although this cannot happen in practice, the current logic will continue the emulation even if the decoder fails to fetch the instruction. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 12 ++++++------ arch/x86/kvm/x86.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 191bc9b..fe5eb6d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3373,7 +3373,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) break; #endif default: - return -1; + return EMULATION_FAILED; } ctxt->op_bytes = def_op_bytes; @@ -3465,7 +3465,7 @@ done_prefixes: break; case Prefix: if (ctxt->rep_prefix && op_prefix) - return X86EMUL_UNHANDLEABLE; + return EMULATION_FAILED; simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; switch (simd_prefix) { case 0x00: opcode = opcode.u.gprefix->pfx_no; break; @@ -3475,7 +3475,7 @@ done_prefixes: } break; default: - return X86EMUL_UNHANDLEABLE; + return EMULATION_FAILED; } ctxt->d &= ~GroupMask; @@ -3488,10 +3488,10 @@ done_prefixes: /* Unrecognised? */ if (ctxt->d == 0 || (ctxt->d & Undefined)) - return -1; + return EMULATION_FAILED; if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) - return -1; + return EMULATION_FAILED; if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) ctxt->op_bytes = 8; @@ -3683,7 +3683,7 @@ done: if (memopp && memopp->type == OP_MEM && ctxt->rip_relative) memopp->addr.mem.ea += ctxt->_eip; - return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; + return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d28dff7..1fe9637 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4837,7 +4837,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; - if (r) { + if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) return EMULATE_FAIL; if (reexecute_instruction(vcpu, cr2)) -- cgit v1.1 From 742bc67042e34a9fe1fed0b46e4cb1431a72c4bf Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 29 Jul 2011 19:44:21 -0300 Subject: KVM: x86: report valid microcode update ID Windows Server 2008 SP2 checked build with smp > 1 BSOD's during boot due to lack of microcode update: *** Assertion failed: The system BIOS on this machine does not properly support the processor. The system BIOS did not load any microcode update. A BIOS containing the latest microcode update is needed for system reliability. (CurrentUpdateRevision != 0) *** Source File: d:\longhorn\base\hals\update\intelupd\update.c, line 440 Report a non-zero microcode update signature to make it happy. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1fe9637..ea8f9f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1842,7 +1842,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) switch (msr) { case MSR_IA32_PLATFORM_ID: - case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_LASTBRANCHFROMIP: @@ -1863,6 +1862,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_FAM10H_MMIO_CONF_BASE: data = 0; break; + case MSR_IA32_UCODE_REV: + data = 0x100000000ULL; + break; case MSR_MTRRcap: data = 0x500 | KVM_NR_VAR_MTRR; break; -- cgit v1.1 From cf3ace79c065d65e9636f719a9df1382725410e3 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 2 Aug 2011 12:34:57 +0200 Subject: KVM: VMX: trivial: use BUG_ON Use BUG_ON(x) rather than if(x) BUG(); The semantic patch that fixes this problem is as follows: (http://coccinelle.lip6.fr/) // @@ identifier x; @@ -if (x) BUG(); +BUG_ON(x); @@ identifier x; @@ -if (!x) BUG(); +BUG_ON(!x); // Signed-off-by: Julia Lawall Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e26629f..03df703 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4115,8 +4115,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ - if (enable_ept) - BUG(); + BUG_ON(enable_ept); cr2 = vmcs_readl(EXIT_QUALIFICATION); trace_kvm_page_fault(cr2, error_code); -- cgit v1.1 From e4e517b4be019787ada4cbbce2f04570c21b0cbd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 28 Jul 2011 11:36:17 +0300 Subject: KVM: MMU: Do not unconditionally read PDPTE from guest memory Architecturally, PDPTEs are cached in the PDPTRs when CR3 is reloaded. On SVM, it is not possible to implement this, but on VMX this is possible and was indeed implemented until nested SVM changed this to unconditionally read PDPTEs dynamically. This has noticable impact when running PAE guests. Fix by changing the MMU to read PDPTRs from the cache, falling back to reading from memory for the nested MMU. Signed-off-by: Avi Kivity Tested-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/kvm_cache_regs.h | 7 ------- arch/x86/kvm/mmu.c | 5 ++++- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/svm.c | 15 +++++++++++++++ 5 files changed, 21 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 307e3cf..b31a341 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -265,6 +265,7 @@ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); + u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, bool prefault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 3377d53..544076c 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -45,13 +45,6 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.walk_mmu->pdptrs[index]; } -static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index) -{ - load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu)); - - return mmu->pdptrs[index]; -} - static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8e8da79..f1b36cf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2770,7 +2770,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); + pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; @@ -3318,6 +3318,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; + context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; context->nx = is_nx(vcpu); @@ -3376,6 +3377,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; vcpu->arch.walk_mmu->get_cr3 = get_cr3; + vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; return r; @@ -3386,6 +3388,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; g_context->get_cr3 = get_cr3; + g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; /* diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 507e2b8..f6dd9fe 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -163,7 +163,7 @@ retry_walk: #if PTTYPE == 64 if (walker->level == PT32E_ROOT_LEVEL) { - pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); + pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) goto error; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2b24a88..f043168 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1844,6 +1844,20 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) return svm->nested.nested_cr3; } +static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 cr3 = svm->nested.nested_cr3; + u64 pdpte; + int ret; + + ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte, + offset_in_page(cr3) + index * 8, 8); + if (ret) + return 0; + return pdpte; +} + static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) { @@ -1875,6 +1889,7 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu.shadow_root_level = get_npt_level(); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; -- cgit v1.1 From cd46868c7f088baaaba8f81251c5929f6002d050 Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Tue, 9 Aug 2011 18:14:01 +0800 Subject: KVM: MMU: Fix SMEP failure during fetch This patch fix kvm-unit-tests hanging and incorrect PT_ACCESSED_MASK bit set in the case of SMEP fault. The code updated 'eperm' after the variable was checked. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f6dd9fe..9299410 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -147,7 +147,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; - bool eperm; + bool eperm, last_gpte; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; @@ -221,6 +221,17 @@ retry_walk: eperm = true; #endif + last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); + if (last_gpte) { + pte_access = pt_access & + FNAME(gpte_access)(vcpu, pte, true); + /* check if the kernel is fetching from user page */ + if (unlikely(pte_access & PT_USER_MASK) && + kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + if (fetch_fault && !user_fault) + eperm = true; + } + if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { int ret; trace_kvm_mmu_set_accessed_bit(table_gfn, index, @@ -238,18 +249,12 @@ retry_walk: walker->ptes[walker->level - 1] = pte; - if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) { + if (last_gpte) { int lvl = walker->level; gpa_t real_gpa; gfn_t gfn; u32 ac; - /* check if the kernel is fetching from user page */ - if (unlikely(pte_access & PT_USER_MASK) && - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - if (fetch_fault && !user_fault) - eperm = true; - gfn = gpte_to_gfn_lvl(pte, lvl); gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; @@ -295,7 +300,6 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true); walker->pt_access = pt_access; walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", -- cgit v1.1 From d5c1785d2f3aabe284d91bc7fc8f0abc58525dc9 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Tue, 2 Aug 2011 15:54:20 +0300 Subject: KVM: L1 TSC handling KVM assumed in several places that reading the TSC MSR returns the value for L1. This is incorrect, because when L2 is running, the correct TSC read exit emulation is to return L2's value. We therefore add a new x86_ops function, read_l1_tsc, to use in places that specifically need to read the L1 TSC, NOT the TSC of the current level of guest. Note that one change, of one line in kvm_arch_vcpu_load, is made redundant by a different patch sent by Zachary Amsden (and not yet applied): kvm_arch_vcpu_load() should not read the guest TSC, and if it didn't, of course we didn't have to change the call of kvm_get_msr() to read_l1_tsc(). [avi: moved callback to kvm_x86_ops tsc block] Signed-off-by: Nadav Har'El Acked-by: Zachary Amsdem Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 8 ++++++++ arch/x86/kvm/vmx.c | 16 ++++++++++++++++ arch/x86/kvm/x86.c | 8 ++++---- 4 files changed, 29 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b31a341..6ab4241 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -630,6 +630,7 @@ struct kvm_x86_ops { void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); + u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f043168..590d1d2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2910,6 +2910,13 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } +u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) +{ + struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); + return vmcb->control.tsc_offset + + svm_scale_tsc(vcpu, native_read_tsc()); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4201,6 +4208,7 @@ static struct kvm_x86_ops svm_x86_ops = { .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, .compute_tsc_offset = svm_compute_tsc_offset, + .read_l1_tsc = svm_read_l1_tsc, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 03df703..97b6454 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1748,6 +1748,21 @@ static u64 guest_read_tsc(void) } /* + * Like guest_read_tsc, but always returns L1's notion of the timestamp + * counter, even if a nested guest (L2) is currently running. + */ +u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) +{ + u64 host_tsc, tsc_offset; + + rdtscll(host_tsc); + tsc_offset = is_guest_mode(vcpu) ? + to_vmx(vcpu)->nested.vmcs01_tsc_offset : + vmcs_read64(TSC_OFFSET); + return host_tsc + tsc_offset; +} + +/* * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ * ioctl. In this case the call-back should update internal vmx state to make * the changes effective. @@ -7010,6 +7025,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, .compute_tsc_offset = vmx_compute_tsc_offset, + .read_l1_tsc = vmx_read_l1_tsc, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ea8f9f0..6b37f18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1098,7 +1098,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); + tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); kernel_ns = get_kernel_ns(); this_tsc_khz = vcpu_tsc_khz(v); if (unlikely(this_tsc_khz == 0)) { @@ -2218,7 +2218,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) s64 tsc_delta; u64 tsc; - kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc); + tsc = kvm_x86_ops->read_l1_tsc(vcpu); tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : tsc - vcpu->arch.last_guest_tsc; @@ -2242,7 +2242,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); } static int is_efer_nx(void) @@ -5729,7 +5729,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); -- cgit v1.1 From 27fc51b21cea3386a6672699631975d1097f9d39 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Tue, 2 Aug 2011 15:54:52 +0300 Subject: KVM: nVMX: Fix nested VMX TSC emulation This patch fixes two corner cases in nested (L2) handling of TSC-related issues: 1. Somewhat suprisingly, according to the Intel spec, if L1 allows WRMSR to the TSC MSR without an exit, then this should set L1's TSC value itself - not offset by vmcs12.TSC_OFFSET (like was wrongly done in the previous code). 2. Allow L1 to disable the TSC_OFFSETING control, and then correctly ignore the vmcs12.TSC_OFFSET. Signed-off-by: Nadav Har'El Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 97b6454..5e8d411 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1777,15 +1777,23 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) */ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - vmcs_write64(TSC_OFFSET, offset); - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu)) { /* - * We're here if L1 chose not to trap the TSC MSR. Since - * prepare_vmcs12() does not copy tsc_offset, we need to also - * set the vmcs12 field here. + * We're here if L1 chose not to trap WRMSR to TSC. According + * to the spec, this should set L1's TSC; The offset that L1 + * set for L2 remains unchanged, and still needs to be added + * to the newly set TSC to get L2's TSC. */ - get_vmcs12(vcpu)->tsc_offset = offset - - to_vmx(vcpu)->nested.vmcs01_tsc_offset; + struct vmcs12 *vmcs12; + to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset; + /* recalculate vmcs02.TSC_OFFSET: */ + vmcs12 = get_vmcs12(vcpu); + vmcs_write64(TSC_OFFSET, offset + + (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? + vmcs12->tsc_offset : 0)); + } else { + vmcs_write64(TSC_OFFSET, offset); + } } static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) @@ -6485,8 +6493,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); - vmcs_write64(TSC_OFFSET, - vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vmcs_write64(TSC_OFFSET, + vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); + else + vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); if (enable_vpid) { /* @@ -6893,7 +6904,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) load_vmcs12_host_state(vcpu, vmcs12); - /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */ + /* Update TSC_OFFSET if TSC was changed while L2 ran */ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); /* This is needed for same reason as it was needed in prepare_vmcs02 */ -- cgit v1.1 From 45133ecaaec7aea447afc98cc2c24aac638bbe5c Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Tue, 2 Aug 2011 15:55:23 +0300 Subject: KVM: SVM: Fix TSC MSR read in nested SVM When the TSC MSR is read by an L2 guest (when L1 allowed this MSR to be read without exit), we need to return L2's notion of the TSC, not L1's. The current code incorrectly returned L1 TSC, because svm_get_msr() was also used in x86.c where this was assumed, but now that these places call the new svm_read_l1_tsc(), the MSR read can be fixed. Signed-off-by: Nadav Har'El Tested-by: Joerg Roedel Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 590d1d2..8277f32 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2923,9 +2923,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) switch (ecx) { case MSR_IA32_TSC: { - struct vmcb *vmcb = get_host_vmcb(svm); - - *data = vmcb->control.tsc_offset + + *data = svm->vmcb->control.tsc_offset + svm_scale_tsc(vcpu, native_read_tsc()); break; -- cgit v1.1 From 58fbbf26eb01cf6d92cf18da8d14b3a4af9c4b47 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Tue, 30 Aug 2011 13:56:17 +0300 Subject: KVM: APIC: avoid instruction emulation for EOI writes Instruction emulation for EOI writes can be skipped, since sane guest simply uses MOV instead of string operations. This is a nice improvement when guest doesn't support x2apic or hyper-V EOI support. a single VM bandwidth is observed with ~8% bandwidth improvement (7.4Gbps->8Gbps), by saving ~5% cycles from EOI emulation. Signed-off-by: Kevin Tian : Signed-off-by: Eddie Dong Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 12 ++++++++++++ arch/x86/kvm/lapic.c | 9 +++++++++ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/vmx.c | 21 +++++++++++++++++++++ 4 files changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2caf290..31f180c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -350,6 +350,18 @@ enum vmcs_field { #define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */ +/* + * Exit Qualifications for APIC-Access + */ +#define APIC_ACCESS_OFFSET 0xfff /* 11:0, offset within the APIC page */ +#define APIC_ACCESS_TYPE 0xf000 /* 15:12, access type */ +#define TYPE_LINEAR_APIC_INST_READ (0 << 12) +#define TYPE_LINEAR_APIC_INST_WRITE (1 << 12) +#define TYPE_LINEAR_APIC_INST_FETCH (2 << 12) +#define TYPE_LINEAR_APIC_EVENT (3 << 12) +#define TYPE_PHYSICAL_APIC_EVENT (10 << 12) +#define TYPE_PHYSICAL_APIC_INST (15 << 12) + /* segment AR */ #define SEGMENT_AR_L_MASK (1 << 13) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 57dcbd4..52645f2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -864,6 +864,15 @@ static int apic_mmio_write(struct kvm_io_device *this, return 0; } +void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (apic) + apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); +} +EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); + void kvm_free_lapic(struct kvm_vcpu *vcpu) { if (!vcpu->arch.apic) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 52c9e6b..8287243 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -26,6 +26,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); +void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5e8d411..47419d6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -71,6 +71,9 @@ module_param(vmm_exclusive, bool, S_IRUGO); static int __read_mostly yield_on_hlt = 1; module_param(yield_on_hlt, bool, S_IRUGO); +static int __read_mostly fasteoi = 1; +module_param(fasteoi, bool, S_IRUGO); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -4540,6 +4543,24 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) static int handle_apic_access(struct kvm_vcpu *vcpu) { + if (likely(fasteoi)) { + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int access_type, offset; + + access_type = exit_qualification & APIC_ACCESS_TYPE; + offset = exit_qualification & APIC_ACCESS_OFFSET; + /* + * Sane guest uses MOV to write EOI, with written value + * not cared. So make a short-circuit here by avoiding + * heavy instruction emulation. + */ + if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && + (offset == APIC_EOI)) { + kvm_lapic_set_eoi(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + } return emulate_instruction(vcpu, 0) == EMULATE_DONE; } -- cgit v1.1 From a31b9ceadb61487042dec92f662736ccddadc75f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:35 +0300 Subject: KVM: x86 emulator: simplify emulate_2op_SrcV() emulate_2op_SrcV(), and its siblings, emulate_2op_SrcV_nobyte() and emulate_2op_SrcB(), all use the same calling conventions and all get passed exactly the same parameters. Simplify them by passing just the emulation context. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 90 ++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fe5eb6d..458914d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -205,64 +205,62 @@ struct gprefix { #define ON64(x) #endif -#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ +#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \ do { \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "4", "2") \ _op _suffix " %"_x"3,%1; " \ _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ + : "=m" ((ctxt)->eflags), \ + "+q" (*(_dsttype*)&(ctxt)->dst.val), \ "=&r" (_tmp) \ - : _y ((_src).val), "i" (EFLAGS_MASK)); \ + : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \ } while (0) /* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ +#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \ do { \ unsigned long _tmp; \ \ - switch ((_dst).bytes) { \ + switch ((ctxt)->dst.bytes) { \ case 2: \ - ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ + ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \ break; \ case 4: \ - ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ + ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \ break; \ case 8: \ - ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ + ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \ break; \ } \ } while (0) -#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ +#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ do { \ unsigned long _tmp; \ - switch ((_dst).bytes) { \ + switch ((ctxt)->dst.bytes) { \ case 1: \ - ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ + ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \ break; \ default: \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + __emulate_2op_nobyte(ctxt, _op, \ _wx, _wy, _lx, _ly, _qx, _qy); \ break; \ } \ } while (0) /* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "c", "b", "c", "b", "c", "b", "c") +#define emulate_2op_SrcB(ctxt, _op) \ + __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c") /* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "q", "w", "r", _LO32, "r", "", "r") +#define emulate_2op_SrcV(ctxt, _op) \ + __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r") /* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - "w", "r", _LO32, "r", "", "r") +#define emulate_2op_SrcV_nobyte(ctxt, _op) \ + __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") /* Instruction has three operands and one operand is stored in ECX register */ #define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ @@ -1681,26 +1679,26 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt) { switch (ctxt->modrm_reg) { case 0: /* rol */ - emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "rol"); break; case 1: /* ror */ - emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "ror"); break; case 2: /* rcl */ - emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "rcl"); break; case 3: /* rcr */ - emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "rcr"); break; case 4: /* sal/shl */ case 6: /* sal/shl */ - emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "sal"); break; case 5: /* shr */ - emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "shr"); break; case 7: /* sar */ - emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "sar"); break; } return X86EMUL_CONTINUE; @@ -1714,7 +1712,7 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) switch (ctxt->modrm_reg) { case 0 ... 1: /* test */ - emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "test"); break; case 2: /* not */ ctxt->dst.val = ~ctxt->dst.val; @@ -2459,7 +2457,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt) ctxt->src.type = OP_IMM; ctxt->src.val = 0; ctxt->src.bytes = 1; - emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "or"); ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); if (cf) ctxt->eflags |= X86_EFLAGS_CF; @@ -2509,49 +2507,49 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) static int em_add(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "add"); return X86EMUL_CONTINUE; } static int em_or(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "or"); return X86EMUL_CONTINUE; } static int em_adc(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "adc"); return X86EMUL_CONTINUE; } static int em_sbb(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "sbb"); return X86EMUL_CONTINUE; } static int em_and(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "and"); return X86EMUL_CONTINUE; } static int em_sub(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "sub"); return X86EMUL_CONTINUE; } static int em_xor(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "xor"); return X86EMUL_CONTINUE; } static int em_cmp(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "cmp"); /* Disable writeback. */ ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; @@ -2559,7 +2557,7 @@ static int em_cmp(struct x86_emulate_ctxt *ctxt) static int em_test(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "test"); return X86EMUL_CONTINUE; } @@ -2577,7 +2575,7 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt) static int em_imul(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "imul"); return X86EMUL_CONTINUE; } @@ -4121,7 +4119,7 @@ twobyte_insn: ctxt->dst.type = OP_NONE; /* only subword offset */ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "bt"); break; case 0xa4: /* shld imm8, r, r/m */ case 0xa5: /* shld cl, r, r/m */ @@ -4135,7 +4133,7 @@ twobyte_insn: break; case 0xab: bts: /* bts */ - emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "bts"); break; case 0xac: /* shrd imm8, r, r/m */ case 0xad: /* shrd cl, r, r/m */ @@ -4150,7 +4148,7 @@ twobyte_insn: */ ctxt->src.orig_val = ctxt->src.val; ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "cmp"); if (ctxt->eflags & EFLG_ZF) { /* Success: write back to memory. */ ctxt->dst.val = ctxt->src.orig_val; @@ -4165,7 +4163,7 @@ twobyte_insn: break; case 0xb3: btr: /* btr */ - emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "btr"); break; case 0xb4: /* lfs */ rc = emulate_load_segment(ctxt, VCPU_SREG_FS); @@ -4192,7 +4190,7 @@ twobyte_insn: break; case 0xbb: btc: /* btc */ - emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "btc"); break; case 0xbc: { /* bsf */ u8 zf; @@ -4224,7 +4222,7 @@ twobyte_insn: (s16) ctxt->src.val; break; case 0xc0 ... 0xc1: /* xadd */ - emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "add"); /* Write back the register source. */ ctxt->src.val = ctxt->dst.orig_val; write_register_operand(&ctxt->src); -- cgit v1.1 From 761441b9f42159409d56f74dcc7ce5538d9efd69 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:36 +0300 Subject: KVM: x86 emulator: simplify emulate_2op_cl() emulate_2op_cl() is always called with the same parameters. Simplify by passing just the emulation context. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 458914d..0b33884 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -263,40 +263,37 @@ struct gprefix { __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") /* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ +#define __emulate_2op_cl(_op, ctxt, _suffix, _type) \ do { \ unsigned long _tmp; \ - _type _clv = (_cl).val; \ - _type _srcv = (_src).val; \ - _type _dstv = (_dst).val; \ + _type _clv = (ctxt)->src2.val; \ + _type _srcv = (ctxt)->src.val; \ + _type _dstv = (ctxt)->dst.val; \ \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "5", "2") \ _op _suffix " %4,%1 \n" \ _POST_EFLAGS("0", "5", "2") \ - : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \ : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ ); \ \ - (_cl).val = (unsigned long) _clv; \ - (_src).val = (unsigned long) _srcv; \ - (_dst).val = (unsigned long) _dstv; \ + (ctxt)->src2.val = (unsigned long) _clv; \ + (ctxt)->src2.val = (unsigned long) _srcv; \ + (ctxt)->dst.val = (unsigned long) _dstv; \ } while (0) -#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ +#define emulate_2op_cl(ctxt, _op) \ do { \ - switch ((_dst).bytes) { \ + switch ((ctxt)->dst.bytes) { \ case 2: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "w", unsigned short); \ + __emulate_2op_cl(_op, ctxt, "w", u16); \ break; \ case 4: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "l", unsigned int); \ + __emulate_2op_cl(_op, ctxt, "l", u32); \ break; \ case 8: \ - ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "q", unsigned long)); \ + ON64(__emulate_2op_cl(_op, ctxt, "q", ulong)); \ break; \ } \ } while (0) @@ -4123,7 +4120,7 @@ twobyte_insn: break; case 0xa4: /* shld imm8, r, r/m */ case 0xa5: /* shld cl, r, r/m */ - emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_cl(ctxt, "shld"); break; case 0xa8: /* push gs */ rc = emulate_push_sreg(ctxt, VCPU_SREG_GS); @@ -4137,7 +4134,7 @@ twobyte_insn: break; case 0xac: /* shrd imm8, r, r/m */ case 0xad: /* shrd cl, r, r/m */ - emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_cl(ctxt, "shrd"); break; case 0xae: /* clflush */ break; -- cgit v1.1 From 29053a60d791a492b4609d87397b70a7a3254eb2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:37 +0300 Subject: KVM: x86 emulator: simplify emulate_2op_cl() emulate_2op_cl() is always called with the same parameters. Simplify by passing just the emulation context. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0b33884..14b2791 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -263,7 +263,7 @@ struct gprefix { __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") /* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(_op, ctxt, _suffix, _type) \ +#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \ do { \ unsigned long _tmp; \ _type _clv = (ctxt)->src2.val; \ @@ -287,13 +287,13 @@ struct gprefix { do { \ switch ((ctxt)->dst.bytes) { \ case 2: \ - __emulate_2op_cl(_op, ctxt, "w", u16); \ + __emulate_2op_cl(ctxt, _op, "w", u16); \ break; \ case 4: \ - __emulate_2op_cl(_op, ctxt, "l", u32); \ + __emulate_2op_cl(ctxt, _op, "l", u32); \ break; \ case 8: \ - ON64(__emulate_2op_cl(_op, ctxt, "q", ulong)); \ + ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \ break; \ } \ } while (0) -- cgit v1.1 From d1eef45d5906a0738684833cf1e0395b4d098340 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:38 +0300 Subject: KVM: x86 emulator: simplify emulate_1op() emulate_1op() is always called with the same parameters. Simplify by passing just the emulation context. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 14b2791..5b4e1d4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -298,7 +298,7 @@ struct gprefix { } \ } while (0) -#define __emulate_1op(_op, _dst, _eflags, _suffix) \ +#define __emulate_1op(ctxt, _op, _suffix) \ do { \ unsigned long _tmp; \ \ @@ -306,19 +306,19 @@ struct gprefix { _PRE_EFLAGS("0", "3", "2") \ _op _suffix " %1; " \ _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "+m" ((_dst).val), \ + : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \ "=&r" (_tmp) \ : "i" (EFLAGS_MASK)); \ } while (0) /* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(_op, _dst, _eflags) \ +#define emulate_1op(ctxt, _op) \ do { \ - switch ((_dst).bytes) { \ - case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ - case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ - case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ - case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ + switch ((ctxt)->dst.bytes) { \ + case 1: __emulate_1op(ctxt, _op, "b"); break; \ + case 2: __emulate_1op(ctxt, _op, "w"); break; \ + case 4: __emulate_1op(ctxt, _op, "l"); break; \ + case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \ } \ } while (0) @@ -1715,7 +1715,7 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) ctxt->dst.val = ~ctxt->dst.val; break; case 3: /* neg */ - emulate_1op("neg", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "neg"); break; case 4: /* mul */ emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags); @@ -1745,10 +1745,10 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) switch (ctxt->modrm_reg) { case 0: /* inc */ - emulate_1op("inc", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "inc"); break; case 1: /* dec */ - emulate_1op("dec", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "dec"); break; case 2: /* call near abs */ { long int old_eip; @@ -3849,10 +3849,10 @@ special_insn: rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS); break; case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op("inc", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "inc"); break; case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op("dec", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "dec"); break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) -- cgit v1.1 From 9fef72ce10dc8263fec9350a8e2a1c505ebedaae Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:39 +0300 Subject: KVM: x86 emulator: merge the two emulate_1op_rax_rdx implementations We have two emulate-with-extended-accumulator implementations: once which expect traps (_ex) and one which doesn't (plain). Drop the plain implementation and always use the one which expects traps; it will simply return 0 in the _ex argument and we can happily ignore it. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 64 ++++++++++++-------------------------------------- 1 file changed, 15 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5b4e1d4..6f2e419 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -322,21 +322,7 @@ struct gprefix { } \ } while (0) -#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "1") \ - _op _suffix " %5; " \ - _POST_EFLAGS("0", "4", "1") \ - : "=m" (_eflags), "=&r" (_tmp), \ - "+a" (_rax), "+d" (_rdx) \ - : "i" (EFLAGS_MASK), "m" ((_src).val), \ - "a" (_rax), "d" (_rdx)); \ - } while (0) - -#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ +#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ do { \ unsigned long _tmp; \ \ @@ -358,46 +344,24 @@ struct gprefix { } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ +#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _ex) \ do { \ switch((_src).bytes) { \ case 1: \ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "b"); \ + _eflags, "b", _ex); \ break; \ case 2: \ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "w"); \ + _eflags, "w", _ex); \ break; \ case 4: \ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "l"); \ - break; \ - case 8: \ - ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "q")); \ - break; \ - } \ - } while (0) - -#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \ - do { \ - switch((_src).bytes) { \ - case 1: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "b", _ex); \ - break; \ - case 2: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "w", _ex); \ - break; \ - case 4: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "l", _ex); \ + _eflags, "l", _ex); \ break; \ case 8: ON64( \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "q", _ex)); \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "q", _ex)); \ break; \ } \ } while (0) @@ -1718,18 +1682,20 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) emulate_1op(ctxt, "neg"); break; case 4: /* mul */ - emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags); + emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, + ctxt->eflags, de); break; case 5: /* imul */ - emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags); + emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, + ctxt->eflags, de); break; case 6: /* div */ - emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx("div", ctxt->src, *rax, *rdx, + ctxt->eflags, de); break; case 7: /* idiv */ - emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx("idiv", ctxt->src, *rax, *rdx, + ctxt->eflags, de); break; default: return X86EMUL_UNHANDLEABLE; -- cgit v1.1 From e8f2b1d621e7b73c16aaae2ccf4a64d09a0f56d8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:40 +0300 Subject: KVM: x86 emulator: simplify emulate_1op_rax_rdx() emulate_1op_rax_rdx() is always called with the same parameters. Simplify by passing just the emulation context. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6f2e419..e10fd37 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -322,9 +322,11 @@ struct gprefix { } \ } while (0) -#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ +#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ do { \ unsigned long _tmp; \ + ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \ + ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \ \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "5", "1") \ @@ -337,31 +339,27 @@ struct gprefix { "jmp 2b \n\t" \ ".popsection \n\t" \ _ASM_EXTABLE(1b, 3b) \ - : "=m" (_eflags), "=&r" (_tmp), \ - "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((_src).val), \ - "a" (_rax), "d" (_rdx)); \ + : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ + "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ + : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \ + "a" (*rax), "d" (*rdx)); \ } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _ex) \ +#define emulate_1op_rax_rdx(ctxt, _op, _ex) \ do { \ - switch((_src).bytes) { \ + switch((ctxt)->src.bytes) { \ case 1: \ - __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "b", _ex); \ + __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \ break; \ case 2: \ - __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "w", _ex); \ + __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \ break; \ case 4: \ - __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "l", _ex); \ + __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \ break; \ case 8: ON64( \ - __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "q", _ex)); \ + __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \ break; \ } \ } while (0) @@ -1667,8 +1665,6 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt) static int em_grp3(struct x86_emulate_ctxt *ctxt) { - unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX]; - unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX]; u8 de = 0; switch (ctxt->modrm_reg) { @@ -1682,20 +1678,16 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) emulate_1op(ctxt, "neg"); break; case 4: /* mul */ - emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx(ctxt, "mul", de); break; case 5: /* imul */ - emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx(ctxt, "imul", de); break; case 6: /* div */ - emulate_1op_rax_rdx("div", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx(ctxt, "div", de); break; case 7: /* idiv */ - emulate_1op_rax_rdx("idiv", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx(ctxt, "idiv", de); break; default: return X86EMUL_UNHANDLEABLE; -- cgit v1.1 From caa8a168e35650961b9b0d43b9b6fc2279351949 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 11 Sep 2011 11:23:02 +0300 Subject: KVM: x86 emulator: disable writeback for TEST The TEST instruction doesn't write its destination operand. This could cause problems if an MMIO register was accessed using the TEST instruction. Recently Windows XP was observed to use TEST against the APIC ICR; this can cause spurious IPIs. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e10fd37..af06539 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1670,6 +1670,8 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) switch (ctxt->modrm_reg) { case 0 ... 1: /* test */ emulate_2op_SrcV(ctxt, "test"); + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; break; case 2: /* not */ ctxt->dst.val = ~ctxt->dst.val; @@ -2513,6 +2515,8 @@ static int em_cmp(struct x86_emulate_ctxt *ctxt) static int em_test(struct x86_emulate_ctxt *ctxt) { emulate_2op_SrcV(ctxt, "test"); + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } -- cgit v1.1 From 1e2b1dd797f9356f779268ecc1ba21110d4e341c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 12 Sep 2011 10:52:24 +0200 Subject: KVM: x86: Move kvm_trace_exit into atomic vmexit section This avoids that events causing the vmexit are recorded before the actual exit reason. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8277f32..e7ed4b1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3335,8 +3335,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) @@ -3790,6 +3788,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM); + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_handle_nmi(&svm->vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 47419d6..21217b6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5739,8 +5739,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); - /* If guest state is invalid, start emulating */ if (vmx->emulation_required && emulate_invalid_guest_state) return handle_invalid_guest_state(vcpu); @@ -6144,6 +6142,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); -- cgit v1.1 From 7712de872c8ec00a657b867ab0296913f69addac Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 12 Sep 2011 11:25:51 +0200 Subject: KVM: x86: Avoid guest-triggerable printks in APIC model Convert remaining printks that the guest can trigger to apic_printk. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 52645f2..4b53b81 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -316,8 +316,8 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) result = 1; break; default: - printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); + apic_debug("Bad DFR vcpu %d: %08x\n", + apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); break; } @@ -354,8 +354,8 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, result = (target != source); break; default: - printk(KERN_WARNING "Bad dest shorthand value %x\n", - short_hand); + apic_debug("kvm: apic: Bad dest shorthand value %x\n", + short_hand); break; } @@ -401,11 +401,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_REMRD: - printk(KERN_DEBUG "Ignoring delivery mode 3\n"); + apic_debug("Ignoring delivery mode 3\n"); break; case APIC_DM_SMI: - printk(KERN_DEBUG "Ignoring guest SMI\n"); + apic_debug("Ignoring guest SMI\n"); break; case APIC_DM_NMI: @@ -565,8 +565,7 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) val = kvm_apic_id(apic) << 24; break; case APIC_ARBPRI: - printk(KERN_WARNING "Access APIC ARBPRI register " - "which is for P6\n"); + apic_debug("Access APIC ARBPRI register which is for P6\n"); break; case APIC_TMCCT: /* Timer CCR */ @@ -804,14 +803,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_TDCR: if (val & 4) - printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val); + apic_debug("KVM_WRITE:TDCR %x\n", val); apic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); break; case APIC_ESR: if (apic_x2apic_mode(apic) && val != 0) { - printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val); + apic_debug("KVM_WRITE:ESR not zero %x\n", val); ret = 1; } break; -- cgit v1.1 From bd80158aff71a80292f96d9baea1a65bc0ce87b3 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 12 Sep 2011 11:26:22 +0200 Subject: KVM: Clean up and extend rate-limited output The use of printk_ratelimit is discouraged, replace it with pr*_ratelimited or __ratelimit. While at it, convert remaining guest-triggerable printks to rate-limited variants. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8259.c | 15 ++++++++------- arch/x86/kvm/mmu_audit.c | 6 +++--- arch/x86/kvm/vmx.c | 13 ++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 6b869ce..cac4746 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -34,6 +34,9 @@ #include #include "trace.h" +#define pr_pic_unimpl(fmt, ...) \ + pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__) + static void pic_irq_request(struct kvm *kvm, int level); static void pic_lock(struct kvm_pic *s) @@ -306,10 +309,10 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) } s->init_state = 1; if (val & 0x02) - printk(KERN_ERR "single mode not supported"); + pr_pic_unimpl("single mode not supported"); if (val & 0x08) - printk(KERN_ERR - "level sensitive irq not supported"); + pr_pic_unimpl( + "level sensitive irq not supported"); } else if (val & 0x08) { if (val & 0x04) s->poll = 1; @@ -467,8 +470,7 @@ static int picdev_write(struct kvm_pic *s, return -EOPNOTSUPP; if (len != 1) { - if (printk_ratelimit()) - printk(KERN_ERR "PIC: non byte write\n"); + pr_pic_unimpl("non byte write\n"); return 0; } pic_lock(s); @@ -496,8 +498,7 @@ static int picdev_read(struct kvm_pic *s, return -EOPNOTSUPP; if (len != 1) { - if (printk_ratelimit()) - printk(KERN_ERR "PIC: non byte read\n"); + pr_pic_unimpl("non byte read\n"); return 0; } pic_lock(s); diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 2460a26..746ec25 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -121,16 +121,16 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) { + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); unsigned long *rmapp; struct kvm_mmu_page *rev_sp; gfn_t gfn; - rev_sp = page_header(__pa(sptep)); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); if (!gfn_to_memslot(kvm, gfn)) { - if (!printk_ratelimit()) + if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no memslot for gfn %llx\n", gfn); audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", @@ -141,7 +141,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); if (!*rmapp) { - if (!printk_ratelimit()) + if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no rmap for writable spte %llx\n", *sptep); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 21217b6..a0d6bd9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2762,8 +2762,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu) guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { - printk(KERN_DEBUG "%s: tss fixup for long mode. \n", - __func__); + pr_debug_ratelimited("%s: tss fixup for long mode. \n", + __func__); vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); @@ -5634,8 +5634,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return 0; if (unlikely(vmx->fail)) { - printk(KERN_INFO "%s failed vm entry %x\n", - __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); + pr_info_ratelimited("%s failed vm entry %x\n", __func__, + vmcs_read32(VM_INSTRUCTION_ERROR)); return 1; } @@ -6612,9 +6612,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (vmcs12->vm_entry_msr_load_count > 0 || vmcs12->vm_exit_msr_load_count > 0 || vmcs12->vm_exit_msr_store_count > 0) { - if (printk_ratelimit()) - printk(KERN_WARNING - "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__); + pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", + __func__); nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } -- cgit v1.1 From 9bc5791d4aa78b72abd76f506c0a391a54abc4d1 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 12 Sep 2011 14:10:22 +0200 Subject: KVM: x86: Add module parameter for lapic periodic timer limit Certain guests, specifically RTOSes, request faster periodic timers than what we allow by default. Add a module parameter to adjust the limit for non-standard setups. Also add a rate-limited warning in case the guest requested more. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4b53b81..2fb20ca 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -68,6 +68,9 @@ #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) +static unsigned int min_timer_period_us = 500; +module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); + static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) { return *((u32 *) (apic->regs + reg_off)); @@ -677,8 +680,16 @@ static void start_apic_timer(struct kvm_lapic *apic) * scheduler. */ if (apic_lvtt_period(apic)) { - if (apic->lapic_timer.period < NSEC_PER_MSEC/2) - apic->lapic_timer.period = NSEC_PER_MSEC/2; + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, apic->lapic_timer.period, + min_period); + apic->lapic_timer.period = min_period; + } } hrtimer_start(&apic->lapic_timer.timer, -- cgit v1.1 From 3329ece161ad65ea31d825720e270f3a79ebba92 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:39 +0300 Subject: KVM: x86 emulator: convert group 3 instructions to direct decode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 82 +++++++++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index af06539..ed819bd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1663,37 +1663,49 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_grp3(struct x86_emulate_ctxt *ctxt) +static int em_not(struct x86_emulate_ctxt *ctxt) +{ + ctxt->dst.val = ~ctxt->dst.val; + return X86EMUL_CONTINUE; +} + +static int em_neg(struct x86_emulate_ctxt *ctxt) +{ + emulate_1op(ctxt, "neg"); + return X86EMUL_CONTINUE; +} + +static int em_mul_ex(struct x86_emulate_ctxt *ctxt) +{ + u8 ex = 0; + + emulate_1op_rax_rdx(ctxt, "mul", ex); + return X86EMUL_CONTINUE; +} + +static int em_imul_ex(struct x86_emulate_ctxt *ctxt) +{ + u8 ex = 0; + + emulate_1op_rax_rdx(ctxt, "imul", ex); + return X86EMUL_CONTINUE; +} + +static int em_div_ex(struct x86_emulate_ctxt *ctxt) { u8 de = 0; - switch (ctxt->modrm_reg) { - case 0 ... 1: /* test */ - emulate_2op_SrcV(ctxt, "test"); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - break; - case 2: /* not */ - ctxt->dst.val = ~ctxt->dst.val; - break; - case 3: /* neg */ - emulate_1op(ctxt, "neg"); - break; - case 4: /* mul */ - emulate_1op_rax_rdx(ctxt, "mul", de); - break; - case 5: /* imul */ - emulate_1op_rax_rdx(ctxt, "imul", de); - break; - case 6: /* div */ - emulate_1op_rax_rdx(ctxt, "div", de); - break; - case 7: /* idiv */ - emulate_1op_rax_rdx(ctxt, "idiv", de); - break; - default: - return X86EMUL_UNHANDLEABLE; - } + emulate_1op_rax_rdx(ctxt, "div", de); + if (de) + return emulate_de(ctxt); + return X86EMUL_CONTINUE; +} + +static int em_idiv_ex(struct x86_emulate_ctxt *ctxt) +{ + u8 de = 0; + + emulate_1op_rax_rdx(ctxt, "idiv", de); if (de) return emulate_de(ctxt); return X86EMUL_CONTINUE; @@ -2989,9 +3001,14 @@ static struct opcode group1A[] = { }; static struct opcode group3[] = { - D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - X4(D(SrcMem | ModRM)), + I(DstMem | SrcImm | ModRM, em_test), + I(DstMem | SrcImm | ModRM, em_test), + I(DstMem | SrcNone | ModRM | Lock, em_not), + I(DstMem | SrcNone | ModRM | Lock, em_neg), + I(SrcMem | ModRM, em_mul_ex), + I(SrcMem | ModRM, em_imul_ex), + I(SrcMem | ModRM, em_div_ex), + I(SrcMem | ModRM, em_idiv_ex), }; static struct opcode group4[] = { @@ -3917,9 +3934,6 @@ special_insn: /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; break; - case 0xf6 ... 0xf7: /* Grp3 */ - rc = em_grp3(ctxt); - break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; break; -- cgit v1.1 From f09ed83e211d253809e575e05bd4de1e335c0cb2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:40 +0300 Subject: KVM: x86 emulator: move memop, memopp into emulation context Simplifies further generalization of decode. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/kvm/emulate.c | 34 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 6040d11..56bac3e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -275,6 +275,8 @@ struct x86_emulate_ctxt { unsigned long _eip; /* Fields above regs are cleared together. */ unsigned long regs[NR_VCPU_REGS]; + struct operand memop; + struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ed819bd..58172fb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3323,8 +3323,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) int def_op_bytes, def_ad_bytes, goffset, simd_prefix; bool op_prefix = false; struct opcode opcode; - struct operand memop = { .type = OP_NONE }, *memopp = NULL; + ctxt->memop.type = OP_NONE; + ctxt->memopp = NULL; ctxt->_eip = ctxt->eip; ctxt->fetch.start = ctxt->_eip; ctxt->fetch.end = ctxt->fetch.start + insn_len; @@ -3482,21 +3483,21 @@ done_prefixes: /* ModRM and SIB bytes. */ if (ctxt->d & ModRM) { - rc = decode_modrm(ctxt, &memop); + rc = decode_modrm(ctxt, &ctxt->memop); if (!ctxt->has_seg_override) set_seg_override(ctxt, ctxt->modrm_seg); } else if (ctxt->d & MemAbs) - rc = decode_abs(ctxt, &memop); + rc = decode_abs(ctxt, &ctxt->memop); if (rc != X86EMUL_CONTINUE) goto done; if (!ctxt->has_seg_override) set_seg_override(ctxt, VCPU_SREG_DS); - memop.addr.mem.seg = seg_override(ctxt); + ctxt->memop.addr.mem.seg = seg_override(ctxt); - if (memop.type == OP_MEM && ctxt->ad_bytes != 8) - memop.addr.mem.ea = (u32)memop.addr.mem.ea; + if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8) + ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; /* * Decode and fetch the source operand: register, memory @@ -3509,17 +3510,16 @@ done_prefixes: decode_register_operand(ctxt, &ctxt->src, 0); break; case SrcMem16: - memop.bytes = 2; + ctxt->memop.bytes = 2; goto srcmem_common; case SrcMem32: - memop.bytes = 4; + ctxt->memop.bytes = 4; goto srcmem_common; case SrcMem: - memop.bytes = (ctxt->d & ByteOp) ? 1 : - ctxt->op_bytes; + ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; srcmem_common: - ctxt->src = memop; - memopp = &ctxt->src; + ctxt->src = ctxt->memop; + ctxt->memopp = &ctxt->src; break; case SrcImmU16: rc = decode_imm(ctxt, &ctxt->src, 2, false); @@ -3561,7 +3561,7 @@ done_prefixes: insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt); break; case SrcMemFAddr: - memop.bytes = ctxt->op_bytes + 2; + ctxt->memop.bytes = ctxt->op_bytes + 2; goto srcmem_common; break; case SrcDX: @@ -3615,8 +3615,8 @@ done_prefixes: break; case DstMem: case DstMem64: - ctxt->dst = memop; - memopp = &ctxt->dst; + ctxt->dst = ctxt->memop; + ctxt->memopp = &ctxt->dst; if ((ctxt->d & DstMask) == DstMem64) ctxt->dst.bytes = 8; else @@ -3654,8 +3654,8 @@ done_prefixes: } done: - if (memopp && memopp->type == OP_MEM && ctxt->rip_relative) - memopp->addr.mem.ea += ctxt->_eip; + if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative) + ctxt->memopp->addr.mem.ea += ctxt->_eip; return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } -- cgit v1.1 From a99455499a86bde28fccd84e2119be9cd7c23a3f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:41 +0300 Subject: KVM: x86 emulator: split dst decode to a generic decode_operand() Instead of decoding each operand using its own code, use a generic function. Start with the destination operand. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 146 +++++++++++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 58172fb..6a6aed9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -29,6 +29,22 @@ #include "tss.h" /* + * Operand types + */ +#define OpNone 0 +#define OpImplicit 1 /* No generic decode */ +#define OpReg 2 /* Register */ +#define OpMem 3 /* Memory */ +#define OpAcc 4 /* Accumulator: AL/AX/EAX/RAX */ +#define OpDI 5 /* ES:DI/EDI/RDI */ +#define OpMem64 6 /* Memory, 64-bit */ +#define OpImmUByte 7 /* Zero-extended 8-bit immediate */ +#define OpDX 8 /* DX register */ + +#define OpBits 4 /* Width of operand field */ +#define OpMask ((1 << OpBits) - 1) + +/* * Opcode effective-address decode tables. * Note that we only emulate instructions that have at least one memory * operand (excluding implicit stack references). We assume that stack @@ -40,15 +56,16 @@ /* Operand sizes: 8-bit operands or specified/overridden size. */ #define ByteOp (1<<0) /* 8-bit operands. */ /* Destination operand type. */ -#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ -#define DstReg (2<<1) /* Register operand. */ -#define DstMem (3<<1) /* Memory operand. */ -#define DstAcc (4<<1) /* Destination Accumulator */ -#define DstDI (5<<1) /* Destination is in ES:(E)DI */ -#define DstMem64 (6<<1) /* 64bit memory operand */ -#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ -#define DstDX (8<<1) /* Destination is in DX register */ -#define DstMask (0xf<<1) +#define DstShift 1 +#define ImplicitOps (OpImplicit << DstShift) +#define DstReg (OpReg << DstShift) +#define DstMem (OpMem << DstShift) +#define DstAcc (OpAcc << DstShift) +#define DstDI (OpDI << DstShift) +#define DstMem64 (OpMem64 << DstShift) +#define DstImmUByte (OpImmUByte << DstShift) +#define DstDX (OpDX << DstShift) +#define DstMask (OpMask << DstShift) /* Source operand type. */ #define SrcNone (0<<5) /* No source operand. */ #define SrcReg (1<<5) /* Register operand. */ @@ -3316,6 +3333,66 @@ done: return rc; } +static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, + unsigned d) +{ + int rc = X86EMUL_CONTINUE; + + switch (d) { + case OpReg: + decode_register_operand(ctxt, op, + ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); + break; + case OpImmUByte: + op->type = OP_IMM; + op->addr.mem.ea = ctxt->_eip; + op->bytes = 1; + op->val = insn_fetch(u8, ctxt); + break; + case OpMem: + case OpMem64: + *op = ctxt->memop; + ctxt->memopp = op; + if (d == OpMem64) + op->bytes = 8; + else + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + if (ctxt->d & BitOp) + fetch_bit_operand(ctxt); + op->orig_val = op->val; + break; + case OpAcc: + op->type = OP_REG; + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.reg = &ctxt->regs[VCPU_REGS_RAX]; + fetch_register_operand(op); + op->orig_val = op->val; + break; + case OpDI: + op->type = OP_MEM; + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.mem.ea = + register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); + op->addr.mem.seg = VCPU_SREG_ES; + op->val = 0; + break; + case OpDX: + op->type = OP_REG; + op->bytes = 2; + op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + fetch_register_operand(op); + break; + case OpImplicit: + /* Special instructions do their own operand decoding. */ + default: + op->type = OP_NONE; /* Disable writeback. */ + break; + } + +done: + return rc; +} + int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) { int rc = X86EMUL_CONTINUE; @@ -3602,56 +3679,7 @@ done_prefixes: goto done; /* Decode and fetch the destination operand: register or memory. */ - switch (ctxt->d & DstMask) { - case DstReg: - decode_register_operand(ctxt, &ctxt->dst, - ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); - break; - case DstImmUByte: - ctxt->dst.type = OP_IMM; - ctxt->dst.addr.mem.ea = ctxt->_eip; - ctxt->dst.bytes = 1; - ctxt->dst.val = insn_fetch(u8, ctxt); - break; - case DstMem: - case DstMem64: - ctxt->dst = ctxt->memop; - ctxt->memopp = &ctxt->dst; - if ((ctxt->d & DstMask) == DstMem64) - ctxt->dst.bytes = 8; - else - ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - if (ctxt->d & BitOp) - fetch_bit_operand(ctxt); - ctxt->dst.orig_val = ctxt->dst.val; - break; - case DstAcc: - ctxt->dst.type = OP_REG; - ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; - fetch_register_operand(&ctxt->dst); - ctxt->dst.orig_val = ctxt->dst.val; - break; - case DstDI: - ctxt->dst.type = OP_MEM; - ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - ctxt->dst.addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); - ctxt->dst.addr.mem.seg = VCPU_SREG_ES; - ctxt->dst.val = 0; - break; - case DstDX: - ctxt->dst.type = OP_REG; - ctxt->dst.bytes = 2; - ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; - fetch_register_operand(&ctxt->dst); - break; - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - default: - ctxt->dst.type = OP_NONE; /* Disable writeback. */ - break; - } + rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); done: if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative) -- cgit v1.1 From b1ea50b2b63a95aa5a7944b48ba4d0e9b32211d3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:42 +0300 Subject: KVM: x86 emulator: expand decode flags to 64 bits Unifiying the operands means not taking advantage of the fact that some operand types can only go into certain operands (for example, DI can only be used by the destination), so we need more bits to hold the operand type. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/kvm/emulate.c | 38 +++++++++++++++++++------------------- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 56bac3e..a026507 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -262,7 +262,7 @@ struct x86_emulate_ctxt { struct operand dst; bool has_seg_override; u8 seg_override; - unsigned int d; + u64 d; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); /* modrm */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6a6aed9..8c65ff2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -31,18 +31,18 @@ /* * Operand types */ -#define OpNone 0 -#define OpImplicit 1 /* No generic decode */ -#define OpReg 2 /* Register */ -#define OpMem 3 /* Memory */ -#define OpAcc 4 /* Accumulator: AL/AX/EAX/RAX */ -#define OpDI 5 /* ES:DI/EDI/RDI */ -#define OpMem64 6 /* Memory, 64-bit */ -#define OpImmUByte 7 /* Zero-extended 8-bit immediate */ -#define OpDX 8 /* DX register */ +#define OpNone 0ull +#define OpImplicit 1ull /* No generic decode */ +#define OpReg 2ull /* Register */ +#define OpMem 3ull /* Memory */ +#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */ +#define OpDI 5ull /* ES:DI/EDI/RDI */ +#define OpMem64 6ull /* Memory, 64-bit */ +#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */ +#define OpDX 8ull /* DX register */ #define OpBits 4 /* Width of operand field */ -#define OpMask ((1 << OpBits) - 1) +#define OpMask ((1ull << OpBits) - 1) /* * Opcode effective-address decode tables. @@ -108,12 +108,12 @@ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ -#define Src2None (0<<29) -#define Src2CL (1<<29) -#define Src2ImmByte (2<<29) -#define Src2One (3<<29) -#define Src2Imm (4<<29) -#define Src2Mask (7<<29) +#define Src2None (0u<<29) +#define Src2CL (1u<<29) +#define Src2ImmByte (2u<<29) +#define Src2One (3u<<29) +#define Src2Imm (4u<<29) +#define Src2Mask (7u<<29) #define X2(x...) x, x #define X3(x...) X2(x), x @@ -125,8 +125,8 @@ #define X16(x...) X8(x), X8(x) struct opcode { - u32 flags; - u8 intercept; + u64 flags : 56; + u64 intercept : 8; union { int (*execute)(struct x86_emulate_ctxt *ctxt); struct opcode *group; @@ -3530,7 +3530,7 @@ done_prefixes: return EMULATION_FAILED; } - ctxt->d &= ~GroupMask; + ctxt->d &= ~(u64)GroupMask; ctxt->d |= opcode.flags; } -- cgit v1.1 From 4dd6a57df7ca9088a4b14664764e7adb9c120bb1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:43 +0300 Subject: KVM: x86 emulator: switch src2 to generic decode_operand() Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 51 +++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8c65ff2..88d32fc 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -40,6 +40,10 @@ #define OpMem64 6ull /* Memory, 64-bit */ #define OpImmUByte 7ull /* Zero-extended 8-bit immediate */ #define OpDX 8ull /* DX register */ +#define OpCL 9ull /* CL register (for shifts) */ +#define OpImmByte 10ull /* 8-bit sign extended immediate */ +#define OpOne 11ull /* Implied 1 */ +#define OpImm 12ull /* Sign extended immediate */ #define OpBits 4 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -108,12 +112,13 @@ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ -#define Src2None (0u<<29) -#define Src2CL (1u<<29) -#define Src2ImmByte (2u<<29) -#define Src2One (3u<<29) -#define Src2Imm (4u<<29) -#define Src2Mask (7u<<29) +#define Src2Shift (29) +#define Src2None (OpNone << Src2Shift) +#define Src2CL (OpCL << Src2Shift) +#define Src2ImmByte (OpImmByte << Src2Shift) +#define Src2One (OpOne << Src2Shift) +#define Src2Imm (OpImm << Src2Shift) +#define Src2Mask (OpMask << Src2Shift) #define X2(x...) x, x #define X3(x...) X2(x), x @@ -3382,6 +3387,20 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; fetch_register_operand(op); break; + case OpCL: + op->bytes = 1; + op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff; + break; + case OpImmByte: + rc = decode_imm(ctxt, op, 1, true); + break; + case OpOne: + op->bytes = 1; + op->val = 1; + break; + case OpImm: + rc = decode_imm(ctxt, op, imm_size(ctxt), true); + break; case OpImplicit: /* Special instructions do their own operand decoding. */ default: @@ -3656,25 +3675,7 @@ done_prefixes: * Decode and fetch the second source operand: register, memory * or immediate. */ - switch (ctxt->d & Src2Mask) { - case Src2None: - break; - case Src2CL: - ctxt->src2.bytes = 1; - ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0xff; - break; - case Src2ImmByte: - rc = decode_imm(ctxt, &ctxt->src2, 1, true); - break; - case Src2One: - ctxt->src2.bytes = 1; - ctxt->src2.val = 1; - break; - case Src2Imm: - rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true); - break; - } - + rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask); if (rc != X86EMUL_CONTINUE) goto done; -- cgit v1.1 From 20c29ff205ea9c4e2bf4ef79b0e5b934d6c60aff Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:44 +0300 Subject: KVM: x86 emulator: free up some flag bits near src, dst Op fields are going to grow by a bit, we need two free bits. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 88d32fc..00e0904 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -88,10 +88,6 @@ #define SrcImmU16 (0xe<<5) /* Immediate operand, unsigned, 16 bits */ #define SrcDX (0xf<<5) /* Source is in DX register */ #define SrcMask (0xf<<5) -/* Generic ModRM decode. */ -#define ModRM (1<<9) -/* Destination is only written; never read. */ -#define Mov (1<<10) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ #define String (1<<13) /* String instruction (rep capable) */ @@ -102,6 +98,10 @@ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ #define Sse (1<<18) /* SSE Vector instruction */ +/* Generic ModRM decode. */ +#define ModRM (1<<19) +/* Destination is only written; never read. */ +#define Mov (1<<20) /* Misc flags */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define VendorSpecific (1<<22) /* Vendor specific instruction */ -- cgit v1.1 From 608aabe316eab9116ddd73418217277693df3bb8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:45 +0300 Subject: KVM: x86 emulator: switch OpImmUByte decode to decode_imm() Similar to SrcImmUByte. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 00e0904..a0d6ceb4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3349,10 +3349,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); break; case OpImmUByte: - op->type = OP_IMM; - op->addr.mem.ea = ctxt->_eip; - op->bytes = 1; - op->val = insn_fetch(u8, ctxt); + rc = decode_imm(ctxt, op, 1, false); break; case OpMem: case OpMem64: -- cgit v1.1 From 5217973ef899ffecdd77743aae3b633994a08cde Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:46 +0300 Subject: KVM: x86 emulator: qualify OpReg inhibit_byte_regs hack OpReg decoding has a hack that inhibits byte registers for movsx and movzx instructions. It should be replaced by something better, but meanwhile, qualify that the hack is only active for the destination operand. Note these instructions only use OpReg for the destination, but better to be explicit about it. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a0d6ceb4..17a8910 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3346,6 +3346,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, switch (d) { case OpReg: decode_register_operand(ctxt, op, + op == &ctxt->dst && ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); break; case OpImmUByte: -- cgit v1.1 From 0fe591288470aebba4104b17513c9ad5050f9d0d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:47 +0300 Subject: KVM: x86 emulator: switch src decode to decode_operand() Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 156 ++++++++++++++++++++----------------------------- 1 file changed, 63 insertions(+), 93 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 17a8910..e46809b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -44,8 +44,15 @@ #define OpImmByte 10ull /* 8-bit sign extended immediate */ #define OpOne 11ull /* Implied 1 */ #define OpImm 12ull /* Sign extended immediate */ - -#define OpBits 4 /* Width of operand field */ +#define OpMem16 13ull /* Memory operand (16-bit). */ +#define OpMem32 14ull /* Memory operand (32-bit). */ +#define OpImmU 15ull /* Immediate operand, zero extended */ +#define OpSI 16ull /* SI/ESI/RSI */ +#define OpImmFAddr 17ull /* Immediate far address */ +#define OpMemFAddr 18ull /* Far address in memory */ +#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */ + +#define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) /* @@ -71,23 +78,24 @@ #define DstDX (OpDX << DstShift) #define DstMask (OpMask << DstShift) /* Source operand type. */ -#define SrcNone (0<<5) /* No source operand. */ -#define SrcReg (1<<5) /* Register operand. */ -#define SrcMem (2<<5) /* Memory operand. */ -#define SrcMem16 (3<<5) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<5) /* Memory operand (32-bit). */ -#define SrcImm (5<<5) /* Immediate operand. */ -#define SrcImmByte (6<<5) /* 8-bit sign-extended immediate operand. */ -#define SrcOne (7<<5) /* Implied '1' */ -#define SrcImmUByte (8<<5) /* 8-bit unsigned immediate operand. */ -#define SrcImmU (9<<5) /* Immediate operand, unsigned */ -#define SrcSI (0xa<<5) /* Source is in the DS:RSI */ -#define SrcImmFAddr (0xb<<5) /* Source is immediate far address */ -#define SrcMemFAddr (0xc<<5) /* Source is far address in memory */ -#define SrcAcc (0xd<<5) /* Source Accumulator */ -#define SrcImmU16 (0xe<<5) /* Immediate operand, unsigned, 16 bits */ -#define SrcDX (0xf<<5) /* Source is in DX register */ -#define SrcMask (0xf<<5) +#define SrcShift 6 +#define SrcNone (OpNone << SrcShift) +#define SrcReg (OpReg << SrcShift) +#define SrcMem (OpMem << SrcShift) +#define SrcMem16 (OpMem16 << SrcShift) +#define SrcMem32 (OpMem32 << SrcShift) +#define SrcImm (OpImm << SrcShift) +#define SrcImmByte (OpImmByte << SrcShift) +#define SrcOne (OpOne << SrcShift) +#define SrcImmUByte (OpImmUByte << SrcShift) +#define SrcImmU (OpImmU << SrcShift) +#define SrcSI (OpSI << SrcShift) +#define SrcImmFAddr (OpImmFAddr << SrcShift) +#define SrcMemFAddr (OpMemFAddr << SrcShift) +#define SrcAcc (OpAcc << SrcShift) +#define SrcImmU16 (OpImmU16 << SrcShift) +#define SrcDX (OpDX << SrcShift) +#define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ #define String (1<<13) /* String instruction (rep capable) */ @@ -3354,13 +3362,14 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, break; case OpMem: case OpMem64: - *op = ctxt->memop; - ctxt->memopp = op; if (d == OpMem64) - op->bytes = 8; + ctxt->memop.bytes = 8; else - op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - if (ctxt->d & BitOp) + ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + mem_common: + *op = ctxt->memop; + ctxt->memopp = op; + if ((ctxt->d & BitOp) && op == &ctxt->dst) fetch_bit_operand(ctxt); op->orig_val = op->val; break; @@ -3399,6 +3408,35 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpImm: rc = decode_imm(ctxt, op, imm_size(ctxt), true); break; + case OpMem16: + ctxt->memop.bytes = 2; + goto mem_common; + case OpMem32: + ctxt->memop.bytes = 4; + goto mem_common; + case OpImmU16: + rc = decode_imm(ctxt, op, 2, false); + break; + case OpImmU: + rc = decode_imm(ctxt, op, imm_size(ctxt), false); + break; + case OpSI: + op->type = OP_MEM; + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.mem.ea = + register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); + op->addr.mem.seg = seg_override(ctxt); + op->val = 0; + break; + case OpImmFAddr: + op->type = OP_IMM; + op->addr.mem.ea = ctxt->_eip; + op->bytes = ctxt->op_bytes + 2; + insn_fetch_arr(op->valptr, op->bytes, ctxt); + break; + case OpMemFAddr: + ctxt->memop.bytes = ctxt->op_bytes + 2; + goto mem_common; case OpImplicit: /* Special instructions do their own operand decoding. */ default: @@ -3597,75 +3635,7 @@ done_prefixes: * Decode and fetch the source operand: register, memory * or immediate. */ - switch (ctxt->d & SrcMask) { - case SrcNone: - break; - case SrcReg: - decode_register_operand(ctxt, &ctxt->src, 0); - break; - case SrcMem16: - ctxt->memop.bytes = 2; - goto srcmem_common; - case SrcMem32: - ctxt->memop.bytes = 4; - goto srcmem_common; - case SrcMem: - ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - srcmem_common: - ctxt->src = ctxt->memop; - ctxt->memopp = &ctxt->src; - break; - case SrcImmU16: - rc = decode_imm(ctxt, &ctxt->src, 2, false); - break; - case SrcImm: - rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true); - break; - case SrcImmU: - rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false); - break; - case SrcImmByte: - rc = decode_imm(ctxt, &ctxt->src, 1, true); - break; - case SrcImmUByte: - rc = decode_imm(ctxt, &ctxt->src, 1, false); - break; - case SrcAcc: - ctxt->src.type = OP_REG; - ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; - fetch_register_operand(&ctxt->src); - break; - case SrcOne: - ctxt->src.bytes = 1; - ctxt->src.val = 1; - break; - case SrcSI: - ctxt->src.type = OP_MEM; - ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - ctxt->src.addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); - ctxt->src.addr.mem.seg = seg_override(ctxt); - ctxt->src.val = 0; - break; - case SrcImmFAddr: - ctxt->src.type = OP_IMM; - ctxt->src.addr.mem.ea = ctxt->_eip; - ctxt->src.bytes = ctxt->op_bytes + 2; - insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt); - break; - case SrcMemFAddr: - ctxt->memop.bytes = ctxt->op_bytes + 2; - goto srcmem_common; - break; - case SrcDX: - ctxt->src.type = OP_REG; - ctxt->src.bytes = 2; - ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; - fetch_register_operand(&ctxt->src); - break; - } - + rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask); if (rc != X86EMUL_CONTINUE) goto done; -- cgit v1.1 From 41ddf9784cb91c9e4d3a218eef3551bebe9c7362 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:48 +0300 Subject: KVM: x86 emulator: simplify OpMem64 decode Use the same technique as the other OpMem variants, and goto mem_common. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e46809b..1c95935 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3361,11 +3361,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, rc = decode_imm(ctxt, op, 1, false); break; case OpMem: - case OpMem64: - if (d == OpMem64) - ctxt->memop.bytes = 8; - else - ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; mem_common: *op = ctxt->memop; ctxt->memopp = op; @@ -3373,6 +3369,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, fetch_bit_operand(ctxt); op->orig_val = op->val; break; + case OpMem64: + ctxt->memop.bytes = 8; + goto mem_common; case OpAcc: op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; -- cgit v1.1 From c191a7a0f4d3b17cc6cee1d3f721dfe23fc7d6c6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:49 +0300 Subject: KVM: x86 emulator: streamline decode of segment registers The opcodes push %seg pop %seg l%seg, %mem, %reg (e.g. lds/les/lss/lfs/lgs) all have an segment register encoded in the instruction. To allow reuse, decode the segment number into src2 during the decode stage instead of the execution stage. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 99 +++++++++++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1c95935..ab48611 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -51,6 +51,12 @@ #define OpImmFAddr 17ull /* Immediate far address */ #define OpMemFAddr 18ull /* Far address in memory */ #define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */ +#define OpES 20ull /* ES */ +#define OpCS 21ull /* CS */ +#define OpSS 22ull /* SS */ +#define OpDS 23ull /* DS */ +#define OpFS 24ull /* FS */ +#define OpGS 25ull /* GS */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -126,6 +132,12 @@ #define Src2ImmByte (OpImmByte << Src2Shift) #define Src2One (OpOne << Src2Shift) #define Src2Imm (OpImm << Src2Shift) +#define Src2ES (OpES << Src2Shift) +#define Src2CS (OpCS << Src2Shift) +#define Src2SS (OpSS << Src2Shift) +#define Src2DS (OpDS << Src2Shift) +#define Src2FS (OpFS << Src2Shift) +#define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) #define X2(x...) x, x @@ -3101,16 +3113,19 @@ static struct gprefix pfx_0f_6f_0f_7f = { static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ I6ALU(Lock, em_add), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + D(ImplicitOps | Stack | No64 | Src2ES), + D(ImplicitOps | Stack | No64 | Src2ES), /* 0x08 - 0x0F */ I6ALU(Lock, em_or), - D(ImplicitOps | Stack | No64), N, + D(ImplicitOps | Stack | No64 | Src2CS), N, /* 0x10 - 0x17 */ I6ALU(Lock, em_adc), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + D(ImplicitOps | Stack | No64 | Src2SS), + D(ImplicitOps | Stack | No64 | Src2SS), /* 0x18 - 0x1F */ I6ALU(Lock, em_sbb), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + D(ImplicitOps | Stack | No64 | Src2DS), + D(ImplicitOps | Stack | No64 | Src2DS), /* 0x20 - 0x27 */ I6ALU(Lock, em_and), N, N, /* 0x28 - 0x2F */ @@ -3178,7 +3193,8 @@ static struct opcode opcode_table[256] = { D2bv(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), I(ImplicitOps | Stack, em_ret), - D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), + D(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES), + D(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ N, N, N, I(ImplicitOps | Stack, em_ret_far), @@ -3253,20 +3269,22 @@ static struct opcode twobyte_table[256] = { /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), + D(Stack | Src2FS), D(Stack | Src2FS), DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), + D(Stack | Src2GS), D(Stack | Src2GS), DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ D2bv(DstMem | SrcReg | ModRM | Lock), - D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), - D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), + D(DstReg | SrcMemFAddr | ModRM | Src2SS), + D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstReg | SrcMemFAddr | ModRM | Src2FS), + D(DstReg | SrcMemFAddr | ModRM | Src2GS), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, @@ -3436,6 +3454,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpMemFAddr: ctxt->memop.bytes = ctxt->op_bytes + 2; goto mem_common; + case OpES: + op->val = VCPU_SREG_ES; + break; + case OpCS: + op->val = VCPU_SREG_CS; + break; + case OpSS: + op->val = VCPU_SREG_SS; + break; + case OpDS: + op->val = VCPU_SREG_DS; + break; + case OpFS: + op->val = VCPU_SREG_FS; + break; + case OpGS: + op->val = VCPU_SREG_GS; + break; case OpImplicit: /* Special instructions do their own operand decoding. */ default: @@ -3803,26 +3839,15 @@ special_insn: switch (ctxt->b) { case 0x06: /* push es */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_ES); - break; - case 0x07: /* pop es */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES); - break; case 0x0e: /* push cs */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_CS); - break; case 0x16: /* push ss */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_SS); - break; - case 0x17: /* pop ss */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS); - break; case 0x1e: /* push ds */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_DS); + rc = emulate_push_sreg(ctxt, ctxt->src2.val); break; + case 0x07: /* pop es */ + case 0x17: /* pop ss */ case 0x1f: /* pop ds */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS); - break; + rc = emulate_pop_sreg(ctxt, ctxt->src2.val); case 0x40 ... 0x47: /* inc r16/r32 */ emulate_1op(ctxt, "inc"); break; @@ -3869,10 +3894,8 @@ special_insn: rc = em_grp2(ctxt); break; case 0xc4: /* les */ - rc = emulate_load_segment(ctxt, VCPU_SREG_ES); - break; case 0xc5: /* lds */ - rc = emulate_load_segment(ctxt, VCPU_SREG_DS); + rc = emulate_load_segment(ctxt, ctxt->src2.val); break; case 0xcc: /* int3 */ rc = emulate_int(ctxt, 3); @@ -4078,10 +4101,12 @@ twobyte_insn: ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; case 0xa0: /* push fs */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_FS); + case 0xa8: /* push gs */ + rc = emulate_push_sreg(ctxt, ctxt->src2.val); break; case 0xa1: /* pop fs */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS); + case 0xa9: /* pop gs */ + rc = emulate_pop_sreg(ctxt, ctxt->src2.val); break; case 0xa3: bt: /* bt */ @@ -4094,12 +4119,6 @@ twobyte_insn: case 0xa5: /* shld cl, r, r/m */ emulate_2op_cl(ctxt, "shld"); break; - case 0xa8: /* push gs */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_GS); - break; - case 0xa9: /* pop gs */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS); - break; case 0xab: bts: /* bts */ emulate_2op_SrcV_nobyte(ctxt, "bts"); @@ -4128,18 +4147,14 @@ twobyte_insn: } break; case 0xb2: /* lss */ - rc = emulate_load_segment(ctxt, VCPU_SREG_SS); + case 0xb4: /* lfs */ + case 0xb5: /* lgs */ + rc = emulate_load_segment(ctxt, ctxt->src2.val); break; case 0xb3: btr: /* btr */ emulate_2op_SrcV_nobyte(ctxt, "btr"); break; - case 0xb4: /* lfs */ - rc = emulate_load_segment(ctxt, VCPU_SREG_FS); - break; - case 0xb5: /* lgs */ - rc = emulate_load_segment(ctxt, VCPU_SREG_GS); - break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val -- cgit v1.1 From d4b4325fdb66739a74148f90da360ff82ce707d4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:50 +0300 Subject: KVM: x86 emulator: switch lds/les/lss/lfs/lgs to direct decode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ab48611..bd3e488 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1828,8 +1828,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) return rc; } -static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg) +static int em_lseg(struct x86_emulate_ctxt *ctxt) { + int seg = ctxt->src2.val; unsigned short sel; int rc; @@ -3193,8 +3194,8 @@ static struct opcode opcode_table[256] = { D2bv(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), I(ImplicitOps | Stack, em_ret), - D(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES), - D(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS), + I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), + I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ N, N, N, I(ImplicitOps | Stack, em_ret_far), @@ -3281,10 +3282,10 @@ static struct opcode twobyte_table[256] = { D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ D2bv(DstMem | SrcReg | ModRM | Lock), - D(DstReg | SrcMemFAddr | ModRM | Src2SS), + I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), D(DstMem | SrcReg | ModRM | BitOp | Lock), - D(DstReg | SrcMemFAddr | ModRM | Src2FS), - D(DstReg | SrcMemFAddr | ModRM | Src2GS), + I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), + I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, @@ -3893,10 +3894,6 @@ special_insn: case 0xc0 ... 0xc1: rc = em_grp2(ctxt); break; - case 0xc4: /* les */ - case 0xc5: /* lds */ - rc = emulate_load_segment(ctxt, ctxt->src2.val); - break; case 0xcc: /* int3 */ rc = emulate_int(ctxt, 3); break; @@ -4146,11 +4143,6 @@ twobyte_insn: ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; } break; - case 0xb2: /* lss */ - case 0xb4: /* lfs */ - case 0xb5: /* lgs */ - rc = emulate_load_segment(ctxt, ctxt->src2.val); - break; case 0xb3: btr: /* btr */ emulate_2op_SrcV_nobyte(ctxt, "btr"); -- cgit v1.1 From 1cd196ea42c526549ded4fd29809c3fdaa4a7f41 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:51 +0300 Subject: KVM: x86 emulator: convert push %sreg/pop %sreg to direct decode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 44 +++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index bd3e488..f1e3be18 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1458,15 +1458,18 @@ static int em_popf(struct x86_emulate_ctxt *ctxt) return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); } -static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) +static int em_push_sreg(struct x86_emulate_ctxt *ctxt) { + int seg = ctxt->src2.val; + ctxt->src.val = get_segment_selector(ctxt, seg); return em_push(ctxt); } -static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg) +static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) { + int seg = ctxt->src2.val; unsigned long selector; int rc; @@ -3114,19 +3117,20 @@ static struct gprefix pfx_0f_6f_0f_7f = { static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ I6ALU(Lock, em_add), - D(ImplicitOps | Stack | No64 | Src2ES), - D(ImplicitOps | Stack | No64 | Src2ES), + I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), + I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), /* 0x08 - 0x0F */ I6ALU(Lock, em_or), - D(ImplicitOps | Stack | No64 | Src2CS), N, + I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), + N, /* 0x10 - 0x17 */ I6ALU(Lock, em_adc), - D(ImplicitOps | Stack | No64 | Src2SS), - D(ImplicitOps | Stack | No64 | Src2SS), + I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg), + I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg), /* 0x18 - 0x1F */ I6ALU(Lock, em_sbb), - D(ImplicitOps | Stack | No64 | Src2DS), - D(ImplicitOps | Stack | No64 | Src2DS), + I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), + I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), /* 0x20 - 0x27 */ I6ALU(Lock, em_and), N, N, /* 0x28 - 0x2F */ @@ -3270,12 +3274,12 @@ static struct opcode twobyte_table[256] = { /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ - D(Stack | Src2FS), D(Stack | Src2FS), + I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ - D(Stack | Src2GS), D(Stack | Src2GS), + I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), @@ -3839,16 +3843,6 @@ special_insn: goto twobyte_insn; switch (ctxt->b) { - case 0x06: /* push es */ - case 0x0e: /* push cs */ - case 0x16: /* push ss */ - case 0x1e: /* push ds */ - rc = emulate_push_sreg(ctxt, ctxt->src2.val); - break; - case 0x07: /* pop es */ - case 0x17: /* pop ss */ - case 0x1f: /* pop ds */ - rc = emulate_pop_sreg(ctxt, ctxt->src2.val); case 0x40 ... 0x47: /* inc r16/r32 */ emulate_1op(ctxt, "inc"); break; @@ -4097,14 +4091,6 @@ twobyte_insn: case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xa0: /* push fs */ - case 0xa8: /* push gs */ - rc = emulate_push_sreg(ctxt, ctxt->src2.val); - break; - case 0xa1: /* pop fs */ - case 0xa9: /* pop gs */ - rc = emulate_pop_sreg(ctxt, ctxt->src2.val); - break; case 0xa3: bt: /* bt */ ctxt->dst.type = OP_NONE; -- cgit v1.1 From 7460fb4a340033107530df19e7e125bd0969bfb2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Sep 2011 13:43:14 +0300 Subject: KVM: Fix simultaneous NMIs If simultaneous NMIs happen, we're supposed to queue the second and next (collapsing them), but currently we sometimes collapse the second into the first. Fix by using a counter for pending NMIs instead of a bool; since the counter limit depends on whether the processor is currently in an NMI handler, which can only be checked in vcpu context (via the NMI mask), we add a new KVM_REQ_NMI to request recalculation of the counter. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/x86.c | 48 ++++++++++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6ab4241..ab62711 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -413,8 +413,9 @@ struct kvm_vcpu_arch { u32 tsc_catchup_mult; s8 tsc_catchup_shift; - bool nmi_pending; - bool nmi_injected; + atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ + unsigned nmi_pending; /* NMI queued after currently running handler */ + bool nmi_injected; /* Trying to inject an NMI this entry */ struct mtrr_state_type mtrr_state; u32 pat; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6b37f18..d51e407 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -83,6 +83,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static void update_cr8_intercept(struct kvm_vcpu *vcpu); static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +static void process_nmi(struct kvm_vcpu *vcpu); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -359,8 +360,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.nmi_pending = 1; + atomic_inc(&vcpu->arch.nmi_queued); + kvm_make_request(KVM_REQ_NMI, vcpu); } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -2827,6 +2828,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { + process_nmi(vcpu); events->exception.injected = vcpu->arch.exception.pending && !kvm_exception_is_soft(vcpu->arch.exception.nr); @@ -2844,7 +2846,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); events->nmi.injected = vcpu->arch.nmi_injected; - events->nmi.pending = vcpu->arch.nmi_pending; + events->nmi.pending = vcpu->arch.nmi_pending != 0; events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); events->nmi.pad = 0; @@ -2864,6 +2866,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; + process_nmi(vcpu); vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -4763,7 +4766,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) kvm_set_rflags(vcpu, ctxt->eflags); if (irq == NMI_VECTOR) - vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_pending = 0; else vcpu->arch.interrupt.pending = false; @@ -5572,7 +5575,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) /* try to inject new event if pending */ if (vcpu->arch.nmi_pending) { if (kvm_x86_ops->nmi_allowed(vcpu)) { - vcpu->arch.nmi_pending = false; + --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; kvm_x86_ops->set_nmi(vcpu); } @@ -5604,10 +5607,26 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) } } +static void process_nmi(struct kvm_vcpu *vcpu) +{ + unsigned limit = 2; + + /* + * x86 is limited to one NMI running, and one NMI pending after it. + * If an NMI is already in progress, limit further NMIs to just one. + * Otherwise, allow two (and we'll inject the first one immediately). + */ + if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) + limit = 1; + + vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); + vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool nmi_pending; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; @@ -5647,6 +5666,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); + if (kvm_check_request(KVM_REQ_NMI, vcpu)) + process_nmi(vcpu); } @@ -5654,19 +5675,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(r)) goto out; - /* - * An NMI can be injected between local nmi_pending read and - * vcpu->arch.nmi_pending read inside inject_pending_event(). - * But in that case, KVM_REQ_EVENT will be set, which makes - * the race described above benign. - */ - nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending); - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { inject_pending_event(vcpu); /* enable NMI/IRQ window open exits if needed */ - if (nmi_pending) + if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); @@ -6374,7 +6387,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) { - vcpu->arch.nmi_pending = false; + atomic_set(&vcpu->arch.nmi_queued, 0); + vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; vcpu->arch.switch_db_regs = 0; @@ -6649,7 +6663,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED - || vcpu->arch.nmi_pending || + || atomic_read(&vcpu->arch.nmi_queued) || (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)); } -- cgit v1.1 From b90dfb0419a79a90395e04fee3fbda3c12ba8237 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Thu, 22 Sep 2011 16:53:58 +0800 Subject: x86: TSC deadline definitions This pre-defination is preparing for KVM tsc deadline timer emulation, but theirself are not kvm specific. Signed-off-by: Liu, Jinsong Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/apicdef.h | 2 ++ arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/msr-index.h | 2 ++ 3 files changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 34595d5..3925d80 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -100,7 +100,9 @@ #define APIC_TIMER_BASE_CLKIN 0x0 #define APIC_TIMER_BASE_TMBASE 0x1 #define APIC_TIMER_BASE_DIV 0x2 +#define APIC_LVT_TIMER_ONESHOT (0 << 17) #define APIC_LVT_TIMER_PERIODIC (1 << 17) +#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) #define APIC_LVT_MASKED (1 << 16) #define APIC_LVT_LEVEL_TRIGGER (1 << 15) #define APIC_LVT_REMOTE_IRR (1 << 14) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 88b23a4..94dfb0a 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -120,6 +120,7 @@ #define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ #define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ #define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* Tsc deadline timer */ #define X86_FEATURE_AES (4*32+25) /* AES instructions */ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d52609a..a6962d9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -229,6 +229,8 @@ #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) +#define MSR_IA32_TSCDEADLINE 0x000006e0 + #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b -- cgit v1.1 From de0428a7ad4856c7b5b8a2792488ac893e6f3faa Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Tue, 30 Aug 2011 20:41:05 -0300 Subject: x86, perf: Clean up perf_event cpu code The CPU support for perf events on x86 was implemented via included C files with #ifdefs. Clean this up by creating a new header file and compiling the vendor-specific files as needed. Signed-off-by: Kevin Winchester Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1314747665-2090-1-git-send-email-kjwinchester@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 5 + arch/x86/kernel/cpu/perf_event.c | 369 ++------------------- arch/x86/kernel/cpu/perf_event.h | 493 +++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_amd.c | 18 +- arch/x86/kernel/cpu/perf_event_intel.c | 53 ++-- arch/x86/kernel/cpu/perf_event_intel_ds.c | 79 ++--- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 28 +- arch/x86/kernel/cpu/perf_event_p4.c | 10 +- arch/x86/kernel/cpu/perf_event_p6.c | 9 +- 9 files changed, 604 insertions(+), 460 deletions(-) create mode 100644 arch/x86/kernel/cpu/perf_event.h (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6042981..1044fd7 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,6 +28,11 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o +ifdef CONFIG_PERF_EVENTS +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +endif + obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 05df6e3..8ab8911 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -32,6 +32,8 @@ #include #include +#include "perf_event.h" + #if 0 #undef wrmsrl #define wrmsrl(msr, val) \ @@ -43,285 +45,17 @@ do { \ } while (0) #endif -/* - * | NHM/WSM | SNB | - * register ------------------------------- - * | HT | no HT | HT | no HT | - *----------------------------------------- - * offcore | core | core | cpu | core | - * lbr_sel | core | core | cpu | core | - * ld_lat | cpu | core | cpu | core | - *----------------------------------------- - * - * Given that there is a small number of shared regs, - * we can pre-allocate their slot in the per-cpu - * per-core reg tables. - */ -enum extra_reg_type { - EXTRA_REG_NONE = -1, /* not used */ - - EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ - EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ - - EXTRA_REG_MAX /* number of entries needed */ -}; - -struct event_constraint { - union { - unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 idxmsk64; - }; - u64 code; - u64 cmask; - int weight; -}; - -struct amd_nb { - int nb_id; /* NorthBridge id */ - int refcnt; /* reference count */ - struct perf_event *owners[X86_PMC_IDX_MAX]; - struct event_constraint event_constraints[X86_PMC_IDX_MAX]; -}; - -struct intel_percore; - -#define MAX_LBR_ENTRIES 16 - -struct cpu_hw_events { - /* - * Generic x86 PMC bits - */ - struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int enabled; - - int n_events; - int n_added; - int n_txn; - int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ - u64 tags[X86_PMC_IDX_MAX]; - struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ - - unsigned int group_flag; - - /* - * Intel DebugStore bits - */ - struct debug_store *ds; - u64 pebs_enabled; - - /* - * Intel LBR bits - */ - int lbr_users; - void *lbr_context; - struct perf_branch_stack lbr_stack; - struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - - /* - * manage shared (per-core, per-cpu) registers - * used on Intel NHM/WSM/SNB - */ - struct intel_shared_regs *shared_regs; - - /* - * AMD specific bits - */ - struct amd_nb *amd_nb; - - void *kfree_on_online; -}; - -#define __EVENT_CONSTRAINT(c, n, m, w) {\ - { .idxmsk64 = (n) }, \ - .code = (c), \ - .cmask = (m), \ - .weight = (w), \ -} - -#define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) - -/* - * Constraint on the Event code. - */ -#define INTEL_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) - -/* - * Constraint on the Event code + UMask + fixed-mask - * - * filter mask to validate fixed counter events. - * the following filters disqualify for fixed counters: - * - inv - * - edge - * - cnt-mask - * The other filters are supported by fixed counters. - * The any-thread option is supported starting with v3. - */ -#define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK) - -/* - * Constraint on the Event code + UMask - */ -#define INTEL_UEVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) - -#define EVENT_CONSTRAINT_END \ - EVENT_CONSTRAINT(0, 0, 0) - -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->weight; (e)++) - -/* - * Per register state. - */ -struct er_account { - raw_spinlock_t lock; /* per-core: protect structure */ - u64 config; /* extra MSR config */ - u64 reg; /* extra MSR number */ - atomic_t ref; /* reference count */ -}; - -/* - * Extra registers for specific events. - * - * Some events need large masks and require external MSRs. - * Those extra MSRs end up being shared for all events on - * a PMU and sometimes between PMU of sibling HT threads. - * In either case, the kernel needs to handle conflicting - * accesses to those extra, shared, regs. The data structure - * to manage those registers is stored in cpu_hw_event. - */ -struct extra_reg { - unsigned int event; - unsigned int msr; - u64 config_mask; - u64 valid_mask; - int idx; /* per_xxx->regs[] reg index */ -}; - -#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ - .event = (e), \ - .msr = (ms), \ - .config_mask = (m), \ - .valid_mask = (vm), \ - .idx = EXTRA_REG_##i \ - } - -#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) +struct x86_pmu x86_pmu __read_mostly; -#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) - -union perf_capabilities { - struct { - u64 lbr_format : 6; - u64 pebs_trap : 1; - u64 pebs_arch_reg : 1; - u64 pebs_format : 4; - u64 smm_freeze : 1; - }; - u64 capabilities; -}; - -/* - * struct x86_pmu - generic x86 pmu - */ -struct x86_pmu { - /* - * Generic x86 PMC bits - */ - const char *name; - int version; - int (*handle_irq)(struct pt_regs *); - void (*disable_all)(void); - void (*enable_all)(int added); - void (*enable)(struct perf_event *); - void (*disable)(struct perf_event *); - int (*hw_config)(struct perf_event *event); - int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); - unsigned eventsel; - unsigned perfctr; - u64 (*event_map)(int); - int max_events; - int num_counters; - int num_counters_fixed; - int cntval_bits; - u64 cntval_mask; - int apic; - u64 max_period; - struct event_constraint * - (*get_event_constraints)(struct cpu_hw_events *cpuc, - struct perf_event *event); - - void (*put_event_constraints)(struct cpu_hw_events *cpuc, - struct perf_event *event); - struct event_constraint *event_constraints; - void (*quirks)(void); - int perfctr_second_write; - - int (*cpu_prepare)(int cpu); - void (*cpu_starting)(int cpu); - void (*cpu_dying)(int cpu); - void (*cpu_dead)(int cpu); - - /* - * Intel Arch Perfmon v2+ - */ - u64 intel_ctrl; - union perf_capabilities intel_cap; - - /* - * Intel DebugStore bits - */ - int bts, pebs; - int bts_active, pebs_active; - int pebs_record_size; - void (*drain_pebs)(struct pt_regs *regs); - struct event_constraint *pebs_constraints; - - /* - * Intel LBR - */ - unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ - int lbr_nr; /* hardware stack size */ - - /* - * Extra registers for events - */ - struct extra_reg *extra_regs; - unsigned int er_flags; -}; - -#define ERF_NO_HT_SHARING 1 -#define ERF_HAS_RSP_1 2 - -static struct x86_pmu x86_pmu __read_mostly; - -static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static int x86_perf_event_set_period(struct perf_event *event); - -/* - * Generalized hw caching related hw_event table, filled - * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'hw_event makes no sense on - * this CPU', any other value means the raw hw_event - * ID. - */ - -#define C(x) PERF_COUNT_HW_CACHE_##x - -static u64 __read_mostly hw_cache_event_ids +u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static u64 __read_mostly hw_cache_extra_regs +u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; @@ -331,8 +65,7 @@ static u64 __read_mostly hw_cache_extra_regs * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ -static u64 -x86_perf_event_update(struct perf_event *event) +u64 x86_perf_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.cntval_bits; @@ -375,30 +108,6 @@ again: return new_raw_count; } -static inline int x86_pmu_addr_offset(int index) -{ - int offset; - - /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */ - alternative_io(ASM_NOP2, - "shll $1, %%eax", - X86_FEATURE_PERFCTR_CORE, - "=a" (offset), - "a" (index)); - - return offset; -} - -static inline unsigned int x86_pmu_config_addr(int index) -{ - return x86_pmu.eventsel + x86_pmu_addr_offset(index); -} - -static inline unsigned int x86_pmu_event_addr(int index) -{ - return x86_pmu.perfctr + x86_pmu_addr_offset(index); -} - /* * Find and validate any extra registers to set up. */ @@ -534,9 +243,6 @@ msr_fail: return false; } -static void reserve_ds_buffers(void); -static void release_ds_buffers(void); - static void hw_perf_event_destroy(struct perf_event *event) { if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { @@ -585,7 +291,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return x86_pmu_extra_regs(val, event); } -static int x86_setup_perfctr(struct perf_event *event) +int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; @@ -649,7 +355,7 @@ static int x86_setup_perfctr(struct perf_event *event) return 0; } -static int x86_pmu_hw_config(struct perf_event *event) +int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { int precise = 0; @@ -725,7 +431,7 @@ static int __x86_pmu_event_init(struct perf_event *event) return x86_pmu.hw_config(event); } -static void x86_pmu_disable_all(void) +void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; @@ -760,15 +466,7 @@ static void x86_pmu_disable(struct pmu *pmu) x86_pmu.disable_all(); } -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, - u64 enable_mask) -{ - if (hwc->extra_reg.reg) - wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); - wrmsrl(hwc->config_base, hwc->config | enable_mask); -} - -static void x86_pmu_enable_all(int added) +void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; @@ -790,7 +488,7 @@ static inline int is_x86_event(struct perf_event *event) return event->pmu == &pmu; } -static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; @@ -961,7 +659,6 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, } static void x86_pmu_start(struct perf_event *event, int flags); -static void x86_pmu_stop(struct perf_event *event, int flags); static void x86_pmu_enable(struct pmu *pmu) { @@ -1033,21 +730,13 @@ static void x86_pmu_enable(struct pmu *pmu) x86_pmu.enable_all(added); } -static inline void x86_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); -} - static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. * To be called with the event disabled in hw: */ -static int -x86_perf_event_set_period(struct perf_event *event) +int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); @@ -1107,7 +796,7 @@ x86_perf_event_set_period(struct perf_event *event) return ret; } -static void x86_pmu_enable_event(struct perf_event *event) +void x86_pmu_enable_event(struct perf_event *event) { if (__this_cpu_read(cpu_hw_events.enabled)) __x86_pmu_enable_event(&event->hw, @@ -1246,7 +935,7 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void x86_pmu_stop(struct perf_event *event, int flags) +void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; @@ -1299,7 +988,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) perf_event_update_userpage(event); } -static int x86_pmu_handle_irq(struct pt_regs *regs) +int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; @@ -1439,30 +1128,8 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = { .priority = NMI_LOCAL_LOW_PRIOR, }; -static struct event_constraint unconstrained; -static struct event_constraint emptyconstraint; - -static struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) -{ - struct event_constraint *c; - - if (x86_pmu.event_constraints) { - for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) - return c; - } - } - - return &unconstrained; -} - -#include "perf_event_amd.c" -#include "perf_event_p6.c" -#include "perf_event_p4.c" -#include "perf_event_intel_lbr.c" -#include "perf_event_intel_ds.c" -#include "perf_event_intel.c" +struct event_constraint emptyconstraint; +struct event_constraint unconstrained; static int __cpuinit x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h new file mode 100644 index 0000000..fb330b0 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event.h @@ -0,0 +1,493 @@ +/* + * Performance events x86 architecture header + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2009 Intel Corporation, + * Copyright (C) 2009 Google, Inc., Stephane Eranian + * + * For licencing details see kernel-base/COPYING + */ + +#include + +/* + * | NHM/WSM | SNB | + * register ------------------------------- + * | HT | no HT | HT | no HT | + *----------------------------------------- + * offcore | core | core | cpu | core | + * lbr_sel | core | core | cpu | core | + * ld_lat | cpu | core | cpu | core | + *----------------------------------------- + * + * Given that there is a small number of shared regs, + * we can pre-allocate their slot in the per-cpu + * per-core reg tables. + */ +enum extra_reg_type { + EXTRA_REG_NONE = -1, /* not used */ + + EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ + EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + + EXTRA_REG_MAX /* number of entries needed */ +}; + +struct event_constraint { + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64; + }; + u64 code; + u64 cmask; + int weight; +}; + +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 4 + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +/* + * Per register state. + */ +struct er_account { + raw_spinlock_t lock; /* per-core: protect structure */ + u64 config; /* extra MSR config */ + u64 reg; /* extra MSR number */ + atomic_t ref; /* reference count */ +}; + +/* + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. + */ +struct intel_shared_regs { + struct er_account regs[EXTRA_REG_MAX]; + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ +}; + +#define MAX_LBR_ENTRIES 16 + +struct cpu_hw_events { + /* + * Generic x86 PMC bits + */ + struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int enabled; + + int n_events; + int n_added; + int n_txn; + int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + + unsigned int group_flag; + + /* + * Intel DebugStore bits + */ + struct debug_store *ds; + u64 pebs_enabled; + + /* + * Intel LBR bits + */ + int lbr_users; + void *lbr_context; + struct perf_branch_stack lbr_stack; + struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + + /* + * manage shared (per-core, per-cpu) registers + * used on Intel NHM/WSM/SNB + */ + struct intel_shared_regs *shared_regs; + + /* + * AMD specific bits + */ + struct amd_nb *amd_nb; + + void *kfree_on_online; +}; + +#define __EVENT_CONSTRAINT(c, n, m, w) {\ + { .idxmsk64 = (n) }, \ + .code = (c), \ + .cmask = (m), \ + .weight = (w), \ +} + +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) + +/* + * Constraint on the Event code. + */ +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) + +/* + * Constraint on the Event code + UMask + fixed-mask + * + * filter mask to validate fixed counter events. + * the following filters disqualify for fixed counters: + * - inv + * - edge + * - cnt-mask + * The other filters are supported by fixed counters. + * The any-thread option is supported starting with v3. + */ +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK) + +/* + * Constraint on the Event code + UMask + */ +#define INTEL_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) + +#define EVENT_CONSTRAINT_END \ + EVENT_CONSTRAINT(0, 0, 0) + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->weight; (e)++) + +/* + * Extra registers for specific events. + * + * Some events need large masks and require external MSRs. + * Those extra MSRs end up being shared for all events on + * a PMU and sometimes between PMU of sibling HT threads. + * In either case, the kernel needs to handle conflicting + * accesses to those extra, shared, regs. The data structure + * to manage those registers is stored in cpu_hw_event. + */ +struct extra_reg { + unsigned int event; + unsigned int msr; + u64 config_mask; + u64 valid_mask; + int idx; /* per_xxx->regs[] reg index */ +}; + +#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + .idx = EXTRA_REG_##i \ + } + +#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) + +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) + +union perf_capabilities { + struct { + u64 lbr_format:6; + u64 pebs_trap:1; + u64 pebs_arch_reg:1; + u64 pebs_format:4; + u64 smm_freeze:1; + }; + u64 capabilities; +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + /* + * Generic x86 PMC bits + */ + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(int added); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); + int (*hw_config)(struct perf_event *event); + int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + unsigned eventsel; + unsigned perfctr; + u64 (*event_map)(int); + int max_events; + int num_counters; + int num_counters_fixed; + int cntval_bits; + u64 cntval_mask; + int apic; + u64 max_period; + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + struct event_constraint *event_constraints; + void (*quirks)(void); + int perfctr_second_write; + + int (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); + + /* + * Intel Arch Perfmon v2+ + */ + u64 intel_ctrl; + union perf_capabilities intel_cap; + + /* + * Intel DebugStore bits + */ + int bts, pebs; + int bts_active, pebs_active; + int pebs_record_size; + void (*drain_pebs)(struct pt_regs *regs); + struct event_constraint *pebs_constraints; + + /* + * Intel LBR + */ + unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ + int lbr_nr; /* hardware stack size */ + + /* + * Extra registers for events + */ + struct extra_reg *extra_regs; + unsigned int er_flags; +}; + +#define ERF_NO_HT_SHARING 1 +#define ERF_HAS_RSP_1 2 + +extern struct x86_pmu x86_pmu __read_mostly; + +DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +int x86_perf_event_set_period(struct perf_event *event); + +/* + * Generalized hw caching related hw_event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +extern u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +extern u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +u64 x86_perf_event_update(struct perf_event *event); + +static inline int x86_pmu_addr_offset(int index) +{ + int offset; + + /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */ + alternative_io(ASM_NOP2, + "shll $1, %%eax", + X86_FEATURE_PERFCTR_CORE, + "=a" (offset), + "a" (index)); + + return offset; +} + +static inline unsigned int x86_pmu_config_addr(int index) +{ + return x86_pmu.eventsel + x86_pmu_addr_offset(index); +} + +static inline unsigned int x86_pmu_event_addr(int index) +{ + return x86_pmu.perfctr + x86_pmu_addr_offset(index); +} + +int x86_setup_perfctr(struct perf_event *event); + +int x86_pmu_hw_config(struct perf_event *event); + +void x86_pmu_disable_all(void); + +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, + u64 enable_mask) +{ + if (hwc->extra_reg.reg) + wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); + wrmsrl(hwc->config_base, hwc->config | enable_mask); +} + +void x86_pmu_enable_all(int added); + +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); + +void x86_pmu_stop(struct perf_event *event, int flags); + +static inline void x86_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +void x86_pmu_enable_event(struct perf_event *event); + +int x86_pmu_handle_irq(struct pt_regs *regs); + +extern struct event_constraint emptyconstraint; + +extern struct event_constraint unconstrained; + +#ifdef CONFIG_CPU_SUP_AMD + +int amd_pmu_init(void); + +#else /* CONFIG_CPU_SUP_AMD */ + +static inline int amd_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_AMD */ + +#ifdef CONFIG_CPU_SUP_INTEL + +int intel_pmu_save_and_restart(struct perf_event *event); + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event); + +struct intel_shared_regs *allocate_shared_regs(int cpu); + +int intel_pmu_init(void); + +void init_debug_store_on_cpu(int cpu); + +void fini_debug_store_on_cpu(int cpu); + +void release_ds_buffers(void); + +void reserve_ds_buffers(void); + +extern struct event_constraint bts_constraint; + +void intel_pmu_enable_bts(u64 config); + +void intel_pmu_disable_bts(void); + +int intel_pmu_drain_bts_buffer(void); + +extern struct event_constraint intel_core2_pebs_event_constraints[]; + +extern struct event_constraint intel_atom_pebs_event_constraints[]; + +extern struct event_constraint intel_nehalem_pebs_event_constraints[]; + +extern struct event_constraint intel_westmere_pebs_event_constraints[]; + +extern struct event_constraint intel_snb_pebs_event_constraints[]; + +struct event_constraint *intel_pebs_constraints(struct perf_event *event); + +void intel_pmu_pebs_enable(struct perf_event *event); + +void intel_pmu_pebs_disable(struct perf_event *event); + +void intel_pmu_pebs_enable_all(void); + +void intel_pmu_pebs_disable_all(void); + +void intel_ds_init(void); + +void intel_pmu_lbr_reset(void); + +void intel_pmu_lbr_enable(struct perf_event *event); + +void intel_pmu_lbr_disable(struct perf_event *event); + +void intel_pmu_lbr_enable_all(void); + +void intel_pmu_lbr_disable_all(void); + +void intel_pmu_lbr_read(void); + +void intel_pmu_lbr_init_core(void); + +void intel_pmu_lbr_init_nhm(void); + +void intel_pmu_lbr_init_atom(void); + +int p4_pmu_init(void); + +int p6_pmu_init(void); + +#else /* CONFIG_CPU_SUP_INTEL */ + +static inline void reserve_ds_buffers(void) +{ +} + +static inline void release_ds_buffers(void) +{ +} + +static inline int intel_pmu_init(void) +{ + return 0; +} + +static inline struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + return NULL; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index ee9436c..ed334c8 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,4 +1,9 @@ -#ifdef CONFIG_CPU_SUP_AMD +#include +#include +#include +#include + +#include "perf_event.h" static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -573,7 +578,7 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { #endif }; -static __init int amd_pmu_init(void) +__init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) @@ -602,12 +607,3 @@ static __init int amd_pmu_init(void) return 0; } - -#else /* CONFIG_CPU_SUP_AMD */ - -static int amd_pmu_init(void) -{ - return 0; -} - -#endif diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3751494..61fa357 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,16 +1,19 @@ -#ifdef CONFIG_CPU_SUP_INTEL - /* * Per core/cpu state * * Used to coordinate shared registers between HT threads or * among events on a single PMU. */ -struct intel_shared_regs { - struct er_account regs[EXTRA_REG_MAX]; - int refcnt; /* per-core: #HT threads */ - unsigned core_id; /* per-core: core id */ -}; + +#include +#include +#include +#include + +#include +#include + +#include "perf_event.h" /* * Intel PerfMon, used on Core and later. @@ -945,7 +948,7 @@ static void intel_pmu_enable_event(struct perf_event *event) * Save and restart an expired event. Called by NMI contexts, * so it has to be careful about preempting normal event ops: */ -static int intel_pmu_save_and_restart(struct perf_event *event) +int intel_pmu_save_and_restart(struct perf_event *event) { x86_perf_event_update(event); return x86_perf_event_set_period(event); @@ -1197,6 +1200,21 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc, return c; } +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &unconstrained; +} + static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { @@ -1309,7 +1327,7 @@ static __initconst const struct x86_pmu core_pmu = { .event_constraints = intel_core_event_constraints, }; -static struct intel_shared_regs *allocate_shared_regs(int cpu) +struct intel_shared_regs *allocate_shared_regs(int cpu) { struct intel_shared_regs *regs; int i; @@ -1441,7 +1459,7 @@ static void intel_clovertown_quirks(void) x86_pmu.pebs_constraints = NULL; } -static __init int intel_pmu_init(void) +__init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -1597,7 +1615,7 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_snb_event_constraints; - x86_pmu.pebs_constraints = intel_snb_pebs_events; + x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; @@ -1628,16 +1646,3 @@ static __init int intel_pmu_init(void) } return 0; } - -#else /* CONFIG_CPU_SUP_INTEL */ - -static int intel_pmu_init(void) -{ - return 0; -} - -static struct intel_shared_regs *allocate_shared_regs(int cpu) -{ - return NULL; -} -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 1b1ef3a..c0d238f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1,7 +1,10 @@ -#ifdef CONFIG_CPU_SUP_INTEL +#include +#include +#include -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 4 +#include + +#include "perf_event.h" /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 @@ -37,24 +40,7 @@ struct pebs_record_nhm { u64 status, dla, dse, lat; }; -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; -}; - -static void init_debug_store_on_cpu(int cpu) +void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -66,7 +52,7 @@ static void init_debug_store_on_cpu(int cpu) (u32)((u64)(unsigned long)ds >> 32)); } -static void fini_debug_store_on_cpu(int cpu) +void fini_debug_store_on_cpu(int cpu) { if (!per_cpu(cpu_hw_events, cpu).ds) return; @@ -175,7 +161,7 @@ static void release_ds_buffer(int cpu) kfree(ds); } -static void release_ds_buffers(void) +void release_ds_buffers(void) { int cpu; @@ -194,7 +180,7 @@ static void release_ds_buffers(void) put_online_cpus(); } -static void reserve_ds_buffers(void) +void reserve_ds_buffers(void) { int bts_err = 0, pebs_err = 0; int cpu; @@ -260,10 +246,10 @@ static void reserve_ds_buffers(void) * BTS */ -static struct event_constraint bts_constraint = +struct event_constraint bts_constraint = EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); -static void intel_pmu_enable_bts(u64 config) +void intel_pmu_enable_bts(u64 config) { unsigned long debugctlmsr; @@ -282,7 +268,7 @@ static void intel_pmu_enable_bts(u64 config) update_debugctlmsr(debugctlmsr); } -static void intel_pmu_disable_bts(void) +void intel_pmu_disable_bts(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long debugctlmsr; @@ -299,7 +285,7 @@ static void intel_pmu_disable_bts(void) update_debugctlmsr(debugctlmsr); } -static int intel_pmu_drain_bts_buffer(void) +int intel_pmu_drain_bts_buffer(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct debug_store *ds = cpuc->ds; @@ -361,7 +347,7 @@ static int intel_pmu_drain_bts_buffer(void) /* * PEBS */ -static struct event_constraint intel_core2_pebs_event_constraints[] = { +struct event_constraint intel_core2_pebs_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ @@ -370,14 +356,14 @@ static struct event_constraint intel_core2_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; -static struct event_constraint intel_atom_pebs_event_constraints[] = { +struct event_constraint intel_atom_pebs_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_pebs_event_constraints[] = { +struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ @@ -392,7 +378,7 @@ static struct event_constraint intel_nehalem_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; -static struct event_constraint intel_westmere_pebs_event_constraints[] = { +struct event_constraint intel_westmere_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ @@ -407,7 +393,7 @@ static struct event_constraint intel_westmere_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; -static struct event_constraint intel_snb_pebs_events[] = { +struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ @@ -428,8 +414,7 @@ static struct event_constraint intel_snb_pebs_events[] = { EVENT_CONSTRAINT_END }; -static struct event_constraint * -intel_pebs_constraints(struct perf_event *event) +struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -446,7 +431,7 @@ intel_pebs_constraints(struct perf_event *event) return &emptyconstraint; } -static void intel_pmu_pebs_enable(struct perf_event *event) +void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; @@ -460,7 +445,7 @@ static void intel_pmu_pebs_enable(struct perf_event *event) intel_pmu_lbr_enable(event); } -static void intel_pmu_pebs_disable(struct perf_event *event) +void intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; @@ -475,7 +460,7 @@ static void intel_pmu_pebs_disable(struct perf_event *event) intel_pmu_lbr_disable(event); } -static void intel_pmu_pebs_enable_all(void) +void intel_pmu_pebs_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -483,7 +468,7 @@ static void intel_pmu_pebs_enable_all(void) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); } -static void intel_pmu_pebs_disable_all(void) +void intel_pmu_pebs_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -576,8 +561,6 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 0; } -static int intel_pmu_save_and_restart(struct perf_event *event); - static void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, void *__pebs) { @@ -716,7 +699,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * BTS, PEBS probe and setup */ -static void intel_ds_init(void) +void intel_ds_init(void) { /* * No support for 32bit formats @@ -749,15 +732,3 @@ static void intel_ds_init(void) } } } - -#else /* CONFIG_CPU_SUP_INTEL */ - -static void reserve_ds_buffers(void) -{ -} - -static void release_ds_buffers(void) -{ -} - -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d202c1b..3fab3de 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -1,4 +1,10 @@ -#ifdef CONFIG_CPU_SUP_INTEL +#include +#include + +#include +#include + +#include "perf_event.h" enum { LBR_FORMAT_32 = 0x00, @@ -48,7 +54,7 @@ static void intel_pmu_lbr_reset_64(void) } } -static void intel_pmu_lbr_reset(void) +void intel_pmu_lbr_reset(void) { if (!x86_pmu.lbr_nr) return; @@ -59,7 +65,7 @@ static void intel_pmu_lbr_reset(void) intel_pmu_lbr_reset_64(); } -static void intel_pmu_lbr_enable(struct perf_event *event) +void intel_pmu_lbr_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -81,7 +87,7 @@ static void intel_pmu_lbr_enable(struct perf_event *event) cpuc->lbr_users++; } -static void intel_pmu_lbr_disable(struct perf_event *event) +void intel_pmu_lbr_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -95,7 +101,7 @@ static void intel_pmu_lbr_disable(struct perf_event *event) __intel_pmu_lbr_disable(); } -static void intel_pmu_lbr_enable_all(void) +void intel_pmu_lbr_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -103,7 +109,7 @@ static void intel_pmu_lbr_enable_all(void) __intel_pmu_lbr_enable(); } -static void intel_pmu_lbr_disable_all(void) +void intel_pmu_lbr_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -178,7 +184,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.nr = i; } -static void intel_pmu_lbr_read(void) +void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -191,7 +197,7 @@ static void intel_pmu_lbr_read(void) intel_pmu_lbr_read_64(cpuc); } -static void intel_pmu_lbr_init_core(void) +void intel_pmu_lbr_init_core(void) { x86_pmu.lbr_nr = 4; x86_pmu.lbr_tos = 0x01c9; @@ -199,7 +205,7 @@ static void intel_pmu_lbr_init_core(void) x86_pmu.lbr_to = 0x60; } -static void intel_pmu_lbr_init_nhm(void) +void intel_pmu_lbr_init_nhm(void) { x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = 0x01c9; @@ -207,12 +213,10 @@ static void intel_pmu_lbr_init_nhm(void) x86_pmu.lbr_to = 0x6c0; } -static void intel_pmu_lbr_init_atom(void) +void intel_pmu_lbr_init_atom(void) { x86_pmu.lbr_nr = 8; x86_pmu.lbr_tos = 0x01c9; x86_pmu.lbr_from = 0x40; x86_pmu.lbr_to = 0x60; } - -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 7809d2b..492bf13 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -7,9 +7,13 @@ * For licencing details see kernel-base/COPYING */ -#ifdef CONFIG_CPU_SUP_INTEL +#include #include +#include +#include + +#include "perf_event.h" #define P4_CNTR_LIMIT 3 /* @@ -1303,7 +1307,7 @@ static __initconst const struct x86_pmu p4_pmu = { .perfctr_second_write = 1, }; -static __init int p4_pmu_init(void) +__init int p4_pmu_init(void) { unsigned int low, high; @@ -1326,5 +1330,3 @@ static __init int p4_pmu_init(void) return 0; } - -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 20c097e..c7181be 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -1,4 +1,7 @@ -#ifdef CONFIG_CPU_SUP_INTEL +#include +#include + +#include "perf_event.h" /* * Not sure about some of these @@ -114,7 +117,7 @@ static __initconst const struct x86_pmu p6_pmu = { .event_constraints = p6_event_constraints, }; -static __init int p6_pmu_init(void) +__init int p6_pmu_init(void) { switch (boot_cpu_data.x86_model) { case 1: @@ -138,5 +141,3 @@ static __init int p6_pmu_init(void) return 0; } - -#endif /* CONFIG_CPU_SUP_INTEL */ -- cgit v1.1 From 3e0996798a6a113efae9e0187c5581491bdb07a7 Mon Sep 17 00:00:00 2001 From: Yu Ke Date: Wed, 24 Mar 2010 11:01:13 -0700 Subject: xen/acpi: Domain0 acpi parser related platform hypercall This patches implements the xen_platform_op hypercall, to pass the parsed ACPI info to hypervisor. Signed-off-by: Yu Ke Signed-off-by: Tian Kevin Signed-off-by: Jeremy Fitzhardinge [v1: Added DEFINE_GUEST.. in appropiate headers] [v2: Ripped out typedefs] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/interface.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 5d4922a..a1f2db5 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -55,6 +55,7 @@ DEFINE_GUEST_HANDLE(char); DEFINE_GUEST_HANDLE(int); DEFINE_GUEST_HANDLE(long); DEFINE_GUEST_HANDLE(void); +DEFINE_GUEST_HANDLE(uint64_t); #endif #ifndef HYPERVISOR_VIRT_START -- cgit v1.1 From eec07a9ecf428a680d72dc7f2428ea8bed80b84d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 23 Sep 2011 11:44:17 -0700 Subject: xen: add dom0_op hypercall Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/xen/hypercall.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index d240ea9..0c9894e 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -45,6 +45,7 @@ #include #include #include +#include /* * The hypercall asms have to meet several constraints: @@ -299,6 +300,13 @@ HYPERVISOR_set_timer_op(u64 timeout) } static inline int +HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) +{ + platform_op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, dom0_op, platform_op); +} + +static inline int HYPERVISOR_set_debugreg(int reg, unsigned long value) { return _hypercall2(int, set_debugreg, reg, value); -- cgit v1.1 From fdb9eb9f155bfc0f8dc2fc88f90448b30c78ad97 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 26 Mar 2010 11:21:22 -0700 Subject: xen/dom0: set wallclock time in Xen Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/time.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 5158c50..8c9cdfa 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -200,8 +200,22 @@ static unsigned long xen_get_wallclock(void) static int xen_set_wallclock(unsigned long now) { + struct xen_platform_op op; + int rc; + /* do nothing for domU */ - return -1; + if (!xen_initial_domain()) + return -1; + + op.cmd = XENPF_settime; + op.u.settime.secs = now; + op.u.settime.nsecs = 0; + op.u.settime.system_time = xen_clocksource_read(); + + rc = HYPERVISOR_dom0_op(&op); + WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now); + + return rc; } static struct clocksource xen_clocksource __read_mostly = { -- cgit v1.1 From 395cf9691d72173d8cdaa613c5f0255f993af94b Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Mon, 15 Aug 2011 02:02:26 +0200 Subject: doc: fix broken references There are numerous broken references to Documentation files (in other Documentation files, in comments, etc.). These broken references are caused by typo's in the references, and by renames or removals of the Documentation files. Some broken references are simply odd. Fix these broken references, sometimes by dropping the irrelevant text they were part of. Signed-off-by: Paul Bolle Signed-off-by: Jiri Kosina --- arch/x86/Kconfig | 2 +- arch/x86/Kconfig.debug | 2 +- arch/x86/boot/header.S | 2 +- arch/x86/include/asm/dma-mapping.h | 2 +- arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/apm_32.c | 2 -- arch/x86/kernel/pci-dma.c | 4 ++-- 7 files changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a47bb2..9a4a267 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -279,7 +279,7 @@ config SMP Y to "Enhanced Real Time Clock Support", below. The "Advanced Power Management" code will be disabled if you say Y here. - See also , + See also , and the SMP-HOWTO available at . diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index c0f8a5c..bf56e17 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -139,7 +139,7 @@ config IOMMU_DEBUG code. When you use it make sure you have a big enough IOMMU/AGP aperture. Most of the options enabled by this can be set more finegrained using the iommu= command line - options. See Documentation/x86_64/boot-options.txt for more + options. See Documentation/x86/x86_64/boot-options.txt for more details. config IOMMU_STRESS diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 93e689f4..bdb4d45 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -129,7 +129,7 @@ start_sys_seg: .word SYSSEG # obsolete and meaningless, but just type_of_loader: .byte 0 # 0 means ancient bootloader, newer # bootloaders know to change this. - # See Documentation/i386/boot.txt for + # See Documentation/x86/boot.txt for # assigned ids # flags, unused bits must be zero (RFU) bit within loadflags diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index d4c419f..ed3065f 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -2,7 +2,7 @@ #define _ASM_X86_DMA_MAPPING_H /* - * IOMMU interface. See Documentation/PCI/PCI-DMA-mapping.txt and + * IOMMU interface. See Documentation/DMA-API-HOWTO.txt and * Documentation/DMA-API.txt for documentation. */ diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 8a439d3..b1e7c7f 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -5,7 +5,7 @@ * This allows to use PCI devices that only support 32bit addresses on systems * with more than 4GB. * - * See Documentation/PCI/PCI-DMA-mapping.txt for the interface specification. + * See Documentation/DMA-API-HOWTO.txt for the interface specification. * * Copyright 2002 Andi Kleen, SuSE Labs. * Subject to the GNU General Public License v2 only. diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 0371c48..a46bd38 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -249,8 +249,6 @@ extern int (*console_blank_hook)(int); #define APM_MINOR_DEV 134 /* - * See Documentation/Config.help for the configuration options. - * * Various options can be changed at boot time as follows: * (We allow underscores for compatibility with the modules code) * apm=on/off enable/disable APM diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index b49d00d..6228720 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -117,8 +117,8 @@ again: } /* - * See for the iommu kernel parameter - * documentation. + * See for the iommu kernel + * parameter documentation. */ static __init int iommu_setup(char *p) { -- cgit v1.1 From d6eed550a9d08fbd1fce32e517e8c49afa6edbf7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 27 Sep 2011 10:00:40 -0700 Subject: x86: Perf_event_amd.c needs Fix (rare) build error by adding header file: arch/x86/kernel/cpu/perf_event_amd.c:350:2: error: 'BAD_APICID' undeclared (first use in this function) Signed-off-by: Randy Dunlap Cc: Robert Richter Cc: Andre Przywara Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/4E820138.90301@xenotime.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index ed334c8..384450d 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "perf_event.h" -- cgit v1.1 From 4a7f340c6a75ec5fca23d9c80a59f3f28cc4a61e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 8 Sep 2011 17:50:12 -0700 Subject: x86, ticketlock: remove obsolete comment The note about partial registers is not really relevent now that we rely on gcc to generate all the assembler. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/spinlock.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index f5695ee..972c260 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -49,10 +49,6 @@ * issues and should be optimal for the uncontended case. Note the tail must be * in the high part, because a wide xadd increment of the low part would carry * up and contaminate the high part. - * - * With fewer than 2^8 possible CPUs, we can use x86's partial registers to - * save some instructions and make the code more elegant. There really isn't - * much between them in performance though, especially as locks are out of line. */ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { -- cgit v1.1 From 910b2c5122ab787179a790ca1dec616fc80f0173 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 28 Sep 2011 17:42:14 +1000 Subject: x86, amd: Include linux/elf.h since we use stuff from asm/elf.h After merging the moduleh tree, today's linux-next build (x86_64 allmodconfig) failed like this: arch/x86/kernel/sys_x86_64.c:28:10: warning: 'enum align_flags' declared inside parameter list arch/x86/kernel/sys_x86_64.c:28:10: warning: its scope is only this definition or declaration, which is probably not what you want arch/x86/kernel/sys_x86_64.c:28:22: error: parameter 3 ('flags') has incomplete type [...] Presumably caused by the module.h split interacting with a new commit dfb09f9b7ab0 ("x86, amd: Avoid cache aliasing penalties on AMD family 15h") from the x8 tree. Signed-off-by: Stephen Rothwell Acked-by: Borislav Petkov Cc: Peter Zijlstra Cc: Paul Gortmaker Link: http://lkml.kernel.org/r/20110928174214.17a58be15d84d67c185930e1@canb.auug.org.au Signed-off-by: Ingo Molnar --- arch/x86/kernel/sys_x86_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index fe7d2da..0514890 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include -- cgit v1.1 From 838312be46f3abfbdc175f81c3e54a857994476d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 28 Sep 2011 16:44:54 +0100 Subject: apic, i386/bigsmp: Fix false warnings regarding logical APIC ID mismatches These warnings (generally one per CPU) are a result of initializing x86_cpu_to_logical_apicid while apic_default is still in use, but the check in setup_local_APIC() being done when apic_bigsmp was already used as an override in default_setup_apic_routing(): Overriding APIC driver with bigsmp Enabling APIC mode: Physflat. Using 5 I/O APICs ------------[ cut here ]------------ WARNING: at .../arch/x86/kernel/apic/apic.c:1239 ... CPU 1 irqstacks, hard=f1c9a000 soft=f1c9c000 Booting Node 0, Processors #1 smpboot cpu 1: start_ip = 9e000 Initializing CPU#1 ------------[ cut here ]------------ WARNING: at .../arch/x86/kernel/apic/apic.c:1239 setup_local_APIC+0x137/0x46b() Hardware name: ... CPU1 logical APIC ID: 2 != 8 ... Fix this (for the time being, i.e. until x86_32_early_logical_apicid() will get removed again, as Tejun says ought to be possible) by overriding the previously stored values at the point where the APIC driver gets overridden. v2: Move this and the pre-existing override logic into arch/x86/kernel/apic/bigsmp_32.c. Signed-off-by: Jan Beulich Acked-by: Tejun Heo Cc: (2.6.39 and onwards) Link: http://lkml.kernel.org/r/4E835D16020000780005844C@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 2 +- arch/x86/kernel/apic/bigsmp_32.c | 20 ++++++++++++++++---- arch/x86/kernel/apic/probe_32.c | 10 ++-------- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 7b3ca83..9b7273c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -495,7 +495,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert) return; } -extern struct apic *generic_bigsmp_probe(void); +extern void generic_bigsmp_probe(void); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index efd737e..521bead 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -255,12 +255,24 @@ static struct apic apic_bigsmp = { .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, }; -struct apic * __init generic_bigsmp_probe(void) +void __init generic_bigsmp_probe(void) { - if (probe_bigsmp()) - return &apic_bigsmp; + unsigned int cpu; - return NULL; + if (!probe_bigsmp()) + return; + + apic = &apic_bigsmp; + + for_each_possible_cpu(cpu) { + if (early_per_cpu(x86_cpu_to_logical_apicid, + cpu) == BAD_APICID) + continue; + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + bigsmp_early_logical_apicid(cpu); + } + + pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name); } apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index b5254ad..0787bb3 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -200,14 +200,8 @@ void __init default_setup_apic_routing(void) * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support */ - if (!cmdline_apic && apic == &apic_default) { - struct apic *bigsmp = generic_bigsmp_probe(); - if (bigsmp) { - apic = bigsmp; - printk(KERN_INFO "Overriding APIC driver with %s\n", - apic->name); - } - } + if (!cmdline_apic && apic == &apic_default) + generic_bigsmp_probe(); #endif if (apic->setup_apic_routing) -- cgit v1.1 From e05139f2569ecf699b229a6473a86cdffed62956 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 28 Sep 2011 16:56:51 +0100 Subject: x86-64: Don't apply destructive erratum workaround on unaffected CPUs Erratum 93 applies to AMD K8 CPUs only, and its workaround (forcing the upper 32 bits of %rip to all get set under certain conditions) is actually getting in the way of analyzing page faults occurring during EFI physical mode runtime calls (in particular the page table walk shown is completely unrelated to the actual fault). This is because typically EFI runtime code lives in the space between 2G and 4G, which - modulo the above manipulation - is likely to overlap with the kernel or modules area. While even for the other errata workarounds their taking effect could be limited to just the affected CPUs, none of them appears to be destructive, and they're generally getting called only outside of performance critical paths, so they're being left untouched. Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4E835FE30200007800058464@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 0d17c8c..9c7378d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -420,12 +420,14 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) return 0; } +#ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" "******* Working around it, but it may cause SEGVs or burn power.\n" "******* Please consider a BIOS update.\n" "******* Disabling USB legacy in the BIOS may also help.\n"; +#endif /* * No vm86 mode in 64-bit mode: @@ -505,7 +507,11 @@ bad: */ static int is_errata93(struct pt_regs *regs, unsigned long address) { -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD + || boot_cpu_data.x86 != 0xf) + return 0; + if (address != regs->ip) return 0; -- cgit v1.1 From eab9e6137f237681a04649e786cc4d942bedd6d1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 28 Sep 2011 16:57:52 +0100 Subject: x86-64: Fix CFI data for interrupt frames The patch titled "x86: Don't use frame pointer to save old stack on irq entry" did not properly adjust CFI directives, so this patch is a follow-up to that one. With the old stack pointer no longer stored in a callee-saved register (plus some offset), we now have to use a CFA expression to describe the memory location where it is being found. This requires the use of .cfi_escape (allowing arbitrary byte streams to be emitted into .eh_frame), as there is no .cfi_def_cfa_expression (which also cannot reasonably be expected, as it would require a full expression parser). Signed-off-by: Jan Beulich Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/4E8360200200007800058467@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dwarf2.h | 2 ++ arch/x86/kernel/entry_64.S | 14 +++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 3260991..f6f1598 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -27,6 +27,7 @@ #define CFI_REMEMBER_STATE .cfi_remember_state #define CFI_RESTORE_STATE .cfi_restore_state #define CFI_UNDEFINED .cfi_undefined +#define CFI_ESCAPE .cfi_escape #ifdef CONFIG_AS_CFI_SIGNAL_FRAME #define CFI_SIGNAL_FRAME .cfi_signal_frame @@ -68,6 +69,7 @@ #define CFI_REMEMBER_STATE cfi_ignore #define CFI_RESTORE_STATE cfi_ignore #define CFI_UNDEFINED cfi_ignore +#define CFI_ESCAPE cfi_ignore #define CFI_SIGNAL_FRAME cfi_ignore #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6419bb0..faf8d5e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -331,10 +331,15 @@ ENDPROC(native_usergs_sysret64) 1: incl PER_CPU_VAR(irq_count) jne 2f mov PER_CPU_VAR(irq_stack_ptr),%rsp - EMPTY_FRAME 0 + CFI_DEF_CFA_REGISTER rsi 2: /* Store previous stack value */ pushq %rsi + CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ + 0x77 /* DW_OP_breg7 */, 0, \ + 0x06 /* DW_OP_deref */, \ + 0x08 /* DW_OP_const1u */, SS+8-RBP, \ + 0x22 /* DW_OP_plus */ /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF .endm @@ -788,7 +793,6 @@ END(interrupt) subq $ORIG_RAX-RBP, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP SAVE_ARGS_IRQ - PARTIAL_FRAME 0 call \func .endm @@ -813,10 +817,10 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi - leaq 16(%rsi), %rsp - + CFI_DEF_CFA_REGISTER rsi + leaq ARGOFFSET-RBP(%rsi), %rsp CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -16 + CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET exit_intr: GET_THREAD_INFO(%rcx) -- cgit v1.1 From 0930bba674e248b921ea659b036ff02564e5a5f4 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 29 Sep 2011 11:57:56 +0100 Subject: xen: modify kernel mappings corresponding to granted pages If we want to use granted pages for AIO, changing the mappings of a user vma and the corresponding p2m is not enough, we also need to update the kernel mappings accordingly. Currently this is only needed for pages that are created for user usages through /dev/xen/gntdev. As in, pages that have been in use by the kernel and use the P2M will not need this special mapping. However there are no guarantees that in the future the kernel won't start accessing pages through the 1:1 even for internal usage. In order to avoid the complexity of dealing with highmem, we allocated the pages lowmem. We issue a HYPERVISOR_grant_table_op right away in m2p_add_override and we remove the mappings using another HYPERVISOR_grant_table_op in m2p_remove_override. Considering that m2p_add_override and m2p_remove_override are called once per page we use multicalls and hypercall batching. Use the kmap_op pointer directly as argument to do the mapping as it is guaranteed to be present up until the unmapping is done. Before issuing any unmapping multicalls, we need to make sure that the mapping has already being done, because we need the kmap->handle to be set correctly. Signed-off-by: Stefano Stabellini [v1: Removed GRANT_FRAME_BIT usage] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 3 +- arch/x86/xen/p2m.c | 76 +++++++++++++++++++++++++++++++++++------ 2 files changed, 68 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index bc12c12..3138d33 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -12,6 +12,7 @@ #include #include +#include #include /* Xen machine address */ @@ -48,7 +49,7 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); extern int m2p_add_override(unsigned long mfn, struct page *page, - bool clear_pte); + struct gnttab_map_grant_ref *kmap_op); extern int m2p_remove_override(struct page *page, bool clear_pte); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 6e56b65..a8ee9a4 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -161,7 +161,9 @@ #include #include #include +#include +#include "multicalls.h" #include "xen-ops.h" static void __init m2p_override_init(void); @@ -676,7 +678,8 @@ static unsigned long mfn_hash(unsigned long mfn) } /* Add an MFN override for a particular page */ -int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) +int m2p_add_override(unsigned long mfn, struct page *page, + struct gnttab_map_grant_ref *kmap_op) { unsigned long flags; unsigned long pfn; @@ -700,9 +703,20 @@ int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) return -ENOMEM; - if (clear_pte && !PageHighMem(page)) - /* Just zap old mapping for now */ - pte_clear(&init_mm, address, ptep); + if (kmap_op != NULL) { + if (!PageHighMem(page)) { + struct multicall_space mcs = + xen_mc_entry(sizeof(*kmap_op)); + + MULTI_grant_table_op(mcs.mc, + GNTTABOP_map_grant_ref, kmap_op, 1); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + } + /* let's use dev_bus_addr to record the old mfn instead */ + kmap_op->dev_bus_addr = page->index; + page->index = (unsigned long) kmap_op; + } spin_lock_irqsave(&m2p_override_lock, flags); list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); @@ -736,14 +750,56 @@ int m2p_remove_override(struct page *page, bool clear_pte) spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); - set_phys_to_machine(pfn, page->index); WARN_ON(!PagePrivate(page)); ClearPagePrivate(page); - if (clear_pte && !PageHighMem(page)) - set_pte_at(&init_mm, address, ptep, - pfn_pte(pfn, PAGE_KERNEL)); - /* No tlb flush necessary because the caller already - * left the pte unmapped. */ + + if (clear_pte) { + struct gnttab_map_grant_ref *map_op = + (struct gnttab_map_grant_ref *) page->index; + set_phys_to_machine(pfn, map_op->dev_bus_addr); + if (!PageHighMem(page)) { + struct multicall_space mcs; + struct gnttab_unmap_grant_ref *unmap_op; + + /* + * It might be that we queued all the m2p grant table + * hypercalls in a multicall, then m2p_remove_override + * get called before the multicall has actually been + * issued. In this case handle is going to -1 because + * it hasn't been modified yet. + */ + if (map_op->handle == -1) + xen_mc_flush(); + /* + * Now if map_op->handle is negative it means that the + * hypercall actually returned an error. + */ + if (map_op->handle == GNTST_general_error) { + printk(KERN_WARNING "m2p_remove_override: " + "pfn %lx mfn %lx, failed to modify kernel mappings", + pfn, mfn); + return -1; + } + + mcs = xen_mc_entry( + sizeof(struct gnttab_unmap_grant_ref)); + unmap_op = mcs.args; + unmap_op->host_addr = map_op->host_addr; + unmap_op->handle = map_op->handle; + unmap_op->dev_bus_addr = 0; + + MULTI_grant_table_op(mcs.mc, + GNTTABOP_unmap_grant_ref, unmap_op, 1); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + set_pte_at(&init_mm, address, ptep, + pfn_pte(pfn, PAGE_KERNEL)); + __flush_tlb_single(address); + map_op->host_addr = 0; + } + } else + set_phys_to_machine(pfn, page->index); return 0; } -- cgit v1.1 From b17d0b5c0824b6a6f143a6587fa7d47abe006ab4 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 29 Sep 2011 12:05:57 +0100 Subject: xen: XEN_PVHVM depends on PCI Xen PV on HVM guests require PCI support because they need the xen-platform-pci driver in order to initialize xenbus. Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 5cc821c..e061f55 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -25,8 +25,7 @@ config XEN_PRIVILEGED_GUEST config XEN_PVHVM def_bool y - depends on XEN - depends on X86_LOCAL_APIC + depends on XEN && PCI && X86_LOCAL_APIC config XEN_MAX_DOMAIN_MEMORY int -- cgit v1.1 From aa24411b6717fd1e6ecef281bec497f6f30bbd66 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 28 Sep 2011 17:46:32 +0100 Subject: xen/balloon: account for pages released during memory setup In xen_memory_setup() pages that occur in gaps in the memory map are released back to Xen. This reduces the domain's current page count in the hypervisor. The Xen balloon driver does not correctly decrease its initial current_pages count to reflect this. If 'delta' pages are released and the target is adjusted the resulting reservation is always 'delta' less than the requested target. This affects dom0 if the initial allocation of pages overlaps the PCI memory region but won't affect most domU guests that have been setup with pseudo-physical memory maps that don't have gaps. Fix this by accouting for the released pages when starting the balloon driver. If the domain's targets are managed by xapi, the domain may eventually run out of memory and die because xapi currently gets its target calculations wrong and whenever it is restarted it always reduces the target by 'delta'. Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 46d6d21..c983717 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -39,6 +39,9 @@ extern void xen_syscall32_target(void); /* Amount of extra memory space we add to the e820 ranges */ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; +/* Number of pages released from the initial allocation. */ +unsigned long xen_released_pages; + /* * The maximum amount of extra memory compared to the base size. The * main scaling factor is the size of struct page. At extreme ratios @@ -313,7 +316,9 @@ char * __init xen_memory_setup(void) extra_pages = 0; } - extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); + xen_released_pages = xen_return_unused_memory(xen_start_info->nr_pages, + &e820); + extra_pages += xen_released_pages; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO -- cgit v1.1 From 8b5d44a5ac93cd7a1b044db3ff0ba4955b4ba5ec Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 28 Sep 2011 17:46:34 +0100 Subject: xen: allow balloon driver to use more than one memory region Allow the xen balloon driver to populate its list of extra pages from more than one region of memory. This will allow platforms to provide (for example) a region of low memory and a region of high memory. The maximum possible number of extra regions is 128 (== E820MAX) which is quite large so xen_extra_mem is placed in __initdata. This is safe as both xen_memory_setup() and balloon_init() are in __init. The balloon regions themselves are not altered (i.e., there is still only the one region). Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index c983717..0c8e974 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -37,7 +37,7 @@ extern void xen_syscall_target(void); extern void xen_syscall32_target(void); /* Amount of extra memory space we add to the e820 ranges */ -phys_addr_t xen_extra_mem_start, xen_extra_mem_size; +struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; @@ -59,7 +59,7 @@ static void __init xen_add_extra_mem(unsigned long pages) unsigned long pfn; u64 size = (u64)pages * PAGE_SIZE; - u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; + u64 extra_start = xen_extra_mem[0].start + xen_extra_mem[0].size; if (!pages) return; @@ -69,7 +69,7 @@ static void __init xen_add_extra_mem(unsigned long pages) memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA"); - xen_extra_mem_size += size; + xen_extra_mem[0].size += size; xen_max_p2m_pfn = PFN_DOWN(extra_start + size); @@ -242,7 +242,7 @@ char * __init xen_memory_setup(void) memcpy(map_raw, map, sizeof(map)); e820.nr_map = 0; - xen_extra_mem_start = mem_end; + xen_extra_mem[0].start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end; @@ -270,8 +270,8 @@ char * __init xen_memory_setup(void) e820_add_region(end, delta, E820_UNUSABLE); } - if (map[i].size > 0 && end > xen_extra_mem_start) - xen_extra_mem_start = end; + if (map[i].size > 0 && end > xen_extra_mem[0].start) + xen_extra_mem[0].start = end; /* Add region if any remains */ if (map[i].size > 0) @@ -279,10 +279,10 @@ char * __init xen_memory_setup(void) } /* Align the balloon area so that max_low_pfn does not get set * to be at the _end_ of the PCI gap at the far end (fee01000). - * Note that xen_extra_mem_start gets set in the loop above to be - * past the last E820 region. */ - if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32))) - xen_extra_mem_start = (1ULL<<32); + * Note that the start of balloon area gets set in the loop above + * to be past the last E820 region. */ + if (xen_initial_domain() && (xen_extra_mem[0].start < (1ULL<<32))) + xen_extra_mem[0].start = (1ULL<<32); /* * In domU, the ISA region is normal, usable memory, but we -- cgit v1.1 From dc91c728fddc29dfed1ae96f6807216b5f42d3a1 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 29 Sep 2011 12:26:19 +0100 Subject: xen: allow extra memory to be in multiple regions Allow the extra memory (used by the balloon driver) to be in multiple regions (typically two regions, one for low memory and one for high memory). This allows the balloon driver to increase the number of available low pages (if the initial number if pages is small). As a side effect, the algorithm for building the e820 memory map is simpler and more obviously correct as the map supplied by the hypervisor is (almost) used as is (in particular, all reserved regions and gaps are preserved). Only RAM regions are altered and RAM regions above max_pfn + extra_pages are marked as unused (the region is split in two if necessary). Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 182 ++++++++++++++++++++++++--------------------------- 1 file changed, 86 insertions(+), 96 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 0c8e974..2ad2fd5 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -54,26 +54,32 @@ unsigned long xen_released_pages; */ #define EXTRA_MEM_RATIO (10) -static void __init xen_add_extra_mem(unsigned long pages) +static void __init xen_add_extra_mem(u64 start, u64 size) { unsigned long pfn; + int i; - u64 size = (u64)pages * PAGE_SIZE; - u64 extra_start = xen_extra_mem[0].start + xen_extra_mem[0].size; - - if (!pages) - return; - - e820_add_region(extra_start, size, E820_RAM); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - - memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA"); + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + /* Add new region. */ + if (xen_extra_mem[i].size == 0) { + xen_extra_mem[i].start = start; + xen_extra_mem[i].size = size; + break; + } + /* Append to existing region. */ + if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { + xen_extra_mem[i].size += size; + break; + } + } + if (i == XEN_EXTRA_MEM_MAX_REGIONS) + printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - xen_extra_mem[0].size += size; + memblock_x86_reserve_range(start, start + size, "XEN EXTRA"); - xen_max_p2m_pfn = PFN_DOWN(extra_start + size); + xen_max_p2m_pfn = PFN_DOWN(start + size); - for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++) + for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } @@ -120,8 +126,8 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr, return len; } -static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, - const struct e820map *e820) +static unsigned long __init xen_return_unused_memory( + unsigned long max_pfn, const struct e820entry *map, int nr_map) { phys_addr_t max_addr = PFN_PHYS(max_pfn); phys_addr_t last_end = ISA_END_ADDRESS; @@ -129,13 +135,13 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, int i; /* Free any unused memory above the low 1Mbyte. */ - for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { - phys_addr_t end = e820->map[i].addr; + for (i = 0; i < nr_map && last_end < max_addr; i++) { + phys_addr_t end = map[i].addr; end = min(max_addr, end); if (last_end < end) released += xen_release_chunk(last_end, end); - last_end = max(last_end, e820->map[i].addr + e820->map[i].size); + last_end = max(last_end, map[i].addr + map[i].size); } if (last_end < max_addr) @@ -200,20 +206,32 @@ static unsigned long __init xen_get_max_pages(void) return min(max_pages, MAX_DOMAIN_PAGES); } +static void xen_align_and_add_e820_region(u64 start, u64 size, int type) +{ + u64 end = start + size; + + /* Align RAM regions to page boundaries. */ + if (type == E820_RAM) { + start = PAGE_ALIGN(start); + end &= ~((u64)PAGE_SIZE - 1); + } + + e820_add_region(start, end - start, type); +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ char * __init xen_memory_setup(void) { static struct e820entry map[E820MAX] __initdata; - static struct e820entry map_raw[E820MAX] __initdata; unsigned long max_pfn = xen_start_info->nr_pages; unsigned long long mem_end; int rc; struct xen_memory_map memmap; + unsigned long max_pages; unsigned long extra_pages = 0; - unsigned long extra_limit; unsigned long identity_pages = 0; int i; int op; @@ -240,49 +258,55 @@ char * __init xen_memory_setup(void) } BUG_ON(rc); - memcpy(map_raw, map, sizeof(map)); - e820.nr_map = 0; - xen_extra_mem[0].start = mem_end; - for (i = 0; i < memmap.nr_entries; i++) { - unsigned long long end; - - /* Guard against non-page aligned E820 entries. */ - if (map[i].type == E820_RAM) - map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE; - - end = map[i].addr + map[i].size; - if (map[i].type == E820_RAM && end > mem_end) { - /* RAM off the end - may be partially included */ - u64 delta = min(map[i].size, end - mem_end); - - map[i].size -= delta; - end -= delta; - - extra_pages += PFN_DOWN(delta); - /* - * Set RAM below 4GB that is not for us to be unusable. - * This prevents "System RAM" address space from being - * used as potential resource for I/O address (happens - * when 'allocate_resource' is called). - */ - if (delta && - (xen_initial_domain() && end < 0x100000000ULL)) - e820_add_region(end, delta, E820_UNUSABLE); + /* Make sure the Xen-supplied memory map is well-ordered. */ + sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); + + max_pages = xen_get_max_pages(); + if (max_pages > max_pfn) + extra_pages += max_pages - max_pfn; + + xen_released_pages = xen_return_unused_memory(max_pfn, map, + memmap.nr_entries); + extra_pages += xen_released_pages; + + /* + * Clamp the amount of extra memory to a EXTRA_MEM_RATIO + * factor the base size. On non-highmem systems, the base + * size is the full initial memory allocation; on highmem it + * is limited to the max size of lowmem, so that it doesn't + * get completely filled. + * + * In principle there could be a problem in lowmem systems if + * the initial memory is also very large with respect to + * lowmem, but we won't try to deal with that here. + */ + extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages); + + i = 0; + while (i < memmap.nr_entries) { + u64 addr = map[i].addr; + u64 size = map[i].size; + u32 type = map[i].type; + + if (type == E820_RAM) { + if (addr < mem_end) { + size = min(size, mem_end - addr); + } else if (extra_pages) { + size = min(size, (u64)extra_pages * PAGE_SIZE); + extra_pages -= size / PAGE_SIZE; + xen_add_extra_mem(addr, size); + } else + type = E820_UNUSABLE; } - if (map[i].size > 0 && end > xen_extra_mem[0].start) - xen_extra_mem[0].start = end; + xen_align_and_add_e820_region(addr, size, type); - /* Add region if any remains */ - if (map[i].size > 0) - e820_add_region(map[i].addr, map[i].size, map[i].type); + map[i].addr += size; + map[i].size -= size; + if (map[i].size == 0) + i++; } - /* Align the balloon area so that max_low_pfn does not get set - * to be at the _end_ of the PCI gap at the far end (fee01000). - * Note that the start of balloon area gets set in the loop above - * to be past the last E820 region. */ - if (xen_initial_domain() && (xen_extra_mem[0].start < (1ULL<<32))) - xen_extra_mem[0].start = (1ULL<<32); /* * In domU, the ISA region is normal, usable memory, but we @@ -308,45 +332,11 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - extra_limit = xen_get_max_pages(); - if (max_pfn + extra_pages > extra_limit) { - if (extra_limit > max_pfn) - extra_pages = extra_limit - max_pfn; - else - extra_pages = 0; - } - - xen_released_pages = xen_return_unused_memory(xen_start_info->nr_pages, - &e820); - extra_pages += xen_released_pages; - - /* - * Clamp the amount of extra memory to a EXTRA_MEM_RATIO - * factor the base size. On non-highmem systems, the base - * size is the full initial memory allocation; on highmem it - * is limited to the max size of lowmem, so that it doesn't - * get completely filled. - * - * In principle there could be a problem in lowmem systems if - * the initial memory is also very large with respect to - * lowmem, but we won't try to deal with that here. - */ - extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), - max_pfn + extra_pages); - - if (extra_limit >= max_pfn) - extra_pages = extra_limit - max_pfn; - else - extra_pages = 0; - - xen_add_extra_mem(extra_pages); - /* * Set P2M for all non-RAM pages and E820 gaps to be identity - * type PFNs. We supply it with the non-sanitized version - * of the E820. + * type PFNs. */ - identity_pages = xen_set_identity(map_raw, memmap.nr_entries); + identity_pages = xen_set_identity(e820.map, e820.nr_map); printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages); return "Xen"; } -- cgit v1.1 From f3f436e33b925ead21e3f9b47b1e2aed965511d9 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Wed, 28 Sep 2011 17:46:36 +0100 Subject: xen: release all pages within 1-1 p2m mappings In xen_memory_setup() all reserved regions and gaps are set to an identity (1-1) p2m mapping. If an available page has a PFN within one of these 1-1 mappings it will become inaccessible (as it MFN is lost) so release them before setting up the mapping. This can make an additional 256 MiB or more of RAM available (depending on the size of the reserved regions in the memory map) if the initial pages overlap with reserved regions. The 1:1 p2m mappings are also extended to cover partial pages. This fixes an issue with (for example) systems with a BIOS that puts the DMI tables in a reserved region that begins on a non-page boundary. Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 117 ++++++++++++++++++--------------------------------- 1 file changed, 42 insertions(+), 75 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2ad2fd5..38d0af4 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -83,25 +83,18 @@ static void __init xen_add_extra_mem(u64 start, u64 size) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } -static unsigned long __init xen_release_chunk(phys_addr_t start_addr, - phys_addr_t end_addr) +static unsigned long __init xen_release_chunk(unsigned long start, + unsigned long end) { struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; - unsigned long start, end; unsigned long len = 0; unsigned long pfn; int ret; - start = PFN_UP(start_addr); - end = PFN_DOWN(end_addr); - - if (end <= start) - return 0; - for(pfn = start; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -126,72 +119,52 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr, return len; } -static unsigned long __init xen_return_unused_memory( - unsigned long max_pfn, const struct e820entry *map, int nr_map) +static unsigned long __init xen_set_identity_and_release( + const struct e820entry *list, size_t map_size, unsigned long nr_pages) { - phys_addr_t max_addr = PFN_PHYS(max_pfn); - phys_addr_t last_end = ISA_END_ADDRESS; + phys_addr_t start = 0; unsigned long released = 0; - int i; - - /* Free any unused memory above the low 1Mbyte. */ - for (i = 0; i < nr_map && last_end < max_addr; i++) { - phys_addr_t end = map[i].addr; - end = min(max_addr, end); - - if (last_end < end) - released += xen_release_chunk(last_end, end); - last_end = max(last_end, map[i].addr + map[i].size); - } - - if (last_end < max_addr) - released += xen_release_chunk(last_end, max_addr); - - printk(KERN_INFO "released %lu pages of unused memory\n", released); - return released; -} - -static unsigned long __init xen_set_identity(const struct e820entry *list, - ssize_t map_size) -{ - phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS; - phys_addr_t start_pci = last; - const struct e820entry *entry; unsigned long identity = 0; + const struct e820entry *entry; int i; + /* + * Combine non-RAM regions and gaps until a RAM region (or the + * end of the map) is reached, then set the 1:1 map and + * release the pages (if available) in those non-RAM regions. + * + * The combined non-RAM regions are rounded to a whole number + * of pages so any partial pages are accessible via the 1:1 + * mapping. This is needed for some BIOSes that put (for + * example) the DMI tables in a reserved region that begins on + * a non-page boundary. + */ for (i = 0, entry = list; i < map_size; i++, entry++) { - phys_addr_t start = entry->addr; - phys_addr_t end = start + entry->size; + phys_addr_t end = entry->addr + entry->size; - if (start < last) - start = last; + if (entry->type == E820_RAM || i == map_size - 1) { + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); - if (end <= start) - continue; + if (entry->type == E820_RAM) + end_pfn = PFN_UP(entry->addr); - /* Skip over the 1MB region. */ - if (last > end) - continue; + if (start_pfn < end_pfn) { + if (start_pfn < nr_pages) + released += xen_release_chunk( + start_pfn, min(end_pfn, nr_pages)); - if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) { - if (start > start_pci) identity += set_phys_range_identity( - PFN_UP(start_pci), PFN_DOWN(start)); - - /* Without saving 'last' we would gooble RAM too - * at the end of the loop. */ - last = end; - start_pci = end; - continue; + start_pfn, end_pfn); + } + start = end; } - start_pci = min(start, start_pci); - last = end; } - if (last > start_pci) - identity += set_phys_range_identity( - PFN_UP(start_pci), PFN_DOWN(last)); - return identity; + + printk(KERN_INFO "Released %lu pages of unused memory\n", released); + printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); + + return released; } static unsigned long __init xen_get_max_pages(void) @@ -232,7 +205,6 @@ char * __init xen_memory_setup(void) struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; - unsigned long identity_pages = 0; int i; int op; @@ -265,8 +237,13 @@ char * __init xen_memory_setup(void) if (max_pages > max_pfn) extra_pages += max_pages - max_pfn; - xen_released_pages = xen_return_unused_memory(max_pfn, map, - memmap.nr_entries); + /* + * Set P2M for all non-RAM pages and E820 gaps to be identity + * type PFNs. Any RAM pages that would be made inaccesible by + * this are first released. + */ + xen_released_pages = xen_set_identity_and_release( + map, memmap.nr_entries, max_pfn); extra_pages += xen_released_pages; /* @@ -312,10 +289,6 @@ char * __init xen_memory_setup(void) * In domU, the ISA region is normal, usable memory, but we * reserve ISA memory anyway because too many things poke * about in there. - * - * In Dom0, the host E820 information can leave gaps in the - * ISA range, which would cause us to release those pages. To - * avoid this, we unconditionally reserve them here. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); @@ -332,12 +305,6 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - /* - * Set P2M for all non-RAM pages and E820 gaps to be identity - * type PFNs. - */ - identity_pages = xen_set_identity(e820.map, e820.nr_map); - printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages); return "Xen"; } -- cgit v1.1 From 4dcaebbf6586d299be8513512a1253f177b803d7 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 29 Sep 2011 16:53:29 +0100 Subject: xen: use generic functions instead of xen_{alloc, free}_vm_area() Replace calls to the Xen-specific xen_alloc_vm_area() and xen_free_vm_area() functions with the generic equivalent (alloc_vm_area() and free_vm_area()). On x86, these were identical already. Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/grant_table.h | 7 ------- arch/x86/xen/grant-table.c | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 arch/x86/include/asm/xen/grant_table.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/grant_table.h b/arch/x86/include/asm/xen/grant_table.h deleted file mode 100644 index fdbbb45..0000000 --- a/arch/x86/include/asm/xen/grant_table.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _ASM_X86_XEN_GRANT_TABLE_H -#define _ASM_X86_XEN_GRANT_TABLE_H - -#define xen_alloc_vm_area(size) alloc_vm_area(size) -#define xen_free_vm_area(area) free_vm_area(area) - -#endif /* _ASM_X86_XEN_GRANT_TABLE_H */ diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 49ba9b5..6bbfd7a 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -71,7 +71,7 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, if (shared == NULL) { struct vm_struct *area = - xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes); + alloc_vm_area(PAGE_SIZE * max_nr_gframes); BUG_ON(area == NULL); shared = area->addr; *__shared = shared; -- cgit v1.1 From a3e06bbe8445f57eb949e6474c5a9b30f24d2057 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Thu, 22 Sep 2011 16:55:52 +0800 Subject: KVM: emulate lapic tsc deadline timer for guest This patch emulate lapic tsc deadline timer for guest: Enumerate tsc deadline timer capability by CPUID; Enable tsc deadline timer mode by lapic MMIO; Start tsc deadline timer by WRMSR; [jan: use do_div()] [avi: fix for !irqchip_in_kernel()] [marcelo: another fix for !irqchip_in_kernel()] Signed-off-by: Liu, Jinsong Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/kvm_timer.h | 2 + arch/x86/kvm/lapic.c | 142 ++++++++++++++++++++++++++++++++-------- arch/x86/kvm/lapic.h | 3 + arch/x86/kvm/x86.c | 21 +++++- 5 files changed, 140 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ab62711..b4973f4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -674,6 +674,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; +u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); + /* control of guest tsc rate supported? */ extern bool kvm_has_tsc_control; /* minimum supported tsc_khz for guests */ diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h index 64bc6ea..497dbaa 100644 --- a/arch/x86/kvm/kvm_timer.h +++ b/arch/x86/kvm/kvm_timer.h @@ -2,6 +2,8 @@ struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ + u32 timer_mode_mask; + u64 tscdeadline; atomic_t pending; /* accumulated triggered timers */ bool reinject; struct kvm_timer_ops *t_ops; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2fb20ca..54abb40 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -138,9 +138,23 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; } +static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) +{ + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); +} + static inline int apic_lvtt_period(struct kvm_lapic *apic) { - return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); +} + +static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) +{ + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == + APIC_LVT_TIMER_TSCDEADLINE); } static inline int apic_lvt_nmi_mode(u32 lvt_val) @@ -169,7 +183,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) } static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { - LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ + LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ LINT_MASK, LINT_MASK, /* LVT0-1 */ @@ -572,6 +586,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) break; case APIC_TMCCT: /* Timer CCR */ + if (apic_lvtt_tscdeadline(apic)) + return 0; + val = apic_get_tmcct(apic); break; @@ -666,37 +683,40 @@ static void update_divide_count(struct kvm_lapic *apic) static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now = apic->lapic_timer.timer.base->get_time(); - - apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) * - APIC_BUS_CYCLE_NS * apic->divide_count; + ktime_t now; atomic_set(&apic->lapic_timer.pending, 0); - if (!apic->lapic_timer.period) - return; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (apic_lvtt_period(apic)) { - s64 min_period = min_timer_period_us * 1000LL; - - if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " - "lapic timer period limited to %lld ns\n", - apic->vcpu->vcpu_id, apic->lapic_timer.period, - min_period); - apic->lapic_timer.period = min_period; + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { + /* lapic timer in oneshot or peroidic mode */ + now = apic->lapic_timer.timer.base->get_time(); + apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + + if (!apic->lapic_timer.period) + return; + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } } - } - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS); + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, apic->lapic_timer.period), + HRTIMER_MODE_ABS); - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " "timer initial count 0x%x, period %lldns, " "expire @ 0x%016" PRIx64 ".\n", __func__, @@ -705,6 +725,30 @@ static void start_apic_timer(struct kvm_lapic *apic) apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); + } else if (apic_lvtt_tscdeadline(apic)) { + /* lapic timer in tsc deadline mode */ + u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + u64 ns = 0; + struct kvm_vcpu *vcpu = apic->vcpu; + unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); + unsigned long flags; + + if (unlikely(!tscdeadline || !this_tsc_khz)) + return; + + local_irq_save(flags); + + now = apic->lapic_timer.timer.base->get_time(); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + if (likely(tscdeadline > guest_tsc)) { + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + } + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + + local_irq_restore(flags); + } } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -792,7 +836,6 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVT0: apic_manage_nmi_watchdog(apic, val); - case APIC_LVTT: case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: @@ -806,7 +849,22 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; + case APIC_LVTT: + if ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) != + (val & apic->lapic_timer.timer_mode_mask)) + hrtimer_cancel(&apic->lapic_timer.timer); + + if (!apic_sw_enabled(apic)) + val |= APIC_LVT_MASKED; + val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); + apic_set_reg(apic, APIC_LVTT, val); + break; + case APIC_TMICT: + if (apic_lvtt_tscdeadline(apic)) + break; + hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); @@ -902,6 +960,32 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) *---------------------------------------------------------------------- */ +u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + if (!apic) + return 0; + + if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + return 0; + + return apic->lapic_timer.tscdeadline; +} + +void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + if (!apic) + return; + + if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + return; + + hrtimer_cancel(&apic->lapic_timer.timer); + apic->lapic_timer.tscdeadline = data; + start_apic_timer(apic); +} + void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 8287243..138e8cc 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -42,6 +42,9 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu); bool kvm_apic_present(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); +void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); + void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d51e407..cf26909 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -600,6 +600,8 @@ static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + struct kvm_lapic *apic = vcpu->arch.apic; + u32 timer_mode_mask; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (!best) @@ -611,6 +613,16 @@ static void update_cpuid(struct kvm_vcpu *vcpu) if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) best->ecx |= bit(X86_FEATURE_OSXSAVE); } + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + best->function == 0x1) { + best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER); + timer_mode_mask = 3 << 17; + } else + timer_mode_mask = 1 << 17; + + if (apic) + apic->lapic_timer.timer_mode_mask = timer_mode_mask; } int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -826,6 +838,7 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { + MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, @@ -1001,7 +1014,7 @@ static inline int kvm_tsc_changes_freq(void) return ret; } -static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) { if (vcpu->arch.virtual_tsc_khz) return vcpu->arch.virtual_tsc_khz; @@ -1565,6 +1578,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_write(vcpu, msr, data); + case MSR_IA32_TSCDEADLINE: + kvm_set_lapic_tscdeadline_msr(vcpu, data); + break; case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -1894,6 +1910,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_read(vcpu, msr, pdata); break; + case MSR_IA32_TSCDEADLINE: + data = kvm_get_lapic_tscdeadline_msr(vcpu); + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; -- cgit v1.1 From 011af857847a7ebe1f45342b29ff9f390515a4b8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 5 Oct 2011 14:01:17 +0200 Subject: perf, amd: Use GO/HO bits in perf-ctr The AMD perf-counters support counting in guest or host-mode only. Make use of that feature when user-space specified guest/host-mode only counting. Signed-off-by: Joerg Roedel Signed-off-by: Gleb Natapov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1317816084-18026-3-git-send-email-gleb@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 3 +++ arch/x86/kernel/cpu/perf_event_amd.c | 13 +++++++++++++ 2 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 094fb30..ce2bfb3 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -29,6 +29,9 @@ #define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL +#define AMD_PERFMON_EVENTSEL_GUESTONLY (1ULL << 40) +#define AMD_PERFMON_EVENTSEL_HOSTONLY (1ULL << 41) + #define AMD64_EVENTSEL_EVENT \ (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32)) #define INTEL_ARCH_EVENT_MASK \ diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 384450d..db8e603 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -138,6 +138,19 @@ static int amd_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (event->attr.exclude_host && event->attr.exclude_guest) + /* + * When HO == GO == 1 the hardware treats that as GO == HO == 0 + * and will count in both modes. We don't want to count in that + * case so we emulate no-counting by setting US = OS = 0. + */ + event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | + ARCH_PERFMON_EVENTSEL_OS); + else if (event->attr.exclude_host) + event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY; + else if (event->attr.exclude_guest) + event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY; + if (event->attr.type != PERF_TYPE_RAW) return 0; -- cgit v1.1 From 29cf7a30f8a0ce4af2406d93d5a332099be26923 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Wed, 31 Aug 2011 17:07:10 +0200 Subject: x86/PCI: use host bridge _CRS info on ASUS M2V-MX SE In summary, this DMI quirk uses the _CRS info by default for the ASUS M2V-MX SE by turning on `pci=use_crs` and is similar to the quirk added by commit 2491762cfb47 ("x86/PCI: use host bridge _CRS info on ASRock ALiveSATA2-GLAN") whose commit message should be read for further information. Since commit 3e3da00c01d0 ("x86/pci: AMD one chain system to use pci read out res") Linux gives the following oops: parport0: PC-style at 0x378, irq 7 [PCSPP,TRISTATE] HDA Intel 0000:20:01.0: PCI INT A -> GSI 17 (level, low) -> IRQ 17 HDA Intel 0000:20:01.0: setting latency timer to 64 BUG: unable to handle kernel paging request at ffffc90011c08000 IP: [] azx_probe+0x3ad/0x86b [snd_hda_intel] PGD 13781a067 PUD 13781b067 PMD 1300ba067 PTE 800000fd00000173 Oops: 0009 [#1] SMP last sysfs file: /sys/module/snd_pcm/initstate CPU 0 Modules linked in: snd_hda_intel(+) snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_midi snd_rawmidi snd_seq_midi_event tpm_tis tpm snd_seq tpm_bios psmouse parport_pc snd_timer snd_seq_device parport processor evdev snd i2c_viapro thermal_sys amd64_edac_mod k8temp i2c_core soundcore shpchp pcspkr serio_raw asus_atk0110 pci_hotplug edac_core button snd_page_alloc edac_mce_amd ext3 jbd mbcache sha256_generic cryptd aes_x86_64 aes_generic cbc dm_crypt dm_mod raid1 md_mod usbhid hid sg sd_mod crc_t10dif sr_mod cdrom ata_generic uhci_hcd sata_via pata_via libata ehci_hcd usbcore scsi_mod via_rhine mii nls_base [last unloaded: scsi_wait_scan] Pid: 1153, comm: work_for_cpu Not tainted 2.6.37-1-amd64 #1 M2V-MX SE/System Product Name RIP: 0010:[] [] azx_probe+0x3ad/0x86b [snd_hda_intel] RSP: 0018:ffff88013153fe50 EFLAGS: 00010286 RAX: ffffc90011c08000 RBX: ffff88013029ec00 RCX: 0000000000000006 RDX: 0000000000000000 RSI: 0000000000000246 RDI: 0000000000000246 RBP: ffff88013341d000 R08: 0000000000000000 R09: 0000000000000040 R10: 0000000000000286 R11: 0000000000003731 R12: ffff88013029c400 R13: 0000000000000000 R14: 0000000000000000 R15: ffff88013341d090 FS: 0000000000000000(0000) GS:ffff8800bfc00000(0000) knlGS:00000000f7610ab0 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffffc90011c08000 CR3: 0000000132f57000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process work_for_cpu (pid: 1153, threadinfo ffff88013153e000, task ffff8801303c86c0) Stack: 0000000000000005 ffffffff8123ad65 00000000000136c0 ffff88013029c400 ffff8801303c8998 ffff88013341d000 ffff88013341d090 ffff8801322d9dc8 ffff88013341d208 0000000000000000 0000000000000000 ffffffff811ad232 Call Trace: [] ? __pm_runtime_set_status+0x162/0x186 [] ? local_pci_probe+0x49/0x92 [] ? do_work_for_cpu+0x0/0x1b [] ? do_work_for_cpu+0x0/0x1b [] ? do_work_for_cpu+0xb/0x1b [] ? kthread+0x7a/0x82 [] ? kernel_thread_helper+0x4/0x10 [] ? kthread+0x0/0x82 [] ? kernel_thread_helper+0x0/0x10 Code: f4 01 00 00 ef 31 f6 48 89 df e8 29 dd ff ff 85 c0 0f 88 2b 03 00 00 48 89 ef e8 b4 39 c3 e0 8b 7b 40 e8 fc 9d b1 e0 48 8b 43 38 <66> 8b 10 66 89 14 24 8b 43 14 83 e8 03 83 f8 01 77 32 31 d2 be RIP [] azx_probe+0x3ad/0x86b [snd_hda_intel] RSP CR2: ffffc90011c08000 ---[ end trace 8d1f3ebc136437fd ]--- Trusting the ACPI _CRS information (`pci=use_crs`) fixes this problem. $ dmesg | grep -i crs # with the quirk PCI: Using host bridge windows from ACPI; if necessary, use "pci=nocrs" and report a bug The match has to be against the DMI board entries though since the vendor entries are not populated. DMI: System manufacturer System Product Name/M2V-MX SE, BIOS 0304 10/30/2007 This quirk should be removed when `pci=use_crs` is enabled for machines from 2006 or earlier or some other solution is implemented. Using coreboot [1] with this board the problem does not exist but this quirk also does not affect it either. To be safe though the check is tightened to only take effect when the BIOS from American Megatrends is used. 15:13 < ruik> but coreboot does not need that 15:13 < ruik> because i have there only one root bus 15:13 < ruik> the audio is behind a bridge $ sudo dmidecode BIOS Information Vendor: American Megatrends Inc. Version: 0304 Release Date: 10/30/2007 [1] http://www.coreboot.org/ Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=30552 Cc: stable@kernel.org (2.6.34) Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: x86@kernel.org Signed-off-by: Paul Menzel Signed-off-by: Bjorn Helgaas Acked-by: Jesse Barnes Signed-off-by: Linus Torvalds --- arch/x86/pci/acpi.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 039d913..404f21a 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -43,6 +43,17 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"), }, }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=30552 */ + /* 2006 AMD HT/VIA system with two host bridges */ + { + .callback = set_use_crs, + .ident = "ASUS M2V-MX SE", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "M2V-MX SE"), + DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + }, + }, {} }; -- cgit v1.1 From 144d31e6f1902a39bc95754d820d356722697850 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 5 Oct 2011 14:01:21 +0200 Subject: perf, intel: Use GO/HO bits in perf-ctr Intel does not have guest/host-only bit in perf counters like AMD does. To support GO/HO bits KVM needs to switch EVENTSELn values (or PERF_GLOBAL_CTRL if available) at a guest entry. If a counter is configured to count only in a guest mode it stays disabled in a host, but VMX is configured to switch it to enabled value during guest entry. This patch adds GO/HO tracking to Intel perf code and provides interface for KVM to get a list of MSRs that need to be switched on a guest entry. Only cpus with architectural PMU (v1 or later) are supported with this patch. To my knowledge there is not p6 models with VMX but without architectural PMU and p4 with VMX are rare and the interface is general enough to support them if need arise. Signed-off-by: Gleb Natapov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1317816084-18026-7-git-send-email-gleb@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 12 +++++ arch/x86/kernel/cpu/perf_event.h | 12 +++++ arch/x86/kernel/cpu/perf_event_intel.c | 91 ++++++++++++++++++++++++++++++++-- 3 files changed, 112 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index ce2bfb3..e47cb61 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -162,7 +162,19 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); ); \ } +struct perf_guest_switch_msr { + unsigned msr; + u64 host, guest; +}; + +extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); #else +static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + *nr = 0; + return NULL; +} + static inline void perf_events_lapic_init(void) { } #endif diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index fb330b0..b9698d4 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -131,6 +131,13 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; /* + * Intel host/guest exclude bits + */ + u64 intel_ctrl_guest_mask; + u64 intel_ctrl_host_mask; + struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; + + /* * manage shared (per-core, per-cpu) registers * used on Intel NHM/WSM/SNB */ @@ -295,6 +302,11 @@ struct x86_pmu { */ struct extra_reg *extra_regs; unsigned int er_flags; + + /* + * Intel host/guest support (KVM) + */ + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); }; #define ERF_NO_HT_SHARING 1 diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 61fa357..e09ca20 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -749,7 +749,8 @@ static void intel_pmu_enable_all(int added) intel_pmu_pebs_enable_all(); intel_pmu_lbr_enable_all(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, + x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = @@ -872,6 +873,7 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); @@ -879,6 +881,9 @@ static void intel_pmu_disable_event(struct perf_event *event) return; } + cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); + cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; @@ -924,6 +929,7 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) static void intel_pmu_enable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { if (!__this_cpu_read(cpu_hw_events.enabled)) @@ -933,6 +939,11 @@ static void intel_pmu_enable_event(struct perf_event *event) return; } + if (event->attr.exclude_host) + cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); + if (event->attr.exclude_guest) + cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc); return; @@ -1302,12 +1313,84 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + if (x86_pmu.guest_get_msrs) + return x86_pmu.guest_get_msrs(nr); + *nr = 0; + return NULL; +} +EXPORT_SYMBOL_GPL(perf_guest_get_msrs); + +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + + arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; + arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; + arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + + *nr = 1; + return arr; +} + +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + + arr[idx].msr = x86_pmu_config_addr(idx); + arr[idx].host = arr[idx].guest = 0; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + arr[idx].host = arr[idx].guest = + event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE; + + if (event->attr.exclude_host) + arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + else if (event->attr.exclude_guest) + arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + } + + *nr = x86_pmu.num_counters; + return arr; +} + +static void core_pmu_enable_event(struct perf_event *event) +{ + if (!event->attr.exclude_host) + x86_pmu_enable_event(event); +} + +static void core_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; + + if (!test_bit(idx, cpuc->active_mask) || + cpuc->events[idx]->attr.exclude_host) + continue; + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } +} + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, .disable = x86_pmu_disable_event, .hw_config = x86_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -1325,6 +1408,7 @@ static __initconst const struct x86_pmu core_pmu = { .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, .event_constraints = intel_core_event_constraints, + .guest_get_msrs = core_guest_get_msrs, }; struct intel_shared_regs *allocate_shared_regs(int cpu) @@ -1431,6 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, + .guest_get_msrs = intel_guest_get_msrs, }; static void intel_clovertown_quirks(void) -- cgit v1.1 From 1d48922c14b6363f6d5febb12464d804bb5cc53f Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 30 Sep 2011 15:06:19 -0400 Subject: x86, nmi: Split out nmi from traps.c The nmi stuff is changing a lot and adding more functionality. Split it out from the traps.c file so it doesn't continue to pollute that file. This makes it easier to find and expand all the future nmi related work. No real functional changes here. Signed-off-by: Don Zickus Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1317409584-23662-2-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/nmi.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 155 ----------------------------------------- 3 files changed, 179 insertions(+), 156 deletions(-) create mode 100644 arch/x86/kernel/nmi.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 82f2912..8baca3c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -19,7 +19,7 @@ endif obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time.o ioport.o ldt.o dumpstack.o +obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c new file mode 100644 index 0000000..68d758a --- /dev/null +++ b/arch/x86/kernel/nmi.c @@ -0,0 +1,178 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * Handle hardware traps and faults. + */ +#include +#include +#include +#include + +#if defined(CONFIG_EDAC) +#include +#endif + +#include +#include +#include + +static int ignore_nmis; + +int unknown_nmi_panic; +/* + * Prevent NMI reason port (0x61) being accessed simultaneously, can + * only be used in NMI handler. + */ +static DEFINE_RAW_SPINLOCK(nmi_reason_lock); + +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + +static notrace __kprobes void +pci_serr_error(unsigned char reason, struct pt_regs *regs) +{ + pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + /* + * On some machines, PCI SERR line is used to report memory + * errors. EDAC makes use of it. + */ +#if defined(CONFIG_EDAC) + if (edac_handler_set()) { + edac_atomic_assert_error(); + return; + } +#endif + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + pr_emerg("Dazed and confused, but trying to continue\n"); + + /* Clear and disable the PCI SERR error line. */ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; + outb(reason, NMI_REASON_PORT); +} + +static notrace __kprobes void +io_check_error(unsigned char reason, struct pt_regs *regs) +{ + unsigned long i; + + pr_emerg( + "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", + reason, smp_processor_id()); + show_registers(regs); + + if (panic_on_io_nmi) + panic("NMI IOCK error: Not continuing"); + + /* Re-enable the IOCK line, wait for a few seconds */ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); + + i = 20000; + while (--i) { + touch_nmi_watchdog(); + udelay(100); + } + + reason &= ~NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); +} + +static notrace __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs *regs) +{ + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == + NOTIFY_STOP) + return; +#ifdef CONFIG_MCA + /* + * Might actually be able to figure out what the guilty party + * is: + */ + if (MCA_bus) { + mca_handle_nmi(); + return; + } +#endif + pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + pr_emerg("Do you have a strange power saving mode enabled?\n"); + if (unknown_nmi_panic || panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + pr_emerg("Dazed and confused, but trying to continue\n"); +} + +static notrace __kprobes void default_do_nmi(struct pt_regs *regs) +{ + unsigned char reason = 0; + + /* + * CPU-specific NMI must be processed before non-CPU-specific + * NMI, otherwise we may lose it, because the CPU-specific + * NMI can not be detected/processed on other CPUs. + */ + if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ + raw_spin_lock(&nmi_reason_lock); + reason = get_nmi_reason(); + + if (reason & NMI_REASON_MASK) { + if (reason & NMI_REASON_SERR) + pci_serr_error(reason, regs); + else if (reason & NMI_REASON_IOCHK) + io_check_error(reason, regs); +#ifdef CONFIG_X86_32 + /* + * Reassert NMI in case it became active + * meanwhile as it's edge-triggered: + */ + reassert_nmi(); +#endif + raw_spin_unlock(&nmi_reason_lock); + return; + } + raw_spin_unlock(&nmi_reason_lock); + + unknown_nmi_error(reason, regs); +} + +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) +{ + nmi_enter(); + + inc_irq_stat(__nmi_count); + + if (!ignore_nmis) + default_do_nmi(regs); + + nmi_exit(); +} + +void stop_nmi(void) +{ + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; +} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6913369..a8e3eb8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -81,15 +81,6 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); -static int ignore_nmis; - -int unknown_nmi_panic; -/* - * Prevent NMI reason port (0x61) being accessed simultaneously, can - * only be used in NMI handler. - */ -static DEFINE_RAW_SPINLOCK(nmi_reason_lock); - static inline void conditional_sti(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) @@ -307,152 +298,6 @@ gp_in_kernel: die("general protection fault", regs, error_code); } -static int __init setup_unknown_nmi_panic(char *str) -{ - unknown_nmi_panic = 1; - return 1; -} -__setup("unknown_nmi_panic", setup_unknown_nmi_panic); - -static notrace __kprobes void -pci_serr_error(unsigned char reason, struct pt_regs *regs) -{ - pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - /* - * On some machines, PCI SERR line is used to report memory - * errors. EDAC makes use of it. - */ -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - pr_emerg("Dazed and confused, but trying to continue\n"); - - /* Clear and disable the PCI SERR error line. */ - reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; - outb(reason, NMI_REASON_PORT); -} - -static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs *regs) -{ - unsigned long i; - - pr_emerg( - "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", - reason, smp_processor_id()); - show_registers(regs); - - if (panic_on_io_nmi) - panic("NMI IOCK error: Not continuing"); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; - outb(reason, NMI_REASON_PORT); - - i = 20000; - while (--i) { - touch_nmi_watchdog(); - udelay(100); - } - - reason &= ~NMI_REASON_CLEAR_IOCHK; - outb(reason, NMI_REASON_PORT); -} - -static notrace __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs *regs) -{ - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == - NOTIFY_STOP) - return; -#ifdef CONFIG_MCA - /* - * Might actually be able to figure out what the guilty party - * is: - */ - if (MCA_bus) { - mca_handle_nmi(); - return; - } -#endif - pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - pr_emerg("Do you have a strange power saving mode enabled?\n"); - if (unknown_nmi_panic || panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - pr_emerg("Dazed and confused, but trying to continue\n"); -} - -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - - /* - * CPU-specific NMI must be processed before non-CPU-specific - * NMI, otherwise we may lose it, because the CPU-specific - * NMI can not be detected/processed on other CPUs. - */ - if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ - raw_spin_lock(&nmi_reason_lock); - reason = get_nmi_reason(); - - if (reason & NMI_REASON_MASK) { - if (reason & NMI_REASON_SERR) - pci_serr_error(reason, regs); - else if (reason & NMI_REASON_IOCHK) - io_check_error(reason, regs); -#ifdef CONFIG_X86_32 - /* - * Reassert NMI in case it became active - * meanwhile as it's edge-triggered: - */ - reassert_nmi(); -#endif - raw_spin_unlock(&nmi_reason_lock); - return; - } - raw_spin_unlock(&nmi_reason_lock); - - unknown_nmi_error(reason, regs); -} - -dotraplinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_enter(); - - inc_irq_stat(__nmi_count); - - if (!ignore_nmis) - default_do_nmi(regs); - - nmi_exit(); -} - -void stop_nmi(void) -{ - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; -} - /* May run on IST stack. */ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { -- cgit v1.1 From c9126b2ee8adb9235941cedbf558d39a9e65642d Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 30 Sep 2011 15:06:20 -0400 Subject: x86, nmi: Create new NMI handler routines The NMI handlers used to rely on the notifier infrastructure. This worked great until we wanted to support handling multiple events better. One of the key ideas to the nmi handling is to process _all_ the handlers for each NMI. The reason behind this switch is because NMIs are edge triggered. If enough NMIs are triggered, then they could be lost because the cpu can only latch at most one NMI (besides the one currently being processed). In order to deal with this we have decided to process all the NMI handlers for each NMI. This allows the handlers to determine if they recieved an event or not (the ones that can not determine this will be left to fend for themselves on the unknown NMI list). As a result of this change it is now possible to have an extra NMI that was destined to be received for an already processed event. Because the event was processed in the previous NMI, this NMI gets dropped and becomes an 'unknown' NMI. This of course will cause printks that scare people. However, we prefer to have extra NMIs as opposed to losing NMIs and as such are have developed a basic mechanism to catch most of them. That will be a later patch. To accomplish this idea, I unhooked the nmi handlers from the notifier routines and created a new mechanism loosely based on doIRQ. The reason for this is the notifier routines have a couple of shortcomings. One we could't guarantee all future NMI handlers used NOTIFY_OK instead of NOTIFY_STOP. Second, we couldn't keep track of the number of events being handled in each routine (most only handle one, perf can handle more than one). Third, I wanted to eventually display which nmi handlers are registered in the system in /proc/interrupts to help see who is generating NMIs. The patch below just implements the new infrastructure but doesn't wire it up yet (that is the next patch). Its design is based on doIRQ structs and the atomic notifier routines. So the rcu stuff in the patch isn't entirely untested (as the notifier routines have soaked it) but it should be double checked in case I copied the code wrong. Signed-off-by: Don Zickus Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1317409584-23662-3-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 18 ++++++ arch/x86/kernel/nmi.c | 153 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 4886a68..480b69b 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -42,6 +42,24 @@ void arch_trigger_all_cpu_backtrace(void); #define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR) #define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR) +#define NMI_FLAG_FIRST 1 + +enum { + NMI_LOCAL=0, + NMI_UNKNOWN, + NMI_MAX +}; + +#define NMI_DONE 0 +#define NMI_HANDLED 1 + +typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *); + +int register_nmi_handler(unsigned int, nmi_handler_t, unsigned long, + const char *); + +void unregister_nmi_handler(unsigned int, const char *); + void stop_nmi(void); void restart_nmi(void); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 68d758a..327748d 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -13,6 +13,9 @@ #include #include #include +#include +#include +#include #if defined(CONFIG_EDAC) #include @@ -21,6 +24,33 @@ #include #include #include +#include + +#define NMI_MAX_NAMELEN 16 +struct nmiaction { + struct list_head list; + nmi_handler_t handler; + unsigned int flags; + char *name; +}; + +struct nmi_desc { + spinlock_t lock; + struct list_head head; +}; + +static struct nmi_desc nmi_desc[NMI_MAX] = +{ + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), + .head = LIST_HEAD_INIT(nmi_desc[0].head), + }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), + .head = LIST_HEAD_INIT(nmi_desc[1].head), + }, + +}; static int ignore_nmis; @@ -38,6 +68,129 @@ static int __init setup_unknown_nmi_panic(char *str) } __setup("unknown_nmi_panic", setup_unknown_nmi_panic); +#define nmi_to_desc(type) (&nmi_desc[type]) + +static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs) +{ + struct nmi_desc *desc = nmi_to_desc(type); + struct nmiaction *a; + int handled=0; + + rcu_read_lock(); + + /* + * NMIs are edge-triggered, which means if you have enough + * of them concurrently, you can lose some because only one + * can be latched at any given time. Walk the whole list + * to handle those situations. + */ + list_for_each_entry_rcu(a, &desc->head, list) { + + handled += a->handler(type, regs); + + } + + rcu_read_unlock(); + + /* return total number of NMI events handled */ + return handled; +} + +static int __setup_nmi(unsigned int type, struct nmiaction *action) +{ + struct nmi_desc *desc = nmi_to_desc(type); + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + + /* + * some handlers need to be executed first otherwise a fake + * event confuses some handlers (kdump uses this flag) + */ + if (action->flags & NMI_FLAG_FIRST) + list_add_rcu(&action->list, &desc->head); + else + list_add_tail_rcu(&action->list, &desc->head); + + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} + +static struct nmiaction *__free_nmi(unsigned int type, const char *name) +{ + struct nmi_desc *desc = nmi_to_desc(type); + struct nmiaction *n; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + + list_for_each_entry_rcu(n, &desc->head, list) { + /* + * the name passed in to describe the nmi handler + * is used as the lookup key + */ + if (!strcmp(n->name, name)) { + WARN(in_nmi(), + "Trying to free NMI (%s) from NMI context!\n", n->name); + list_del_rcu(&n->list); + break; + } + } + + spin_unlock_irqrestore(&desc->lock, flags); + synchronize_rcu(); + return (n); +} + +int register_nmi_handler(unsigned int type, nmi_handler_t handler, + unsigned long nmiflags, const char *devname) +{ + struct nmiaction *action; + int retval = -ENOMEM; + + if (!handler) + return -EINVAL; + + action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL); + if (!action) + goto fail_action; + + action->handler = handler; + action->flags = nmiflags; + action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL); + if (!action->name) + goto fail_action_name; + + retval = __setup_nmi(type, action); + + if (retval) + goto fail_setup_nmi; + + return retval; + +fail_setup_nmi: + kfree(action->name); +fail_action_name: + kfree(action); +fail_action: + + return retval; +} +EXPORT_SYMBOL_GPL(register_nmi_handler); + +void unregister_nmi_handler(unsigned int type, const char *name) +{ + struct nmiaction *a; + + a = __free_nmi(type, name); + if (a) { + kfree(a->name); + kfree(a); + } +} + +EXPORT_SYMBOL_GPL(unregister_nmi_handler); + static notrace __kprobes void pci_serr_error(unsigned char reason, struct pt_regs *regs) { -- cgit v1.1 From 9c48f1c629ecfa114850c03f875c6691003214de Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 30 Sep 2011 15:06:21 -0400 Subject: x86, nmi: Wire up NMI handlers to new routines Just convert all the files that have an nmi handler to the new routines. Most of it is straight forward conversion. A couple of places needed some tweaking like kgdb which separates the debug notifier from the nmi handler and mce removes a call to notify_die. [Thanks to Ying for finding out the history behind that mce call https://lkml.org/lkml/2010/5/27/114 And Boris responding that he would like to remove that call because of it https://lkml.org/lkml/2011/9/21/163] The things that get converted are the registeration/unregistration routines and the nmi handler itself has its args changed along with code removal to check which list it is on (most are on one NMI list except for kgdb which has both an NMI routine and an NMI Unknown routine). Signed-off-by: Don Zickus Signed-off-by: Peter Zijlstra Acked-by: Corey Minyard Cc: Jason Wessel Cc: Andi Kleen Cc: Robert Richter Cc: Huang Ying Cc: Corey Minyard Cc: Jack Steiner Link: http://lkml.kernel.org/r/1317409584-23662-4-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 20 ---------- arch/x86/include/asm/reboot.h | 2 +- arch/x86/kernel/apic/hw_nmi.c | 27 +++---------- arch/x86/kernel/apic/x2apic_uv_x.c | 20 ++-------- arch/x86/kernel/cpu/mcheck/mce-inject.c | 20 ++++------ arch/x86/kernel/cpu/mcheck/mce.c | 3 -- arch/x86/kernel/cpu/perf_event.c | 69 ++------------------------------- arch/x86/kernel/crash.c | 5 +-- arch/x86/kernel/kgdb.c | 60 +++++++++++++++++++++------- arch/x86/kernel/nmi.c | 11 ++++-- arch/x86/kernel/reboot.c | 23 ++++------- arch/x86/oprofile/nmi_int.c | 38 ++++++------------ arch/x86/oprofile/nmi_timer_int.c | 28 +++---------- 13 files changed, 99 insertions(+), 227 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 480b69b..5361095 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -22,26 +22,6 @@ void arch_trigger_all_cpu_backtrace(void); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif -/* - * Define some priorities for the nmi notifier call chain. - * - * Create a local nmi bit that has a higher priority than - * external nmis, because the local ones are more frequent. - * - * Also setup some default high/normal/low settings for - * subsystems to registers with. Using 4 bits to separate - * the priorities. This can go a lot higher if needed be. - */ - -#define NMI_LOCAL_SHIFT 16 /* randomly picked */ -#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT) -#define NMI_HIGH_PRIOR (1ULL << 8) -#define NMI_NORMAL_PRIOR (1ULL << 4) -#define NMI_LOW_PRIOR (1ULL << 0) -#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR) -#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR) -#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR) - #define NMI_FLAG_FIRST 1 enum { diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 3250e3d..92f29706 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -23,7 +23,7 @@ void machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 -typedef void (*nmi_shootdown_cb)(int, struct die_args*); +typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); #endif /* _ASM_X86_REBOOT_H */ diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index d5e57db0..31cb9ae 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -60,22 +60,10 @@ void arch_trigger_all_cpu_backtrace(void) } static int __kprobes -arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, - unsigned long cmd, void *__args) +arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { - struct die_args *args = __args; - struct pt_regs *regs; int cpu; - switch (cmd) { - case DIE_NMI: - break; - - default: - return NOTIFY_DONE; - } - - regs = args->regs; cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { @@ -86,21 +74,16 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, show_regs(regs); arch_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - return NOTIFY_STOP; + return NMI_HANDLED; } - return NOTIFY_DONE; + return NMI_DONE; } -static __read_mostly struct notifier_block backtrace_notifier = { - .notifier_call = arch_trigger_all_cpu_backtrace_handler, - .next = NULL, - .priority = NMI_LOCAL_LOW_PRIOR, -}; - static int __init register_trigger_all_cpu_backtrace(void) { - register_die_notifier(&backtrace_notifier); + register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler, + 0, "arch_bt"); return 0; } early_initcall(register_trigger_all_cpu_backtrace); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 34b1859..75be00e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -672,18 +672,11 @@ void __cpuinit uv_cpu_init(void) /* * When NMI is received, print a stack trace. */ -int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) +int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) { unsigned long real_uv_nmi; int bid; - if (reason != DIE_NMIUNKNOWN) - return NOTIFY_OK; - - if (in_crash_kexec) - /* do nothing if entering the crash kernel */ - return NOTIFY_OK; - /* * Each blade has an MMR that indicates when an NMI has been sent * to cpus on the blade. If an NMI is detected, atomically @@ -704,7 +697,7 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) } if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count)) - return NOTIFY_DONE; + return NMI_DONE; __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count; @@ -717,17 +710,12 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) dump_stack(); spin_unlock(&uv_nmi_lock); - return NOTIFY_STOP; + return NMI_HANDLED; } -static struct notifier_block uv_dump_stack_nmi_nb = { - .notifier_call = uv_handle_nmi, - .priority = NMI_LOCAL_LOW_PRIOR - 1, -}; - void uv_register_nmi_notifier(void) { - if (register_die_notifier(&uv_dump_stack_nmi_nb)) + if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv")) printk(KERN_WARNING "UV NMI handler failed to register\n"); } diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 0ed633c..6199232 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -78,27 +78,20 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) static cpumask_var_t mce_inject_cpumask; -static int mce_raise_notify(struct notifier_block *self, - unsigned long val, void *data) +static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) { - struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) - return NOTIFY_DONE; + if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) + return NMI_DONE; cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) - raise_exception(m, args->regs); + raise_exception(m, regs); else if (m->status) raise_poll(m); - return NOTIFY_STOP; + return NMI_HANDLED; } -static struct notifier_block mce_raise_nb = { - .notifier_call = mce_raise_notify, - .priority = NMI_LOCAL_NORMAL_PRIOR, -}; - /* Inject mce on current CPU */ static int raise_local(void) { @@ -216,7 +209,8 @@ static int inject_init(void) return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; - register_die_notifier(&mce_raise_nb); + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, + "mce_notify"); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5b5ccee..fce51ad1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -908,9 +908,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) percpu_inc(mce_exception_count); - if (notify_die(DIE_NMI, "machine check", regs, error_code, - 18, SIGKILL) == NOTIFY_STOP) - goto out; if (!banks) goto out; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8ab8911..6408910 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1058,76 +1058,15 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } -struct pmu_nmi_state { - unsigned int marked; - int handled; -}; - -static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi); - static int __kprobes -perf_event_nmi_handler(struct notifier_block *self, - unsigned long cmd, void *__args) +perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { - struct die_args *args = __args; - unsigned int this_nmi; - int handled; - if (!atomic_read(&active_events)) - return NOTIFY_DONE; - - switch (cmd) { - case DIE_NMI: - break; - case DIE_NMIUNKNOWN: - this_nmi = percpu_read(irq_stat.__nmi_count); - if (this_nmi != __this_cpu_read(pmu_nmi.marked)) - /* let the kernel handle the unknown nmi */ - return NOTIFY_DONE; - /* - * This one is a PMU back-to-back nmi. Two events - * trigger 'simultaneously' raising two back-to-back - * NMIs. If the first NMI handles both, the latter - * will be empty and daze the CPU. So, we drop it to - * avoid false-positive 'unknown nmi' messages. - */ - return NOTIFY_STOP; - default: - return NOTIFY_DONE; - } - - handled = x86_pmu.handle_irq(args->regs); - if (!handled) - return NOTIFY_DONE; + return NMI_DONE; - this_nmi = percpu_read(irq_stat.__nmi_count); - if ((handled > 1) || - /* the next nmi could be a back-to-back nmi */ - ((__this_cpu_read(pmu_nmi.marked) == this_nmi) && - (__this_cpu_read(pmu_nmi.handled) > 1))) { - /* - * We could have two subsequent back-to-back nmis: The - * first handles more than one counter, the 2nd - * handles only one counter and the 3rd handles no - * counter. - * - * This is the 2nd nmi because the previous was - * handling more than one counter. We will mark the - * next (3rd) and then drop it if unhandled. - */ - __this_cpu_write(pmu_nmi.marked, this_nmi + 1); - __this_cpu_write(pmu_nmi.handled, handled); - } - - return NOTIFY_STOP; + return x86_pmu.handle_irq(regs); } -static __read_mostly struct notifier_block perf_event_nmi_notifier = { - .notifier_call = perf_event_nmi_handler, - .next = NULL, - .priority = NMI_LOCAL_LOW_PRIOR, -}; - struct event_constraint emptyconstraint; struct event_constraint unconstrained; @@ -1232,7 +1171,7 @@ static int __init init_hw_perf_events(void) ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; perf_events_lapic_init(); - register_die_notifier(&perf_event_nmi_notifier); + register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 764c7c2..13ad899 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -32,15 +32,12 @@ int in_crash_kexec; #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) -static void kdump_nmi_callback(int cpu, struct die_args *args) +static void kdump_nmi_callback(int cpu, struct pt_regs *regs) { - struct pt_regs *regs; #ifdef CONFIG_X86_32 struct pt_regs fixed_regs; #endif - regs = args->regs; - #ifdef CONFIG_X86_32 if (!user_mode_vm(regs)) { crash_fixup_ss_esp(&fixed_regs, regs); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 00354d4..faba577 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -511,28 +511,37 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) static int was_in_debug_nmi[NR_CPUS]; -static int __kgdb_notify(struct die_args *args, unsigned long cmd) +static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) { - struct pt_regs *regs = args->regs; - switch (cmd) { - case DIE_NMI: + case NMI_LOCAL: if (atomic_read(&kgdb_active) != -1) { /* KGDB CPU roundup */ kgdb_nmicallback(raw_smp_processor_id(), regs); was_in_debug_nmi[raw_smp_processor_id()] = 1; touch_nmi_watchdog(); - return NOTIFY_STOP; + return NMI_HANDLED; } - return NOTIFY_DONE; + break; - case DIE_NMIUNKNOWN: + case NMI_UNKNOWN: if (was_in_debug_nmi[raw_smp_processor_id()]) { was_in_debug_nmi[raw_smp_processor_id()] = 0; - return NOTIFY_STOP; + return NMI_HANDLED; } - return NOTIFY_DONE; + break; + default: + /* do nothing */ + break; + } + return NMI_DONE; +} + +static int __kgdb_notify(struct die_args *args, unsigned long cmd) +{ + struct pt_regs *regs = args->regs; + switch (cmd) { case DIE_DEBUG: if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { if (user_mode(regs)) @@ -590,11 +599,6 @@ kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) static struct notifier_block kgdb_notifier = { .notifier_call = kgdb_notify, - - /* - * Lowest-prio notifier priority, we want to be notified last: - */ - .priority = NMI_LOCAL_LOW_PRIOR, }; /** @@ -605,7 +609,31 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { - return register_die_notifier(&kgdb_notifier); + int retval; + + retval = register_die_notifier(&kgdb_notifier); + if (retval) + goto out; + + retval = register_nmi_handler(NMI_LOCAL, kgdb_nmi_handler, + 0, "kgdb"); + if (retval) + goto out1; + + retval = register_nmi_handler(NMI_UNKNOWN, kgdb_nmi_handler, + 0, "kgdb"); + + if (retval) + goto out2; + + return retval; + +out2: + unregister_nmi_handler(NMI_LOCAL, "kgdb"); +out1: + unregister_die_notifier(&kgdb_notifier); +out: + return retval; } static void kgdb_hw_overflow_handler(struct perf_event *event, @@ -673,6 +701,8 @@ void kgdb_arch_exit(void) breakinfo[i].pev = NULL; } } + unregister_nmi_handler(NMI_UNKNOWN, "kgdb"); + unregister_nmi_handler(NMI_LOCAL, "kgdb"); unregister_die_notifier(&kgdb_notifier); } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 327748d..e20f5e7 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -1,6 +1,7 @@ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * Copyright (C) 2011 Don Zickus Red Hat, Inc. * * Pentium III FXSR, SSE support * Gareth Hughes , May 2000 @@ -248,8 +249,10 @@ io_check_error(unsigned char reason, struct pt_regs *regs) static notrace __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == - NOTIFY_STOP) + int handled; + + handled = nmi_handle(NMI_UNKNOWN, regs); + if (handled) return; #ifdef CONFIG_MCA /* @@ -274,13 +277,15 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + int handled; /* * CPU-specific NMI must be processed before non-CPU-specific * NMI, otherwise we may lose it, because the CPU-specific * NMI can not be detected/processed on other CPUs. */ - if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP) + handled = nmi_handle(NMI_LOCAL, regs); + if (handled) return; /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9242436..e334be1 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -464,7 +464,7 @@ static inline void kb_wait(void) } } -static void vmxoff_nmi(int cpu, struct die_args *args) +static void vmxoff_nmi(int cpu, struct pt_regs *regs) { cpu_emergency_vmxoff(); } @@ -736,14 +736,10 @@ static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct notifier_block *self, - unsigned long val, void *data) +static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) { int cpu; - if (val != DIE_NMI) - return NOTIFY_OK; - cpu = raw_smp_processor_id(); /* Don't do anything if this handler is invoked on crashing cpu. @@ -751,10 +747,10 @@ static int crash_nmi_callback(struct notifier_block *self, * an NMI if system was initially booted with nmi_watchdog parameter. */ if (cpu == crashing_cpu) - return NOTIFY_STOP; + return NMI_HANDLED; local_irq_disable(); - shootdown_callback(cpu, (struct die_args *)data); + shootdown_callback(cpu, regs); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -762,7 +758,7 @@ static int crash_nmi_callback(struct notifier_block *self, for (;;) cpu_relax(); - return 1; + return NMI_HANDLED; } static void smp_send_nmi_allbutself(void) @@ -770,12 +766,6 @@ static void smp_send_nmi_allbutself(void) apic->send_IPI_allbutself(NMI_VECTOR); } -static struct notifier_block crash_nmi_nb = { - .notifier_call = crash_nmi_callback, - /* we want to be the first one called */ - .priority = NMI_LOCAL_HIGH_PRIOR+1, -}; - /* Halt all other CPUs, calling the specified function on each of them * * This function can be used to halt all other CPUs on crash @@ -794,7 +784,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); /* Would it be better to replace the trap vector here? */ - if (register_die_notifier(&crash_nmi_nb)) + if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, + NMI_FLAG_FIRST, "crash")) return; /* return what? */ /* Ensure the new callback function is set before sending * out the NMI diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 68894fd..adf8fb3 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -61,26 +61,15 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, } -static int profile_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) +static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs) { - struct die_args *args = (struct die_args *)data; - int ret = NOTIFY_DONE; - - switch (val) { - case DIE_NMI: - if (ctr_running) - model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs)); - else if (!nmi_enabled) - break; - else - model->stop(&__get_cpu_var(cpu_msrs)); - ret = NOTIFY_STOP; - break; - default: - break; - } - return ret; + if (ctr_running) + model->check_ctrs(regs, &__get_cpu_var(cpu_msrs)); + else if (!nmi_enabled) + return NMI_DONE; + else + model->stop(&__get_cpu_var(cpu_msrs)); + return NMI_HANDLED; } static void nmi_cpu_save_registers(struct op_msrs *msrs) @@ -363,12 +352,6 @@ static void nmi_cpu_setup(void *dummy) apic_write(APIC_LVTPC, APIC_DM_NMI); } -static struct notifier_block profile_exceptions_nb = { - .notifier_call = profile_exceptions_notify, - .next = NULL, - .priority = NMI_LOCAL_LOW_PRIOR, -}; - static void nmi_cpu_restore_registers(struct op_msrs *msrs) { struct op_msr *counters = msrs->counters; @@ -508,7 +491,8 @@ static int nmi_setup(void) ctr_running = 0; /* make variables visible to the nmi handler: */ smp_mb(); - err = register_die_notifier(&profile_exceptions_nb); + err = register_nmi_handler(NMI_LOCAL, profile_exceptions_notify, + 0, "oprofile"); if (err) goto fail; @@ -538,7 +522,7 @@ static void nmi_shutdown(void) put_online_cpus(); /* make variables visible to the nmi handler: */ smp_mb(); - unregister_die_notifier(&profile_exceptions_nb); + unregister_nmi_handler(NMI_LOCAL, "oprofile"); msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c index 720bf5a..7f8052c 100644 --- a/arch/x86/oprofile/nmi_timer_int.c +++ b/arch/x86/oprofile/nmi_timer_int.c @@ -18,32 +18,16 @@ #include #include -static int profile_timer_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) +static int profile_timer_exceptions_notify(unsigned int val, struct pt_regs *regs) { - struct die_args *args = (struct die_args *)data; - int ret = NOTIFY_DONE; - - switch (val) { - case DIE_NMI: - oprofile_add_sample(args->regs, 0); - ret = NOTIFY_STOP; - break; - default: - break; - } - return ret; + oprofile_add_sample(regs, 0); + return NMI_HANDLED; } -static struct notifier_block profile_timer_exceptions_nb = { - .notifier_call = profile_timer_exceptions_notify, - .next = NULL, - .priority = NMI_LOW_PRIOR, -}; - static int timer_start(void) { - if (register_die_notifier(&profile_timer_exceptions_nb)) + if (register_nmi_handler(NMI_LOCAL, profile_timer_exceptions_notify, + 0, "oprofile-timer")) return 1; return 0; } @@ -51,7 +35,7 @@ static int timer_start(void) static void timer_stop(void) { - unregister_die_notifier(&profile_timer_exceptions_nb); + unregister_nmi_handler(NMI_LOCAL, "oprofile-timer"); synchronize_sched(); /* Allow already-started NMIs to complete. */ } -- cgit v1.1 From b227e23399dc59977aa42c49bd668bdab7a61812 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 30 Sep 2011 15:06:22 -0400 Subject: x86, nmi: Add in logic to handle multiple events and unknown NMIs Previous patches allow the NMI subsystem to process multipe NMI events in one NMI. As previously discussed this can cause issues when an event triggered another NMI but is processed in the current NMI. This causes the next NMI to go unprocessed and become an 'unknown' NMI. To handle this, we first have to flag whether or not the NMI handler handled more than one event or not. If it did, then there exists a chance that the next NMI might be already processed. Once the NMI is flagged as a candidate to be swallowed, we next look for a back-to-back NMI condition. This is determined by looking at the %rip from pt_regs. If it is the same as the previous NMI, it is assumed the cpu did not have a chance to jump back into a non-NMI context and execute code and instead handled another NMI. If both of those conditions are true then we will swallow any unknown NMI. There still exists a chance that we accidentally swallow a real unknown NMI, but for now things seem better. An optimization has also been added to the nmi notifier rountine. Because x86 can latch up to one NMI while currently processing an NMI, we don't have to worry about executing _all_ the handlers in a standalone NMI. The idea is if multiple NMIs come in, the second NMI will represent them. For those back-to-back NMI cases, we have the potentail to drop NMIs. Therefore only execute all the handlers in the second half of a detected back-to-back NMI. Signed-off-by: Don Zickus Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1317409584-23662-5-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 1 + arch/x86/kernel/nmi.c | 97 ++++++++++++++++++++++++++++++++++++++++---- arch/x86/kernel/process_32.c | 2 + arch/x86/kernel/process_64.c | 2 + 4 files changed, 93 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5361095..fd3f9f1 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -42,5 +42,6 @@ void unregister_nmi_handler(unsigned int, const char *); void stop_nmi(void); void restart_nmi(void); +void local_touch_nmi(void); #endif /* _ASM_X86_NMI_H */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e20f5e7..35b3959 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -71,7 +71,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic); #define nmi_to_desc(type) (&nmi_desc[type]) -static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs) +static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *a; @@ -85,12 +85,9 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs) * can be latched at any given time. Walk the whole list * to handle those situations. */ - list_for_each_entry_rcu(a, &desc->head, list) { - + list_for_each_entry_rcu(a, &desc->head, list) handled += a->handler(type, regs); - } - rcu_read_unlock(); /* return total number of NMI events handled */ @@ -105,6 +102,13 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action) spin_lock_irqsave(&desc->lock, flags); /* + * most handlers of type NMI_UNKNOWN never return because + * they just assume the NMI is theirs. Just a sanity check + * to manage expectations + */ + WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); + + /* * some handlers need to be executed first otherwise a fake * event confuses some handlers (kdump uses this flag) */ @@ -251,7 +255,13 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { int handled; - handled = nmi_handle(NMI_UNKNOWN, regs); + /* + * Use 'false' as back-to-back NMIs are dealt with one level up. + * Of course this makes having multiple 'unknown' handlers useless + * as only the first one is ever run (unless it can actually determine + * if it caused the NMI) + */ + handled = nmi_handle(NMI_UNKNOWN, regs, false); if (handled) return; #ifdef CONFIG_MCA @@ -274,19 +284,49 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Dazed and confused, but trying to continue\n"); } +static DEFINE_PER_CPU(bool, swallow_nmi); +static DEFINE_PER_CPU(unsigned long, last_nmi_rip); + static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; + bool b2b = false; /* * CPU-specific NMI must be processed before non-CPU-specific * NMI, otherwise we may lose it, because the CPU-specific * NMI can not be detected/processed on other CPUs. */ - handled = nmi_handle(NMI_LOCAL, regs); - if (handled) + + /* + * Back-to-back NMIs are interesting because they can either + * be two NMI or more than two NMIs (any thing over two is dropped + * due to NMI being edge-triggered). If this is the second half + * of the back-to-back NMI, assume we dropped things and process + * more handlers. Otherwise reset the 'swallow' NMI behaviour + */ + if (regs->ip == __this_cpu_read(last_nmi_rip)) + b2b = true; + else + __this_cpu_write(swallow_nmi, false); + + __this_cpu_write(last_nmi_rip, regs->ip); + + handled = nmi_handle(NMI_LOCAL, regs, b2b); + if (handled) { + /* + * There are cases when a NMI handler handles multiple + * events in the current NMI. One of these events may + * be queued for in the next NMI. Because the event is + * already handled, the next NMI will result in an unknown + * NMI. Instead lets flag this for a potential NMI to + * swallow. + */ + if (handled > 1) + __this_cpu_write(swallow_nmi, true); return; + } /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ raw_spin_lock(&nmi_reason_lock); @@ -309,7 +349,40 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) } raw_spin_unlock(&nmi_reason_lock); - unknown_nmi_error(reason, regs); + /* + * Only one NMI can be latched at a time. To handle + * this we may process multiple nmi handlers at once to + * cover the case where an NMI is dropped. The downside + * to this approach is we may process an NMI prematurely, + * while its real NMI is sitting latched. This will cause + * an unknown NMI on the next run of the NMI processing. + * + * We tried to flag that condition above, by setting the + * swallow_nmi flag when we process more than one event. + * This condition is also only present on the second half + * of a back-to-back NMI, so we flag that condition too. + * + * If both are true, we assume we already processed this + * NMI previously and we swallow it. Otherwise we reset + * the logic. + * + * There are scenarios where we may accidentally swallow + * a 'real' unknown NMI. For example, while processing + * a perf NMI another perf NMI comes in along with a + * 'real' unknown NMI. These two NMIs get combined into + * one (as descibed above). When the next NMI gets + * processed, it will be flagged by perf as handled, but + * noone will know that there was a 'real' unknown NMI sent + * also. As a result it gets swallowed. Or if the first + * perf NMI returns two events handled then the second + * NMI will get eaten by the logic below, again losing a + * 'real' unknown NMI. But this is the best we can do + * for now. + */ + if (b2b && __this_cpu_read(swallow_nmi)) + ; + else + unknown_nmi_error(reason, regs); } dotraplinkage notrace __kprobes void @@ -334,3 +407,9 @@ void restart_nmi(void) { ignore_nmis--; } + +/* reset the back-to-back NMI logic */ +void local_touch_nmi(void) +{ + __this_cpu_write(last_nmi_rip, 0); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 7a3b651..46ff054 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -57,6 +57,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -107,6 +108,7 @@ void cpu_idle(void) if (cpu_is_offline(cpu)) play_dead(); + local_touch_nmi(); local_irq_disable(); /* Don't trace irqs off for idle */ stop_critical_timings(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f693e44..3bd7e6e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -51,6 +51,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -133,6 +134,7 @@ void cpu_idle(void) * from here on, until they go to idle. * Otherwise, idle callbacks can misfire. */ + local_touch_nmi(); local_irq_disable(); enter_idle(); /* Don't trace irqs off for idle */ -- cgit v1.1 From efc3aac5f3d7dbd47fd0a4983979dd4342a78fba Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 30 Sep 2011 15:06:23 -0400 Subject: x86, nmi: Track NMI usage stats Now that the NMI handler are broken into lists, increment the appropriate stats for each list. This allows us to see what is going on when they get printed out in the next patch. Signed-off-by: Don Zickus Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1317409584-23662-6-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 35b3959..d0eaa31 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -53,6 +53,15 @@ static struct nmi_desc nmi_desc[NMI_MAX] = }; +struct nmi_stats { + unsigned int normal; + unsigned int unknown; + unsigned int external; + unsigned int swallow; +}; + +static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); + static int ignore_nmis; int unknown_nmi_panic; @@ -262,8 +271,13 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) * if it caused the NMI) */ handled = nmi_handle(NMI_UNKNOWN, regs, false); - if (handled) + if (handled) { + __this_cpu_add(nmi_stats.unknown, handled); return; + } + + __this_cpu_add(nmi_stats.unknown, 1); + #ifdef CONFIG_MCA /* * Might actually be able to figure out what the guilty party @@ -314,6 +328,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); handled = nmi_handle(NMI_LOCAL, regs, b2b); + __this_cpu_add(nmi_stats.normal, handled); if (handled) { /* * There are cases when a NMI handler handles multiple @@ -344,6 +359,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) */ reassert_nmi(); #endif + __this_cpu_add(nmi_stats.external, 1); raw_spin_unlock(&nmi_reason_lock); return; } @@ -380,7 +396,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) * for now. */ if (b2b && __this_cpu_read(swallow_nmi)) - ; + __this_cpu_add(nmi_stats.swallow, 1); else unknown_nmi_error(reason, regs); } -- cgit v1.1 From ee5789dbcc800ba7d641443e53f60d53977f9747 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Sep 2011 11:30:17 +0200 Subject: perf, x86: Share IBS macros between perf and oprofile Moving IBS macros from oprofile to to make it available to perf. No additional changes. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1316597423-25723-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 38 +++++++++++++++++++++++++++++++++--- arch/x86/kernel/cpu/perf_event_amd.c | 4 ++-- arch/x86/oprofile/op_model_amd.c | 37 +++-------------------------------- 3 files changed, 40 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index e47cb61..e9e277c 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -46,14 +46,17 @@ #define AMD64_RAW_EVENT_MASK \ (X86_RAW_EVENT_MASK | \ AMD64_EVENTSEL_EVENT) +#define AMD64_NUM_COUNTERS 4 +#define AMD64_NUM_COUNTERS_F15H 6 +#define AMD64_NUM_COUNTERS_MAX AMD64_NUM_COUNTERS_F15H -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) -#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 /* * Intel "Architectural Performance Monitoring" CPUID @@ -113,6 +116,35 @@ union cpuid10_edx { */ #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) +/* + * IBS cpuid feature detection + */ + +#define IBS_CPUID_FEATURES 0x8000001b + +/* + * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but + * bit 0 is used to indicate the existence of IBS. + */ +#define IBS_CAPS_AVAIL (1U<<0) +#define IBS_CAPS_FETCHSAM (1U<<1) +#define IBS_CAPS_OPSAM (1U<<2) +#define IBS_CAPS_RDWROPCNT (1U<<3) +#define IBS_CAPS_OPCNT (1U<<4) +#define IBS_CAPS_BRNTRGT (1U<<5) +#define IBS_CAPS_OPCNTEXT (1U<<6) + +#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ + | IBS_CAPS_FETCHSAM \ + | IBS_CAPS_OPSAM) + +/* + * IBS APIC setup + */ +#define IBSCTL 0x1cc +#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) +#define IBSCTL_LVT_OFFSET_MASK 0x0F + /* IbsFetchCtl bits/masks */ #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index db8e603..aeefd45 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -411,7 +411,7 @@ static __initconst const struct x86_pmu amd_pmu = { .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 4, + .num_counters = AMD64_NUM_COUNTERS, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -575,7 +575,7 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .perfctr = MSR_F15H_PERF_CTR, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 6, + .num_counters = AMD64_NUM_COUNTERS_F15H, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 9cbb710..e947e5c 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -29,8 +29,6 @@ #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 4 -#define NUM_COUNTERS_F15H 6 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX #define NUM_VIRT_COUNTERS 32 #else @@ -70,35 +68,6 @@ static struct ibs_config ibs_config; static struct ibs_state ibs_state; /* - * IBS cpuid feature detection - */ - -#define IBS_CPUID_FEATURES 0x8000001b - -/* - * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but - * bit 0 is used to indicate the existence of IBS. - */ -#define IBS_CAPS_AVAIL (1U<<0) -#define IBS_CAPS_FETCHSAM (1U<<1) -#define IBS_CAPS_OPSAM (1U<<2) -#define IBS_CAPS_RDWROPCNT (1U<<3) -#define IBS_CAPS_OPCNT (1U<<4) -#define IBS_CAPS_BRNTRGT (1U<<5) -#define IBS_CAPS_OPCNTEXT (1U<<6) - -#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ - | IBS_CAPS_FETCHSAM \ - | IBS_CAPS_OPSAM) - -/* - * IBS APIC setup - */ -#define IBSCTL 0x1cc -#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) -#define IBSCTL_LVT_OFFSET_MASK 0x0F - -/* * IBS randomization macros */ #define IBS_RANDOM_BITS 12 @@ -439,7 +408,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) goto fail; } /* both registers must be reserved */ - if (num_counters == NUM_COUNTERS_F15H) { + if (num_counters == AMD64_NUM_COUNTERS_F15H) { msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); } else { @@ -741,9 +710,9 @@ static int op_amd_init(struct oprofile_operations *ops) ops->create_files = setup_ibs_files; if (boot_cpu_data.x86 == 0x15) { - num_counters = NUM_COUNTERS_F15H; + num_counters = AMD64_NUM_COUNTERS_F15H; } else { - num_counters = NUM_COUNTERS; + num_counters = AMD64_NUM_COUNTERS; } op_amd_spec.num_counters = num_counters; -- cgit v1.1 From b716916679e72054d436afadce2f94dcad71cfad Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Sep 2011 11:30:18 +0200 Subject: perf, x86: Implement IBS initialization This patch implements IBS feature detection and initialzation. The code is shared between perf and oprofile. If IBS is available on the system for perf, a pmu is setup. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1316597423-25723-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 2 + arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 294 +++++++++++++++++++++++++++++++ arch/x86/oprofile/nmi_int.c | 2 - arch/x86/oprofile/op_model_amd.c | 197 --------------------- arch/x86/oprofile/op_x86_model.h | 1 - 6 files changed, 297 insertions(+), 201 deletions(-) create mode 100644 arch/x86/kernel/cpu/perf_event_amd_ibs.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index e9e277c..f61c62f 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -159,6 +159,8 @@ union cpuid10_edx { #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ +extern u32 get_ibs_caps(void); + #ifdef CONFIG_PERF_EVENTS extern void perf_events_lapic_init(void); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1044fd7..fe6eb19 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -36,7 +36,7 @@ endif obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c new file mode 100644 index 0000000..ab6343d --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -0,0 +1,294 @@ +/* + * Performance events - AMD IBS + * + * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include + +#include + +static u32 ibs_caps; + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + +static struct pmu perf_ibs; + +static int perf_ibs_init(struct perf_event *event) +{ + if (perf_ibs.type != event->attr.type) + return -ENOENT; + return 0; +} + +static int perf_ibs_add(struct perf_event *event, int flags) +{ + return 0; +} + +static void perf_ibs_del(struct perf_event *event, int flags) +{ +} + +static struct pmu perf_ibs = { + .event_init= perf_ibs_init, + .add= perf_ibs_add, + .del= perf_ibs_del, +}; + +static __init int perf_event_ibs_init(void) +{ + if (!ibs_caps) + return -ENODEV; /* ibs not supported by the cpu */ + + perf_pmu_register(&perf_ibs, "ibs", -1); + printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); + + return 0; +} + +#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ + +static __init int perf_event_ibs_init(void) { return 0; } + +#endif + +/* IBS - apic initialization, for perf and oprofile */ + +static __init u32 __get_ibs_caps(void) +{ + u32 caps; + unsigned int max_level; + + if (!boot_cpu_has(X86_FEATURE_IBS)) + return 0; + + /* check IBS cpuid feature flags */ + max_level = cpuid_eax(0x80000000); + if (max_level < IBS_CPUID_FEATURES) + return IBS_CAPS_DEFAULT; + + caps = cpuid_eax(IBS_CPUID_FEATURES); + if (!(caps & IBS_CAPS_AVAIL)) + /* cpuid flags not valid */ + return IBS_CAPS_DEFAULT; + + return caps; +} + +u32 get_ibs_caps(void) +{ + return ibs_caps; +} + +EXPORT_SYMBOL(get_ibs_caps); + +static inline int get_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); +} + +static inline int put_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, 0, 1); +} + +/* + * Check and reserve APIC extended interrupt LVT offset for IBS if available. + */ +static inline int ibs_eilvt_valid(void) +{ + int offset; + u64 val; + int valid = 0; + + preempt_disable(); + + rdmsrl(MSR_AMD64_IBSCTL, val); + offset = val & IBSCTL_LVT_OFFSET_MASK; + + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { + pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + if (!get_eilvt(offset)) { + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + valid = 1; +out: + preempt_enable(); + + return valid; +} + +static int setup_ibs_ctl(int ibs_eilvt_off) +{ + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVT_OFFSET_VALID); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { + pci_dev_put(cpu_cfg); + printk(KERN_DEBUG "Failed to setup IBS LVT offset, " + "IBSCTL = 0x%08x\n", value); + return -EINVAL; + } + } while (1); + + if (!nodes) { + printk(KERN_DEBUG "No CPU node configured for IBS\n"); + return -ENODEV; + } + + return 0; +} + +/* + * This runs only on the current cpu. We try to find an LVT offset and + * setup the local APIC. For this we must disable preemption. On + * success we initialize all nodes with this offset. This updates then + * the offset in the IBS_CTL per-node msr. The per-core APIC setup of + * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that + * is using the new offset. + */ +static int force_ibs_eilvt_setup(void) +{ + int offset; + int ret; + + preempt_disable(); + /* find the next free available EILVT entry, skip offset 0 */ + for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { + if (get_eilvt(offset)) + break; + } + preempt_enable(); + + if (offset == APIC_EILVT_NR_MAX) { + printk(KERN_DEBUG "No EILVT entry available\n"); + return -EBUSY; + } + + ret = setup_ibs_ctl(offset); + if (ret) + goto out; + + if (!ibs_eilvt_valid()) { + ret = -EFAULT; + goto out; + } + + pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset); + pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); + + return 0; +out: + preempt_disable(); + put_eilvt(offset); + preempt_enable(); + return ret; +} + +static inline int get_ibs_lvt_offset(void) +{ + u64 val; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) + return -EINVAL; + + return val & IBSCTL_LVT_OFFSET_MASK; +} + +static void setup_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset < 0) + goto failed; + + if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) + return; +failed: + pr_warn("perf: IBS APIC setup failed on cpu #%d\n", + smp_processor_id()); +} + +static void clear_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset >= 0) + setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); +} + +static int __cpuinit +perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + setup_APIC_ibs(NULL); + break; + case CPU_DYING: + clear_APIC_ibs(NULL); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static __init int amd_ibs_init(void) +{ + u32 caps; + int ret; + + caps = __get_ibs_caps(); + if (!caps) + return -ENODEV; /* ibs not supported by the cpu */ + + if (!ibs_eilvt_valid()) { + ret = force_ibs_eilvt_setup(); + if (ret) { + pr_err("Failed to setup IBS, %d\n", ret); + return ret; + } + } + + get_online_cpus(); + ibs_caps = caps; + /* make ibs_caps visible to other cpus: */ + smp_mb(); + perf_cpu_notifier(perf_ibs_cpu_notifier); + smp_call_function(setup_APIC_ibs, NULL, 1); + put_online_cpus(); + + return perf_event_ibs_init(); +} + +/* Since we need the pci subsystem to init ibs we can't do this earlier: */ +device_initcall(amd_ibs_init); diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index adf8fb3..c04dc14 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -385,8 +385,6 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); nmi_cpu_restore_registers(msrs); - if (model->cpu_down) - model->cpu_down(); } static void nmi_cpu_up(void *dummy) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index e947e5c..303f086 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -74,27 +74,6 @@ static struct ibs_state ibs_state; #define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1) #define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5)) -static u32 get_ibs_caps(void) -{ - u32 ibs_caps; - unsigned int max_level; - - if (!boot_cpu_has(X86_FEATURE_IBS)) - return 0; - - /* check IBS cpuid feature flags */ - max_level = cpuid_eax(0x80000000); - if (max_level < IBS_CPUID_FEATURES) - return IBS_CAPS_DEFAULT; - - ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); - if (!(ibs_caps & IBS_CAPS_AVAIL)) - /* cpuid flags not valid */ - return IBS_CAPS_DEFAULT; - - return ibs_caps; -} - /* * 16-bit Linear Feedback Shift Register (LFSR) * @@ -285,81 +264,6 @@ static void op_amd_stop_ibs(void) wrmsrl(MSR_AMD64_IBSOPCTL, 0); } -static inline int get_eilvt(int offset) -{ - return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); -} - -static inline int put_eilvt(int offset) -{ - return !setup_APIC_eilvt(offset, 0, 0, 1); -} - -static inline int ibs_eilvt_valid(void) -{ - int offset; - u64 val; - int valid = 0; - - preempt_disable(); - - rdmsrl(MSR_AMD64_IBSCTL, val); - offset = val & IBSCTL_LVT_OFFSET_MASK; - - if (!(val & IBSCTL_LVT_OFFSET_VALID)) { - pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", - smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - goto out; - } - - if (!get_eilvt(offset)) { - pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", - smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - goto out; - } - - valid = 1; -out: - preempt_enable(); - - return valid; -} - -static inline int get_ibs_offset(void) -{ - u64 val; - - rdmsrl(MSR_AMD64_IBSCTL, val); - if (!(val & IBSCTL_LVT_OFFSET_VALID)) - return -EINVAL; - - return val & IBSCTL_LVT_OFFSET_MASK; -} - -static void setup_APIC_ibs(void) -{ - int offset; - - offset = get_ibs_offset(); - if (offset < 0) - goto failed; - - if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) - return; -failed: - pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n", - smp_processor_id()); -} - -static void clear_APIC_ibs(void) -{ - int offset; - - offset = get_ibs_offset(); - if (offset >= 0) - setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); -} - #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, @@ -473,15 +377,6 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, val |= op_x86_get_ctrl(model, &counter_config[virt]); wrmsrl(msrs->controls[i].addr, val); } - - if (ibs_caps) - setup_APIC_ibs(); -} - -static void op_amd_cpu_shutdown(void) -{ - if (ibs_caps) - clear_APIC_ibs(); } static int op_amd_check_ctrs(struct pt_regs * const regs, @@ -544,86 +439,6 @@ static void op_amd_stop(struct op_msrs const * const msrs) op_amd_stop_ibs(); } -static int setup_ibs_ctl(int ibs_eilvt_off) -{ - struct pci_dev *cpu_cfg; - int nodes; - u32 value = 0; - - nodes = 0; - cpu_cfg = NULL; - do { - cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_AMD_10H_NB_MISC, - cpu_cfg); - if (!cpu_cfg) - break; - ++nodes; - pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off - | IBSCTL_LVT_OFFSET_VALID); - pci_read_config_dword(cpu_cfg, IBSCTL, &value); - if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { - pci_dev_put(cpu_cfg); - printk(KERN_DEBUG "Failed to setup IBS LVT offset, " - "IBSCTL = 0x%08x\n", value); - return -EINVAL; - } - } while (1); - - if (!nodes) { - printk(KERN_DEBUG "No CPU node configured for IBS\n"); - return -ENODEV; - } - - return 0; -} - -/* - * This runs only on the current cpu. We try to find an LVT offset and - * setup the local APIC. For this we must disable preemption. On - * success we initialize all nodes with this offset. This updates then - * the offset in the IBS_CTL per-node msr. The per-core APIC setup of - * the IBS interrupt vector is called from op_amd_setup_ctrs()/op_- - * amd_cpu_shutdown() using the new offset. - */ -static int force_ibs_eilvt_setup(void) -{ - int offset; - int ret; - - preempt_disable(); - /* find the next free available EILVT entry, skip offset 0 */ - for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { - if (get_eilvt(offset)) - break; - } - preempt_enable(); - - if (offset == APIC_EILVT_NR_MAX) { - printk(KERN_DEBUG "No EILVT entry available\n"); - return -EBUSY; - } - - ret = setup_ibs_ctl(offset); - if (ret) - goto out; - - if (!ibs_eilvt_valid()) { - ret = -EFAULT; - goto out; - } - - pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset); - pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); - - return 0; -out: - preempt_disable(); - put_eilvt(offset); - preempt_enable(); - return ret; -} - /* * check and reserve APIC extended interrupt LVT offset for IBS if * available @@ -636,17 +451,6 @@ static void init_ibs(void) if (!ibs_caps) return; - if (ibs_eilvt_valid()) - goto out; - - if (!force_ibs_eilvt_setup()) - goto out; - - /* Failed to setup ibs */ - ibs_caps = 0; - return; - -out: printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); } @@ -729,7 +533,6 @@ struct op_x86_model_spec op_amd_spec = { .init = op_amd_init, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, - .cpu_down = &op_amd_cpu_shutdown, .check_ctrs = &op_amd_check_ctrs, .start = &op_amd_start, .stop = &op_amd_stop, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 89017fa..71e8a67 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -43,7 +43,6 @@ struct op_x86_model_spec { int (*fill_in_addresses)(struct op_msrs * const msrs); void (*setup_ctrs)(struct op_x86_model_spec const *model, struct op_msrs const * const msrs); - void (*cpu_down)(void); int (*check_ctrs)(struct pt_regs * const regs, struct op_msrs const * const msrs); void (*start)(struct op_msrs const * const msrs); -- cgit v1.1 From d48b0e173715f678698d3678fefd40f2893ce798 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 6 Oct 2011 14:20:27 +0200 Subject: x86, nmi, drivers: Fix nmi splitup build bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nmi.c needs an #include : arch/x86/kernel/nmi.c: In function ‘unknown_nmi_error’: arch/x86/kernel/nmi.c:286:6: error: ‘MCA_bus’ undeclared (first use in this function) arch/x86/kernel/nmi.c:286:6: note: each undeclared identifier is reported only once for each function it appears in Another one is the hpwdt driver: drivers/watchdog/hpwdt.c:507:9: error: ‘NMI_DONE’ undeclared (first use in this function) Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index d0eaa31..7ec5bd1 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -18,6 +18,8 @@ #include #include +#include + #if defined(CONFIG_EDAC) #include #endif -- cgit v1.1 From 53a019a951fae849471e4a620948c5f6886bd1a4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 7 Oct 2011 22:31:55 +0900 Subject: x86: Fix insn decoder for longer instruction Fix x86 insn decoder for hardening against invalid length instructions. This adds length checkings for each byte-read site and if it exceeds MAX_INSN_SIZE, returns immediately. This can happen when decoding user-space binary. Caller can check whether it happened by checking insn.*.got member is set or not. Signed-off-by: Masami Hiramatsu Cc: Stephane Eranian Cc: Andi Kleen Cc: acme@redhat.com Cc: ming.m.lin@intel.com Cc: robert.richter@amd.com Cc: ravitillo@lbl.gov Cc: yrl.pp-manager.tt@hitachi.com Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20111007133155.10933.58577.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/lib/insn.c | 48 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 9f33b98..374562e 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -22,14 +22,23 @@ #include #include -#define get_next(t, insn) \ - ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) +/* Verify next sizeof(t) bytes can be on the same instruction */ +#define validate_next(t, insn, n) \ + ((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE) + +#define __get_next(t, insn) \ + ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) + +#define __peek_nbyte_next(t, insn, n) \ + ({ t r = *(t*)((insn)->next_byte + n); r; }) -#define peek_next(t, insn) \ - ({t r; r = *(t*)insn->next_byte; r; }) +#define get_next(t, insn) \ + ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) #define peek_nbyte_next(t, insn, n) \ - ({t r; r = *(t*)((insn)->next_byte + n); r; }) + ({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); }) + +#define peek_next(t, insn) peek_nbyte_next(t, insn, 0) /** * insn_init() - initialize struct insn @@ -158,6 +167,8 @@ vex_end: insn->vex_prefix.got = 1; prefixes->got = 1; + +err_out: return; } @@ -208,6 +219,9 @@ void insn_get_opcode(struct insn *insn) insn->attr = 0; /* This instruction is bad */ end: opcode->got = 1; + +err_out: + return; } /** @@ -241,6 +255,9 @@ void insn_get_modrm(struct insn *insn) if (insn->x86_64 && inat_is_force64(insn->attr)) insn->opnd_bytes = 8; modrm->got = 1; + +err_out: + return; } @@ -290,6 +307,9 @@ void insn_get_sib(struct insn *insn) } } insn->sib.got = 1; + +err_out: + return; } @@ -351,6 +371,9 @@ void insn_get_displacement(struct insn *insn) } out: insn->displacement.got = 1; + +err_out: + return; } /* Decode moffset16/32/64 */ @@ -373,6 +396,9 @@ static void __get_moffset(struct insn *insn) break; } insn->moffset1.got = insn->moffset2.got = 1; + +err_out: + return; } /* Decode imm v32(Iz) */ @@ -389,6 +415,9 @@ static void __get_immv32(struct insn *insn) insn->immediate.nbytes = 4; break; } + +err_out: + return; } /* Decode imm v64(Iv/Ov) */ @@ -411,6 +440,9 @@ static void __get_immv(struct insn *insn) break; } insn->immediate1.got = insn->immediate2.got = 1; + +err_out: + return; } /* Decode ptr16:16/32(Ap) */ @@ -432,6 +464,9 @@ static void __get_immptr(struct insn *insn) insn->immediate2.value = get_next(unsigned short, insn); insn->immediate2.nbytes = 2; insn->immediate1.got = insn->immediate2.got = 1; + +err_out: + return; } /** @@ -496,6 +531,9 @@ void insn_get_immediate(struct insn *insn) } done: insn->immediate.got = 1; + +err_out: + return; } /** -- cgit v1.1 From 2b666859ec323403ac9a3a441d16eab30945404b Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 6 Oct 2011 00:40:47 +0300 Subject: x86: Default to vsyscall=native for now This UML breakage: linux-2.6.30.1[3800] vsyscall fault (exploit attempt?) ip:ffffffffff600000 cs:33 sp:7fbfb9c498 ax:ffffffffff600000 si:0 di:606790 linux-2.6.30.1[3856] vsyscall fault (exploit attempt?) ip:ffffffffff600000 cs:33 sp:7fbfb13168 ax:ffffffffff600000 si:0 di:606790 Is caused by commit 3ae36655 ("x86-64: Rework vsyscall emulation and add vsyscall= parameter") - the vsyscall emulation code is not fully cooked yet as UML relies on some rather fragile SIGSEGV semantics. Linus suggested in https://lkml.org/lkml/2011/8/9/376 to default to vsyscall=native for now, this patch implements that. Signed-off-by: Adrian Bunk Acked-by: Andrew Lutomirski Cc: H. Peter Anvin Link: http://lkml.kernel.org/r/20111005214047.GE14406@localhost.pp.htv.fi Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 18ae83d..b56c65de 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -56,7 +56,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), }; -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE; static int __init vsyscall_setup(char *str) { -- cgit v1.1 From e4aff81182c01f81d0bedb0685abccf3cba4a2ac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 12 Oct 2011 00:33:05 -0700 Subject: x86, ioapic: Pass struct irq_attr * to setup_ioapic_irq() Do not expand that struct, and just pass pointer to reduce the number of parameters in related functions. Signed-off-by: Yinghai Lu Cc: Naga Chumbalkar Cc: Suresh Siddha Link: http://lkml.kernel.org/r/4E9542B1.7050800@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8eb863e..a7052fa 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1324,8 +1324,8 @@ static int setup_ioapic_entry(int apic_id, int irq, return 0; } -static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, - struct irq_cfg *cfg, int trigger, int polarity) +static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, + struct io_apic_irq_attr *attr) { struct IO_APIC_route_entry entry; unsigned int dest; @@ -1348,23 +1348,24 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " "IRQ %d Mode:%i Active:%i Dest:%d)\n", - apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector, - irq, trigger, polarity, dest); + attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, + cfg->vector, irq, attr->trigger, attr->polarity, dest); - if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry, - dest, trigger, polarity, cfg->vector, pin)) { + if (setup_ioapic_entry(mpc_ioapic_id(attr->ioapic), irq, &entry, + dest, attr->trigger, attr->polarity, cfg->vector, + attr->ioapic_pin)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mpc_ioapic_id(apic_id), pin); + mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); __clear_irq_vector(irq, cfg); return; } - ioapic_register_intr(irq, cfg, trigger); + ioapic_register_intr(irq, cfg, attr->trigger); if (irq < legacy_pic->nr_legacy_irqs) legacy_pic->mask(irq); - ioapic_write_entry(apic_id, pin, entry); + ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); } static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) @@ -3566,8 +3567,7 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) return -EINVAL; ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); if (!ret) - setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg, - attr->trigger, attr->polarity); + setup_ioapic_irq(irq, cfg, attr); return ret; } -- cgit v1.1 From c5b4712c3fabef4d1f48d24563fc562053baa002 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 12 Oct 2011 00:33:15 -0700 Subject: x86, ioapic: Split up setup_ioapic_entry() Ingo pointed out that setup_ioapic_entry() is way too big now. Split the intr-remap code out into setup_ir_ioapic_entry(). Also pass struct io_apic_irq_attr * instead of 5 parameters in those two functions. At last in setup_ir_ioapic_entry() we don't need to panic. Signed-off-by: Yinghai Lu Cc: Naga Chumbalkar Cc: Suresh Siddha Link: http://lkml.kernel.org/r/4E9542BB.4070807@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 139 ++++++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a7052fa..2fbb9d6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1255,72 +1255,95 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, fasteoi ? "fasteoi" : "edge"); } -static int setup_ioapic_entry(int apic_id, int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int trigger, - int polarity, int vector, int pin) + +static int setup_ir_ioapic_entry(int irq, + struct IR_IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) { - /* - * add it to the IO-APIC irq-routing table: - */ - memset(entry,0,sizeof(*entry)); + int index; + struct irte irte; + int apic_id = mpc_ioapic_id(attr->ioapic); + struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); - if (intr_remapping_enabled) { - struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); - struct irte irte; - struct IR_IO_APIC_route_entry *ir_entry = - (struct IR_IO_APIC_route_entry *) entry; - int index; + if (!iommu) { + pr_warn("No mapping iommu for ioapic %d\n", apic_id); + return -ENODEV; + } - if (!iommu) - panic("No mapping iommu for ioapic %d\n", apic_id); + index = alloc_irte(iommu, irq, 1); + if (index < 0) { + pr_warn("Failed to allocate IRTE for ioapic %d\n", apic_id); + return -ENOMEM; + } - index = alloc_irte(iommu, irq, 1); - if (index < 0) - panic("Failed to allocate IRTE for ioapic %d\n", apic_id); + prepare_irte(&irte, vector, destination); - prepare_irte(&irte, vector, destination); + /* Set source-id of interrupt request */ + set_ioapic_sid(&irte, apic_id); - /* Set source-id of interrupt request */ - set_ioapic_sid(&irte, apic_id); + modify_irte(irq, &irte); - modify_irte(irq, &irte); + apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " + "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " + "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " + "Avail:%X Vector:%02X Dest:%08X " + "SID:%04X SQ:%X SVT:%X)\n", + apic_id, irte.present, irte.fpd, irte.dst_mode, + irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, + irte.avail, irte.vector, irte.dest_id, + irte.sid, irte.sq, irte.svt); + + memset(entry, 0, sizeof(*entry)); + + entry->index2 = (index >> 15) & 0x1; + entry->zero = 0; + entry->format = 1; + entry->index = (index & 0x7fff); + /* + * IO-APIC RTE will be configured with virtual vector. + * irq handler will do the explicit EOI to the io-apic. + */ + entry->vector = attr->ioapic_pin; + entry->mask = 0; /* enable IRQ */ + entry->trigger = attr->trigger; + entry->polarity = attr->polarity; - ir_entry->index2 = (index >> 15) & 0x1; - ir_entry->zero = 0; - ir_entry->format = 1; - ir_entry->index = (index & 0x7fff); - /* - * IO-APIC RTE will be configured with virtual vector. - * irq handler will do the explicit EOI to the io-apic. - */ - ir_entry->vector = pin; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " - "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " - "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " - "Avail:%X Vector:%02X Dest:%08X " - "SID:%04X SQ:%X SVT:%X)\n", - apic_id, irte.present, irte.fpd, irte.dst_mode, - irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, - irte.avail, irte.vector, irte.dest_id, - irte.sid, irte.sq, irte.svt); - } else { - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->dest = destination; - entry->vector = vector; - } + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (attr->trigger) + entry->mask = 1; - entry->mask = 0; /* enable IRQ */ - entry->trigger = trigger; - entry->polarity = polarity; + return 0; +} - /* Mask level triggered irqs. +static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ + if (intr_remapping_enabled) + return setup_ir_ioapic_entry(irq, + (struct IR_IO_APIC_route_entry *)entry, + destination, vector, attr); + + memset(entry, 0, sizeof(*entry)); + + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->dest = destination; + entry->vector = vector; + entry->mask = 0; /* enable IRQ */ + entry->trigger = attr->trigger; + entry->polarity = attr->polarity; + + /* + * Mask level triggered irqs. * Use IRQ_DELAYED_DISABLE for edge triggered irqs. */ - if (trigger) + if (attr->trigger) entry->mask = 1; + return 0; } @@ -1351,13 +1374,11 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, cfg->vector, irq, attr->trigger, attr->polarity, dest); - - if (setup_ioapic_entry(mpc_ioapic_id(attr->ioapic), irq, &entry, - dest, attr->trigger, attr->polarity, cfg->vector, - attr->ioapic_pin)) { - printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); + if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { + pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); __clear_irq_vector(irq, cfg); + return; } -- cgit v1.1 From 3a61d7feca055152c2c6870d1f8dfbc3443bd1b3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 12 Oct 2011 00:33:28 -0700 Subject: x86, ioapic: Print out irte with right ioapic index While checking irte dump in dmesg, the print out is confusing ioapic index with real io apic id: IOAPIC[0]: Set routing entry (1-1 -> 0x31 -> IRQ 1 Mode:0 Active:0 Dest:1) IOAPIC[1]: Set IRTE entry (P:1 FPD:0 Dst_Mode:1 Redir_hint:1 Trig_Mode:0 Dlvry_Mode:1 Avail:0 Vector:31 Dest:00000001 SID:00FF SQ:0 SVT:1) IOAPIC[0]: Set routing entry (1-2 -> 0x30 -> IRQ 0 Mode:0 Active:0 Dest:1) IOAPIC[1]: Set IRTE entry (P:1 FPD:0 Dst_Mode:1 Redir_hint:1 Trig_Mode:0 Dlvry_Mode:1 Avail:0 Vector:30 Dest:00000001 SID:00FF SQ:0 SVT:1) The system's first ioapic id is 1. This commit: | commit 3040db92ee1b6c5b6b6d73f8cdcad54c0da11563 | Author: Naga Chumbalkar | Date: Tue Jul 12 21:17:41 2011 +0000 | | x86, ioapic: Print IRTE when IR is enabled Confused apic_id with the ioapic ID - fix it. Signed-off-by: Yinghai Lu Cc: Naga Chumbalkar Cc: Suresh Siddha Link: http://lkml.kernel.org/r/4E9542C8.8040209@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2fbb9d6..a781506 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1289,7 +1289,7 @@ static int setup_ir_ioapic_entry(int irq, "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " "Avail:%X Vector:%02X Dest:%08X " "SID:%04X SQ:%X SVT:%X)\n", - apic_id, irte.present, irte.fpd, irte.dst_mode, + attr->ioapic, irte.present, irte.fpd, irte.dst_mode, irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, irte.avail, irte.vector, irte.dest_id, irte.sid, irte.sq, irte.svt); -- cgit v1.1 From cda417dd87ed46f58cbc7d3f075ebe42b4462140 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 12 Oct 2011 00:33:39 -0700 Subject: x86, ioapic: Factor out print_IO_APIC() to only print one io apic It is getting too big after the interrupt remaping entries debug print out was added. Original print_IO_APIC() becomes print_IO_APICs(). New print_IO_APIC() will only print one ioapic's registers As a side-effect this clean-up also made checkpatch.pl happier. Signed-off-by: Yinghai Lu Cc: Naga Chumbalkar Cc: Suresh Siddha Link: http://lkml.kernel.org/r/4E9542D3.5000008@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 46 ++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a781506..94a4bcf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1512,30 +1512,14 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, ioapic_write_entry(apic_id, pin, entry); } - -__apicdebuginit(void) print_IO_APIC(void) +__apicdebuginit(void) print_IO_APIC(int apic) { - int apic, i; + int i; union IO_APIC_reg_00 reg_00; union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; union IO_APIC_reg_03 reg_03; unsigned long flags; - struct irq_cfg *cfg; - unsigned int irq; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mpc_ioapic_id(i), ioapics[i].nr_registers); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); @@ -1636,7 +1620,27 @@ __apicdebuginit(void) print_IO_APIC(void) ); } } - } +} + +__apicdebuginit(void) print_IO_APICs(void) +{ + int apic, i; + struct irq_cfg *cfg; + unsigned int irq; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mpc_ioapic_id(i), ioapics[i].nr_registers); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) + print_IO_APIC(apic); printk(KERN_DEBUG "IRQ to pin mappings:\n"); for_each_active_irq(irq) { @@ -1655,8 +1659,6 @@ __apicdebuginit(void) print_IO_APIC(void) } printk(KERN_INFO ".................................... done.\n"); - - return; } __apicdebuginit(void) print_APIC_field(int base) @@ -1850,7 +1852,7 @@ __apicdebuginit(int) print_ICs(void) return 0; print_local_APICs(show_lapic); - print_IO_APIC(); + print_IO_APICs(); return 0; } -- cgit v1.1 From 6f50d45fae8189365805d26e1fdf26a75301bcae Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 12 Oct 2011 00:33:48 -0700 Subject: x86, ioapic: Clean up ioapic/apic_id usage While looking at the code, apic_id sometime is referred to index of ioapic, but sometime is used for phys apic id. and some even use apic for real apic id. It is very confusing. So try to limit apic_id or ioapic_id to be real apic id for ioapic, and use ioapic_idx for ioapic index in the array. -v2: Suggested by Ingo, use ioapic_idx consistently, instead of ioapic Signed-off-by: Yinghai Lu Cc: Naga Chumbalkar Cc: Suresh Siddha Link: http://lkml.kernel.org/r/4E9542DC.3090509@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 194 ++++++++++++++++++++--------------------- 1 file changed, 97 insertions(+), 97 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 94a4bcf..08f0d16 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -92,21 +92,21 @@ static struct ioapic { DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); } ioapics[MAX_IO_APICS]; -#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver +#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver -int mpc_ioapic_id(int id) +int mpc_ioapic_id(int ioapic_idx) { - return ioapics[id].mp_config.apicid; + return ioapics[ioapic_idx].mp_config.apicid; } -unsigned int mpc_ioapic_addr(int id) +unsigned int mpc_ioapic_addr(int ioapic_idx) { - return ioapics[id].mp_config.apicaddr; + return ioapics[ioapic_idx].mp_config.apicaddr; } -struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id) +struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) { - return &ioapics[id].gsi_config; + return &ioapics[ioapic_idx].gsi_config; } int nr_ioapics; @@ -712,13 +712,13 @@ int restore_ioapic_entries(void) /* * Find the IRQ entry number of a certain pin. */ -static int find_irq_entry(int apic, int pin, int type) +static int find_irq_entry(int ioapic_idx, int pin, int type) { int i; for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].irqtype == type && - (mp_irqs[i].dstapic == mpc_ioapic_id(apic) || + (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) || mp_irqs[i].dstapic == MP_APIC_ALL) && mp_irqs[i].dstirq == pin) return i; @@ -757,12 +757,13 @@ static int __init find_isa_irq_apic(int irq, int type) (mp_irqs[i].srcbusirq == irq)) break; } + if (i < mp_irq_entries) { - int apic; - for(apic = 0; apic < nr_ioapics; apic++) { - if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic) - return apic; - } + int ioapic_idx; + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) + return ioapic_idx; } return -1; @@ -977,7 +978,7 @@ static int pin_2_irq(int idx, int apic, int pin) int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, struct io_apic_irq_attr *irq_attr) { - int apic, i, best_guess = -1; + int ioapic_idx, i, best_guess = -1; apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", @@ -990,8 +991,8 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].srcbus; - for (apic = 0; apic < nr_ioapics; apic++) - if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic || + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic || mp_irqs[i].dstapic == MP_APIC_ALL) break; @@ -999,13 +1000,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, !mp_irqs[i].irqtype && (bus == lbus) && (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); + int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq); - if (!(apic || IO_APIC_IRQ(irq))) + if (!(ioapic_idx || IO_APIC_IRQ(irq))) continue; if (pin == (mp_irqs[i].srcbusirq & 3)) { - set_io_apic_irq_attr(irq_attr, apic, + set_io_apic_irq_attr(irq_attr, ioapic_idx, mp_irqs[i].dstirq, irq_trigger(i), irq_polarity(i)); @@ -1016,7 +1017,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, * best-guess fuzzy result for broken mptables. */ if (best_guess < 0) { - set_io_apic_irq_attr(irq_attr, apic, + set_io_apic_irq_attr(irq_attr, ioapic_idx, mp_irqs[i].dstirq, irq_trigger(i), irq_polarity(i)); @@ -1263,24 +1264,24 @@ static int setup_ir_ioapic_entry(int irq, { int index; struct irte irte; - int apic_id = mpc_ioapic_id(attr->ioapic); - struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); + int ioapic_id = mpc_ioapic_id(attr->ioapic); + struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id); if (!iommu) { - pr_warn("No mapping iommu for ioapic %d\n", apic_id); + pr_warn("No mapping iommu for ioapic %d\n", ioapic_id); return -ENODEV; } index = alloc_irte(iommu, irq, 1); if (index < 0) { - pr_warn("Failed to allocate IRTE for ioapic %d\n", apic_id); + pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id); return -ENOMEM; } prepare_irte(&irte, vector, destination); /* Set source-id of interrupt request */ - set_ioapic_sid(&irte, apic_id); + set_ioapic_sid(&irte, ioapic_id); modify_irte(irq, &irte); @@ -1389,30 +1390,30 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); } -static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) +static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin) { if (idx != -1) return false; apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", - mpc_ioapic_id(apic_id), pin); + mpc_ioapic_id(ioapic_idx), pin); return true; } -static void __init __io_apic_setup_irqs(unsigned int apic_id) +static void __init __io_apic_setup_irqs(unsigned int ioapic_idx) { int idx, node = cpu_to_node(0); struct io_apic_irq_attr attr; unsigned int pin, irq; - for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) { - idx = find_irq_entry(apic_id, pin, mp_INT); - if (io_apic_pin_not_connected(idx, apic_id, pin)) + for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) { + idx = find_irq_entry(ioapic_idx, pin, mp_INT); + if (io_apic_pin_not_connected(idx, ioapic_idx, pin)) continue; - irq = pin_2_irq(idx, apic_id, pin); + irq = pin_2_irq(idx, ioapic_idx, pin); - if ((apic_id > 0) && (irq > 16)) + if ((ioapic_idx > 0) && (irq > 16)) continue; /* @@ -1420,10 +1421,10 @@ static void __init __io_apic_setup_irqs(unsigned int apic_id) * installed and if it returns 1: */ if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) + apic->multi_timer_check(ioapic_idx, irq)) continue; - set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx), + set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), irq_polarity(idx)); io_apic_setup_irq_pin(irq, node, &attr); @@ -1432,12 +1433,12 @@ static void __init __io_apic_setup_irqs(unsigned int apic_id) static void __init setup_IO_APIC_irqs(void) { - unsigned int apic_id; + unsigned int ioapic_idx; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) - __io_apic_setup_irqs(apic_id); + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + __io_apic_setup_irqs(ioapic_idx); } /* @@ -1447,28 +1448,28 @@ static void __init setup_IO_APIC_irqs(void) */ void setup_IO_APIC_irq_extra(u32 gsi) { - int apic_id = 0, pin, idx, irq, node = cpu_to_node(0); + int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0); struct io_apic_irq_attr attr; /* * Convert 'gsi' to 'ioapic.pin'. */ - apic_id = mp_find_ioapic(gsi); - if (apic_id < 0) + ioapic_idx = mp_find_ioapic(gsi); + if (ioapic_idx < 0) return; - pin = mp_find_ioapic_pin(apic_id, gsi); - idx = find_irq_entry(apic_id, pin, mp_INT); + pin = mp_find_ioapic_pin(ioapic_idx, gsi); + idx = find_irq_entry(ioapic_idx, pin, mp_INT); if (idx == -1) return; - irq = pin_2_irq(idx, apic_id, pin); + irq = pin_2_irq(idx, ioapic_idx, pin); /* Only handle the non legacy irqs on secondary ioapics */ - if (apic_id == 0 || irq < NR_IRQS_LEGACY) + if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY) return; - set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx), + set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), irq_polarity(idx)); io_apic_setup_irq_pin_once(irq, node, &attr); @@ -1477,8 +1478,8 @@ void setup_IO_APIC_irq_extra(u32 gsi) /* * Set up the timer pin, possibly with the 8259A-master behind. */ -static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, - int vector) +static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, + unsigned int pin, int vector) { struct IO_APIC_route_entry entry; @@ -1509,10 +1510,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, /* * Add it to the IO-APIC irq-routing table: */ - ioapic_write_entry(apic_id, pin, entry); + ioapic_write_entry(ioapic_idx, pin, entry); } -__apicdebuginit(void) print_IO_APIC(int apic) +__apicdebuginit(void) print_IO_APIC(int ioapic_idx) { int i; union IO_APIC_reg_00 reg_00; @@ -1522,16 +1523,16 @@ __apicdebuginit(void) print_IO_APIC(int apic) unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); + reg_00.raw = io_apic_read(ioapic_idx, 0); + reg_01.raw = io_apic_read(ioapic_idx, 1); if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); + reg_02.raw = io_apic_read(ioapic_idx, 2); if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); + reg_03.raw = io_apic_read(ioapic_idx, 3); raw_spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic)); + printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); @@ -1581,7 +1582,7 @@ __apicdebuginit(void) print_IO_APIC(int apic) struct IO_APIC_route_entry entry; struct IR_IO_APIC_route_entry *ir_entry; - entry = ioapic_read_entry(apic, i); + entry = ioapic_read_entry(ioapic_idx, i); ir_entry = (struct IR_IO_APIC_route_entry *) &entry; printk(KERN_DEBUG " %02x %04X ", i, @@ -1602,7 +1603,7 @@ __apicdebuginit(void) print_IO_APIC(int apic) } else { struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, i); + entry = ioapic_read_entry(ioapic_idx, i); printk(KERN_DEBUG " %02x %02X ", i, entry.dest @@ -1624,14 +1625,15 @@ __apicdebuginit(void) print_IO_APIC(int apic) __apicdebuginit(void) print_IO_APICs(void) { - int apic, i; + int ioapic_idx; struct irq_cfg *cfg; unsigned int irq; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mpc_ioapic_id(i), ioapics[i].nr_registers); + mpc_ioapic_id(ioapic_idx), + ioapics[ioapic_idx].nr_registers); /* * We are a bit conservative about what we expect. We have to @@ -1639,8 +1641,8 @@ __apicdebuginit(void) print_IO_APICs(void) */ printk(KERN_INFO "testing the IO APIC.......................\n"); - for (apic = 0; apic < nr_ioapics; apic++) - print_IO_APIC(apic); + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + print_IO_APIC(ioapic_idx); printk(KERN_DEBUG "IRQ to pin mappings:\n"); for_each_active_irq(irq) { @@ -1977,7 +1979,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; - int apic_id; + int ioapic_idx; int i; unsigned char old_id; unsigned long flags; @@ -1991,21 +1993,20 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) /* * Set the IOAPIC ID to the value stored in the MPC table. */ - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { - + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) { /* Read the register 0 value */ raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic_id, 0); + reg_00.raw = io_apic_read(ioapic_idx, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - old_id = mpc_ioapic_id(apic_id); + old_id = mpc_ioapic_id(ioapic_idx); - if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) { + if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic_id, mpc_ioapic_id(apic_id)); + ioapic_idx, mpc_ioapic_id(ioapic_idx)); printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID); - ioapics[apic_id].mp_config.apicid = reg_00.bits.ID; + ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID; } /* @@ -2014,9 +2015,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) * 'stuck on smp_invalidate_needed IPI wait' messages. */ if (apic->check_apicid_used(&phys_id_present_map, - mpc_ioapic_id(apic_id))) { + mpc_ioapic_id(ioapic_idx))) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic_id, mpc_ioapic_id(apic_id)); + ioapic_idx, mpc_ioapic_id(ioapic_idx)); for (i = 0; i < get_physical_broadcast(); i++) if (!physid_isset(i, phys_id_present_map)) break; @@ -2025,14 +2026,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", i); physid_set(i, phys_id_present_map); - ioapics[apic_id].mp_config.apicid = i; + ioapics[ioapic_idx].mp_config.apicid = i; } else { physid_mask_t tmp; - apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id), + apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx), &tmp); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", - mpc_ioapic_id(apic_id)); + mpc_ioapic_id(ioapic_idx)); physids_or(phys_id_present_map, phys_id_present_map, tmp); } @@ -2040,35 +2041,35 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) * We need to adjust the IRQ routing table * if the ID changed. */ - if (old_id != mpc_ioapic_id(apic_id)) + if (old_id != mpc_ioapic_id(ioapic_idx)) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].dstapic == old_id) mp_irqs[i].dstapic - = mpc_ioapic_id(apic_id); + = mpc_ioapic_id(ioapic_idx); /* * Update the ID register according to the right value * from the MPC table if they are different. */ - if (mpc_ioapic_id(apic_id) == reg_00.bits.ID) + if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID) continue; apic_printk(APIC_VERBOSE, KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mpc_ioapic_id(apic_id)); + mpc_ioapic_id(ioapic_idx)); - reg_00.bits.ID = mpc_ioapic_id(apic_id); + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic_id, 0, reg_00.raw); + io_apic_write(ioapic_idx, 0, reg_00.raw); raw_spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check */ raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic_id, 0); + reg_00.raw = io_apic_read(ioapic_idx, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mpc_ioapic_id(apic_id)) + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) printk("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); @@ -2968,27 +2969,26 @@ static int __init io_apic_bug_finalize(void) late_initcall(io_apic_bug_finalize); -static void resume_ioapic_id(int ioapic_id) +static void resume_ioapic_id(int ioapic_idx) { unsigned long flags; union IO_APIC_reg_00 reg_00; - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic_id, 0); - if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) { - reg_00.bits.ID = mpc_ioapic_id(ioapic_id); - io_apic_write(ioapic_id, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic_idx, 0); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) { + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); + io_apic_write(ioapic_idx, 0, reg_00.raw); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ioapic_resume(void) { - int ioapic_id; + int ioapic_idx; - for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--) - resume_ioapic_id(ioapic_id); + for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--) + resume_ioapic_id(ioapic_idx); restore_ioapic_entries(); } @@ -3597,18 +3597,18 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr) { - unsigned int id = attr->ioapic, pin = attr->ioapic_pin; + unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; int ret; /* Avoid redundant programming */ - if (test_bit(pin, ioapics[id].pin_programmed)) { + if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { pr_debug("Pin %d-%d already programmed\n", - mpc_ioapic_id(id), pin); + mpc_ioapic_id(ioapic_idx), pin); return 0; } ret = io_apic_setup_irq_pin(irq, node, attr); if (!ret) - set_bit(pin, ioapics[id].pin_programmed); + set_bit(pin, ioapics[ioapic_idx].pin_programmed); return ret; } -- cgit v1.1 From 141d55e6cc590293ea1378f55b9ebd38f5024bf0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 12 Oct 2011 11:53:17 -0700 Subject: x86/irq: Standardize on CONFIG_SPARSE_IRQ=y Sparseirq got introduced in v2.6.28 and Thomas did a huge cleanup around v2.6.38 that eliminated basically all disadvantages of it. So we can remove non-sparseirq support now and simplify our IRQ degrees of freedom a bit. Suggested-and-acked-by: Thomas Gleixner Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/4E95E21D.6090200@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/irq_vectors.h | 12 ++---------- arch/x86/kernel/apic/io_apic.c | 23 ----------------------- 3 files changed, 3 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a47bb2..fcd34c3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -64,6 +64,7 @@ config X86 select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS select HAVE_SPARSE_IRQ + select SPARSE_IRQ select GENERIC_FIND_FIRST_BIT select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 7e50f06..4b44487 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -160,19 +160,11 @@ static inline int invalid_vm86_irq(int irq) #define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) #ifdef CONFIG_X86_IO_APIC -# ifdef CONFIG_SPARSE_IRQ -# define CPU_VECTOR_LIMIT (64 * NR_CPUS) -# define NR_IRQS \ +# define CPU_VECTOR_LIMIT (64 * NR_CPUS) +# define NR_IRQS \ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ (NR_VECTORS + CPU_VECTOR_LIMIT) : \ (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) -# else -# define CPU_VECTOR_LIMIT (32 * NR_CPUS) -# define NR_IRQS \ - (CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ? \ - (NR_VECTORS + CPU_VECTOR_LIMIT) : \ - (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) -# endif #else /* !CONFIG_X86_IO_APIC: */ # define NR_IRQS NR_IRQS_LEGACY #endif diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 08f0d16..05a30c1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -186,11 +186,7 @@ static struct irq_pin_list *alloc_irq_pin_list(int node) /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -#ifdef CONFIG_SPARSE_IRQ static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; -#else -static struct irq_cfg irq_cfgx[NR_IRQS]; -#endif int __init arch_early_irq_init(void) { @@ -234,7 +230,6 @@ int __init arch_early_irq_init(void) return 0; } -#ifdef CONFIG_SPARSE_IRQ static struct irq_cfg *irq_cfg(unsigned int irq) { return irq_get_chip_data(irq); @@ -269,22 +264,6 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) kfree(cfg); } -#else - -struct irq_cfg *irq_cfg(unsigned int irq) -{ - return irq < nr_irqs ? irq_cfgx + irq : NULL; -} - -static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) -{ - return irq_cfgx + irq; -} - -static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { } - -#endif - static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) { int res = irq_alloc_desc_at(at, node); @@ -3644,7 +3623,6 @@ int get_nr_irqs_gsi(void) return nr_irqs_gsi; } -#ifdef CONFIG_SPARSE_IRQ int __init arch_probe_nr_irqs(void) { int nr; @@ -3664,7 +3642,6 @@ int __init arch_probe_nr_irqs(void) return NR_IRQS_LEGACY; } -#endif int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) -- cgit v1.1 From 70989449daccf545214b4840b112558e25c2b3fc Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sun, 9 Oct 2011 15:42:00 -0400 Subject: x86, microcode: Don't request microcode from userspace unnecessarily Requesting the microcode from userspace *every time* when onlining CPUs (during a CPU hotplug operation) is unnecessary. Thus, ensure that once the kernel gets the microcode after booting, it is not freed nor invalidated when a CPU goes offline, so that it can be reused when that CPU comes back online, without requesting userspace for it again. As a result, the CPU hotplug operations become faster as well. Signed-off-by: Srivatsa S. Bhat Acked-by: Rafael J. Wysocki Acked-by: Andreas Herrmann Link: http://lkml.kernel.org/r/4E91F908.5010006@linux.vnet.ibm.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/microcode_core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index f924280..f2d2a66 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -483,7 +483,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); break; - case CPU_DEAD: + + /* + * When a CPU goes offline, don't free up or invalidate the copy of + * the microcode in kernel memory, so that we can reuse it when the + * CPU comes back online without unnecessarily requesting the userspace + * for it again. + */ case CPU_UP_CANCELED_FROZEN: /* The CPU refused to come up during a system resume */ microcode_fini_cpu(cpu); -- cgit v1.1 From 153b19a3b9fd8b9478495b9ee1f93f6a77c564f9 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 13 Oct 2011 12:04:20 +0300 Subject: x86, mrst: use a temporary variable for SFI irq SFI tables reside in RAM and should not be modified once they are written. Current code went to set pentry->irq to zero which causes subsequent reads to fail with invalid SFI table checksum. This will break kexec as the second kernel fails to validate SFI tables. To fix this we use temporary variable for irq number. Signed-off-by: Mika Westerberg Reviewed-by: Kirill A. Shutemov Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/platform/mrst/mrst.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 58425ad..fe73276 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -678,38 +678,40 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) pentry = (struct sfi_device_table_entry *)sb->pentry; for (i = 0; i < num; i++, pentry++) { - if (pentry->irq != (u8)0xff) { /* native RTE case */ + int irq = pentry->irq; + + if (irq != (u8)0xff) { /* native RTE case */ /* these SPI2 devices are not exposed to system as PCI * devices, but they have separate RTE entry in IOAPIC * so we have to enable them one by one here */ - ioapic = mp_find_ioapic(pentry->irq); + ioapic = mp_find_ioapic(irq); irq_attr.ioapic = ioapic; - irq_attr.ioapic_pin = pentry->irq; + irq_attr.ioapic_pin = irq; irq_attr.trigger = 1; irq_attr.polarity = 1; - io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr); + io_apic_set_pci_routing(NULL, irq, &irq_attr); } else - pentry->irq = 0; /* No irq */ + irq = 0; /* No irq */ switch (pentry->type) { case SFI_DEV_TYPE_IPC: /* ID as IRQ is a hack that will go away */ - pdev = platform_device_alloc(pentry->name, pentry->irq); + pdev = platform_device_alloc(pentry->name, irq); if (pdev == NULL) { pr_err("out of memory for SFI platform device '%s'.\n", pentry->name); continue; } - install_irq_resource(pdev, pentry->irq); + install_irq_resource(pdev, irq); pr_debug("info[%2d]: IPC bus, name = %16.16s, " - "irq = 0x%2x\n", i, pentry->name, pentry->irq); + "irq = 0x%2x\n", i, pentry->name, irq); sfi_handle_ipc_dev(pdev); break; case SFI_DEV_TYPE_SPI: memset(&spi_info, 0, sizeof(spi_info)); strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN); - spi_info.irq = pentry->irq; + spi_info.irq = irq; spi_info.bus_num = pentry->host_num; spi_info.chip_select = pentry->addr; spi_info.max_speed_hz = pentry->max_freq; @@ -726,7 +728,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) memset(&i2c_info, 0, sizeof(i2c_info)); bus = pentry->host_num; strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN); - i2c_info.irq = pentry->irq; + i2c_info.irq = irq; i2c_info.addr = pentry->addr; pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, " "irq = 0x%2x, addr = 0x%x\n", i, bus, -- cgit v1.1 From 506ed6b53e00ba303ad778122f08e1fca7cf5efb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 12 Oct 2011 17:46:33 -0700 Subject: x86, intel: Output microcode revision in /proc/cpuinfo I got a request to make it easier to determine the microcode update level on Intel CPUs. This patch adds a new "microcode" field to /proc/cpuinfo. The microcode level is also outputed on fatal machine checks together with the other CPUID model information. I removed the respective code from the microcode update driver, it just reads the field from cpu_data. Also when the microcode is updated it fills in the new values too. I had to add a memory barrier to native_cpuid to prevent it being optimized away when the result is not used. This turns out to clean up further code which already got this information manually. This is done in followon patches. Signed-off-by: Andi Kleen Acked-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1318466795-7393-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 4 +++- arch/x86/kernel/cpu/intel.c | 9 +++++++++ arch/x86/kernel/cpu/mcheck/mce.c | 9 +++++++-- arch/x86/kernel/cpu/proc.c | 2 ++ arch/x86/kernel/microcode_intel.c | 14 +++++--------- 5 files changed, 26 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0d1171c..b650435 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -111,6 +111,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; #endif + u32 microcode; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 @@ -179,7 +180,8 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (*eax), "2" (*ecx)); + : "0" (*eax), "2" (*ecx) + : "memory"); } static inline void load_cr3(pgd_t *pgdir) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ed6086e..26627a3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -47,6 +47,15 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { + unsigned lower_word; + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* Required by the SDM */ + sync_core(); + rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); + } + /* * Atom erratum AAE44/AAF40/AAG38/AAH41: * diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 08363b0..8af6fa4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -217,8 +217,13 @@ static void print_mce(struct mce *m) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + /* + * Note this output is parsed by external tools and old fields + * should not be changed. + */ + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %u\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, + cpu_data(m->extcpu).microcode); /* * Print out human-readable details about the MCE error, diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 62ac8cb..6254fda 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -85,6 +85,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else seq_printf(m, "stepping\t: unknown\n"); + if (c->microcode) + seq_printf(m, "microcode\t: %u\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get(cpu); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 1a1b606..3ca42d0 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -161,12 +161,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - + csig->rev = c->microcode; pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", cpu_num, csig->sig, csig->pf, csig->rev); @@ -299,9 +294,9 @@ static int apply_microcode(int cpu) struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; unsigned int val[2]; - int cpu_num; + int cpu_num = raw_smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu_num); - cpu_num = raw_smp_processor_id(); uci = ucode_cpu_info + cpu; mc_intel = uci->mc; @@ -317,7 +312,7 @@ static int apply_microcode(int cpu) (unsigned long) mc_intel->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ + /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); /* get the current revision from MSR 0x8B */ @@ -335,6 +330,7 @@ static int apply_microcode(int cpu) (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; + c->microcode = val[1]; return 0; } -- cgit v1.1 From 30963c0ac721f70aea0352ed1cd0fc3cfbaef730 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 12 Oct 2011 17:46:34 -0700 Subject: x86, intel: Use c->microcode for Atom errata check Now that the cpu update level is available the Atom PSE errata check can use it directly without reading the MSR again. Signed-off-by: Andi Kleen Acked-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1318466795-7393-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 26627a3..5231312 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -64,17 +64,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * need the microcode to have already been loaded... so if it is * not, recommend a BIOS update and disable large pages. */ - if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) { - u32 ucode, junk; - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - sync_core(); - rdmsr(MSR_IA32_UCODE_REV, junk, ucode); - - if (ucode < 0x20e) { - printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); - clear_cpu_cap(c, X86_FEATURE_PSE); - } + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && + c->microcode < 0x20e) { + printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + clear_cpu_cap(c, X86_FEATURE_PSE); } #ifdef CONFIG_X86_64 -- cgit v1.1 From 72da0b07b1b497927758a2102b856ce41e4ba81e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 15 Sep 2011 08:58:51 +0100 Subject: x86: constify PCI raw ops structures As with any other such change, the goal is to prevent inadvertent writes to these structures (assuming DEBUG_RODATA is enabled), and to separate data (possibly frequently) written to from such never getting modified. Reviewed-by: Ingo Molnar Signed-off-by: Jan Beulich Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 6 +++--- arch/x86/pci/ce4100.c | 2 +- arch/x86/pci/common.c | 4 ++-- arch/x86/pci/direct.c | 6 +++--- arch/x86/pci/mmconfig_32.c | 2 +- arch/x86/pci/mmconfig_64.c | 2 +- arch/x86/pci/numaq_32.c | 2 +- arch/x86/pci/olpc.c | 2 +- arch/x86/pci/pcbios.c | 4 ++-- 9 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 7045267..e381978 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -99,10 +99,10 @@ struct pci_raw_ops { int reg, int len, u32 val); }; -extern struct pci_raw_ops *raw_pci_ops; -extern struct pci_raw_ops *raw_pci_ext_ops; +extern const struct pci_raw_ops *raw_pci_ops; +extern const struct pci_raw_ops *raw_pci_ext_ops; -extern struct pci_raw_ops pci_direct_conf1; +extern const struct pci_raw_ops pci_direct_conf1; extern bool port_cf9_safe; /* arch_initcall level */ diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 9917609..41bd2a2 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -304,7 +304,7 @@ static int ce4100_conf_write(unsigned int seg, unsigned int bus, return pci_direct_conf1.write(seg, bus, devfn, reg, len, value); } -struct pci_raw_ops ce4100_pci_conf = { +static const struct pci_raw_ops ce4100_pci_conf = { .read = ce4100_conf_read, .write = ce4100_conf_write, }; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 92df322..7962ccb 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -33,8 +33,8 @@ int noioapicreroute = 1; int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; -struct pci_raw_ops *raw_pci_ops; -struct pci_raw_ops *raw_pci_ext_ops; +const struct pci_raw_ops *__read_mostly raw_pci_ops; +const struct pci_raw_ops *__read_mostly raw_pci_ext_ops; int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val) diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 4f2c704..1546059 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -79,7 +79,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus, #undef PCI_CONF1_ADDRESS -struct pci_raw_ops pci_direct_conf1 = { +const struct pci_raw_ops pci_direct_conf1 = { .read = pci_conf1_read, .write = pci_conf1_write, }; @@ -175,7 +175,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, #undef PCI_CONF2_ADDRESS -struct pci_raw_ops pci_direct_conf2 = { +static const struct pci_raw_ops pci_direct_conf2 = { .read = pci_conf2_read, .write = pci_conf2_write, }; @@ -191,7 +191,7 @@ struct pci_raw_ops pci_direct_conf2 = { * This should be close to trivial, but it isn't, because there are buggy * chipsets (yes, you guessed it, by Intel and Compaq) that have no class ID. */ -static int __init pci_sanity_check(struct pci_raw_ops *o) +static int __init pci_sanity_check(const struct pci_raw_ops *o) { u32 x = 0; int year, devfn; diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index a3d9c54..5372e86 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -117,7 +117,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, return 0; } -static struct pci_raw_ops pci_mmcfg = { +static const struct pci_raw_ops pci_mmcfg = { .read = pci_mmcfg_read, .write = pci_mmcfg_write, }; diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index e783841..915a493 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -81,7 +81,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, return 0; } -static struct pci_raw_ops pci_mmcfg = { +static const struct pci_raw_ops pci_mmcfg = { .read = pci_mmcfg_read, .write = pci_mmcfg_write, }; diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 512a88c..51abf02 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -110,7 +110,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, #undef PCI_CONF1_MQ_ADDRESS -static struct pci_raw_ops pci_direct_conf1_mq = { +static const struct pci_raw_ops pci_direct_conf1_mq = { .read = pci_conf1_mq_read, .write = pci_conf1_mq_write }; diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index 5262603..7043a4f 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -301,7 +301,7 @@ static int pci_olpc_write(unsigned int seg, unsigned int bus, return 0; } -static struct pci_raw_ops pci_olpc_conf = { +static const struct pci_raw_ops pci_olpc_conf = { .read = pci_olpc_read, .write = pci_olpc_write, }; diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index f685535..db0e9a5 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -303,7 +303,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, * Function table for BIOS32 access */ -static struct pci_raw_ops pci_bios_access = { +static const struct pci_raw_ops pci_bios_access = { .read = pci_bios_read, .write = pci_bios_write }; @@ -312,7 +312,7 @@ static struct pci_raw_ops pci_bios_access = { * Try to find PCI BIOS. */ -static struct pci_raw_ops * __devinit pci_find_bios(void) +static const struct pci_raw_ops * __devinit pci_find_bios(void) { union bios32 *check; unsigned char sum; -- cgit v1.1 From db45bd90be200692342a340e84354c26ca9e424c Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 17 Oct 2011 18:00:45 -0700 Subject: x86, perf, kprobes: Make kprobes's twobyte_is_boostable volatile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling an i386_defconfig kernel with gcc-4.6.1-9.fc15.i686, I noticed a warning about the asm operand for test_bit in kprobes' can_boost. I discovered that this caused only the first long of twobyte_is_boostable[] to be output. Jakub filed and fixed gcc PR50571 to correct the warning and this output issue. But to solve it for less current gcc, we can make kprobes' twobyte_is_boostable[] volatile, and it won't be optimized out. Before: CC arch/x86/kernel/kprobes.o In file included from include/linux/bitops.h:22:0, from include/linux/kernel.h:17, from [...]/arch/x86/include/asm/percpu.h:44, from [...]/arch/x86/include/asm/current.h:5, from [...]/arch/x86/include/asm/processor.h:15, from [...]/arch/x86/include/asm/atomic.h:6, from include/linux/atomic.h:4, from include/linux/mutex.h:18, from include/linux/notifier.h:13, from include/linux/kprobes.h:34, from arch/x86/kernel/kprobes.c:43: [...]/arch/x86/include/asm/bitops.h: In function ‘can_boost.part.1’: [...]/arch/x86/include/asm/bitops.h:319:2: warning: use of memory input without lvalue in asm operand 1 is deprecated [enabled by default] $ objdump -rd arch/x86/kernel/kprobes.o | grep -A1 -w bt 551: 0f a3 05 00 00 00 00 bt %eax,0x0 554: R_386_32 .rodata.cst4 $ objdump -s -j .rodata.cst4 -j .data arch/x86/kernel/kprobes.o arch/x86/kernel/kprobes.o: file format elf32-i386 Contents of section .data: 0000 48000000 00000000 00000000 00000000 H............... Contents of section .rodata.cst4: 0000 4c030000 L... Only a single long of twobyte_is_boostable[] is in the object file. After, with volatile: $ objdump -rd arch/x86/kernel/kprobes.o | grep -A1 -w bt 551: 0f a3 05 20 00 00 00 bt %eax,0x20 554: R_386_32 .data $ objdump -s -j .rodata.cst4 -j .data arch/x86/kernel/kprobes.o arch/x86/kernel/kprobes.o: file format elf32-i386 Contents of section .data: 0000 48000000 00000000 00000000 00000000 H............... 0010 00000000 00000000 00000000 00000000 ................ 0020 4c030000 0f000200 ffff0000 ffcff0c0 L............... 0030 0000ffff 3bbbfff8 03ff2ebb 26bb2e77 ....;.......&..w Now all 32 bytes are output into .data instead. Signed-off-by: Josh Stone Acked-by: Masami Hiramatsu Cc: Srikar Dronamraju Cc: Jakub Jelinek Link: http://lkml.kernel.org/r/1318899645-4068-1-git-send-email-jistone@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index f1a6244d..c0ed3d9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -75,8 +75,10 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); /* * Undefined/reserved opcodes, conditional jump, Opcode Extension * Groups, and some special opcodes can not boost. + * This is volatile to keep gcc from statically optimizing it out, as + * variable_test_bit makes gcc think only *(unsigned long*) is used. */ -static const u32 twobyte_is_boostable[256 / 32] = { +static volatile const u32 twobyte_is_boostable[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ -- cgit v1.1 From 881e23e56764808e7ab1ed73b5d8a6700042ea38 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 17 Oct 2011 16:45:10 +0200 Subject: x86, microcode: Correct microcode revision format 506ed6b53e00 ("x86, intel: Output microcode revision in /proc/cpuinfo") added microcode revision format to /proc/cpuinfo and the MCE handler in decimal format but both AMD and Intel patch levels are handled as hex numbers. Fix it. Acked-by: Andi Kleen Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- arch/x86/kernel/cpu/proc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8af6fa4..ad8d897 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -221,7 +221,7 @@ static void print_mce(struct mce *m) * Note this output is parsed by external tools and old fields * should not be changed. */ - pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %u\n", + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, cpu_data(m->extcpu).microcode); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 6254fda..14b2314 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -86,7 +86,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) else seq_printf(m, "stepping\t: unknown\n"); if (c->microcode) - seq_printf(m, "microcode\t: %u\n", c->microcode); + seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get(cpu); -- cgit v1.1 From bcb80e53877c2045d9e52f4a71372c3fe6501f6f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 17 Oct 2011 16:34:36 +0200 Subject: x86, microcode, AMD: Add microcode revision to /proc/cpuinfo Enable microcode revision output for AMD after 506ed6b53e00 ("x86, intel: Output microcode revision in /proc/cpuinfo") did it for Intel. Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/amd.c | 4 ++++ arch/x86/kernel/microcode_amd.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b13ed39..d898fab 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -412,6 +412,8 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { + u32 dummy; + early_init_amd_mc(c); /* @@ -442,6 +444,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) } #endif + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + /* We need to do the following only once */ if (c != &boot_cpu_data) return; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 591be0e..d494799 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -74,14 +74,13 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); - u32 dummy; if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { pr_warning("CPU%d: family %d not supported\n", cpu, c->x86); return -1; } - rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); + csig->rev = c->microcode; pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); return 0; @@ -130,6 +129,7 @@ static int apply_microcode_amd(int cpu) int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; struct microcode_amd *mc_amd = uci->mc; + struct cpuinfo_x86 *c = &cpu_data(cpu); /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -150,6 +150,7 @@ static int apply_microcode_amd(int cpu) pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); uci->cpu_sig.rev = rev; + c->microcode = rev; return 0; } -- cgit v1.1 From e6599225db36bbdc991d1cc8fbfcacb24f86cdb5 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:26:45 -0400 Subject: xen/irq: If we fail during msi_capability_init return proper error code. There are three different modes: PV, HVM, and initial domain 0. In all the cases we would return -1 for failure instead of a proper error code. Fix this by propagating the error code from the generic IRQ code. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 1017c7b..11a9301 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -175,8 +175,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) "pcifront-msi-x" : "pcifront-msi", DOMID_SELF); - if (irq < 0) + if (irq < 0) { + ret = irq; goto free; + } i++; } kfree(v); @@ -221,8 +223,10 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (msg.data != XEN_PIRQ_MSI_DATA || xen_irq_from_pirq(pirq) < 0) { pirq = xen_allocate_pirq_msi(dev, msidesc); - if (pirq < 0) + if (pirq < 0) { + irq = -ENODEV; goto error; + } xen_msi_compose_msg(dev, pirq, &msg); __write_msi_msg(msidesc, &msg); dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); @@ -244,7 +248,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) error: dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); - return -ENODEV; + return irq; } #ifdef CONFIG_XEN_DOM0 -- cgit v1.1 From 5e287830136a8edb76e9f9c432b264d99833172f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:06:42 -0400 Subject: xen/enlighten: Fix compile warnings and set cx to known value. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We get: linux/arch/x86/xen/enlighten.c: In function ‘xen_start_kernel’: linux/arch/x86/xen/enlighten.c:226: warning: ‘cx’ may be used uninitialized in this function linux/arch/x86/xen/enlighten.c:240: note: ‘cx’ was declared here and the cx is really not set but passed in the xen_cpuid instruction which masks the value with returned masked_ecx from cpuid. This can potentially lead to invalid data being stored in cx. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2d69617..da8afd5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -251,6 +251,7 @@ static void __init xen_init_cpuid_mask(void) ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ (1 << X86_FEATURE_ACPI)); /* disable ACPI */ ax = 1; + cx = 0; xen_cpuid(&ax, &bx, &cx, &dx); xsave_mask = -- cgit v1.1 From 8404877ee1cfdbc872e153fd89022f9e47f6f5a3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:09:34 -0400 Subject: xen/p2m/debugfs: Fix potential pointer exception. We could be referencing the last + 1 element of level_name[] array which would cause a pointer exception, because of the initial setup of lvl=4. [v1: No need to do this for type_name, pointed out by Ian Campbell] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 58efeb9..2e3bf7a 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -786,7 +786,7 @@ EXPORT_SYMBOL_GPL(m2p_find_override_pfn); int p2m_dump_show(struct seq_file *m, void *v) { static const char * const level_name[] = { "top", "middle", - "entry", "abnormal" }; + "entry", "abnormal", "error"}; static const char * const type_name[] = { "identity", "missing", "pfn", "abnormal"}; #define TYPE_IDENTITY 0 -- cgit v1.1 From a491dbef56f2aba42fb292067d4652d246627738 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 3 Oct 2011 12:35:26 -0400 Subject: xen/p2m/debugfs: Make type_name more obvious. Per Ian Campbell suggestion to defend against future breakage in case we expand the P2M values, incorporate the defines in the string array. Suggested-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 2e3bf7a..795e003 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -787,12 +787,15 @@ int p2m_dump_show(struct seq_file *m, void *v) { static const char * const level_name[] = { "top", "middle", "entry", "abnormal", "error"}; - static const char * const type_name[] = { "identity", "missing", - "pfn", "abnormal"}; #define TYPE_IDENTITY 0 #define TYPE_MISSING 1 #define TYPE_PFN 2 #define TYPE_UNKNOWN 3 + static const char * const type_name[] = { + [TYPE_IDENTITY] = "identity", + [TYPE_MISSING] = "missing", + [TYPE_PFN] = "pfn", + [TYPE_UNKNOWN] = "abnormal"}; unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; unsigned int uninitialized_var(prev_level); unsigned int uninitialized_var(prev_type); -- cgit v1.1 From e827bb09c815955d5d5f0ddf98483a7efd04f55b Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Fri, 23 Sep 2011 19:50:55 +0300 Subject: crypto: blowfish-x86_64 - improve x86_64 blowfish 4-way performance This patch adds improved F-macro for 4-way parallel functions. With new F-macro for 4-way parallel functions, blowfish sees ~15% improvement in speed tests on AMD Phenom II (~5% on Intel Xeon E7330). However when used in 1-way blowfish function new macro would be ~10% slower than original, so old F-macro is kept for 1-way functions. Patch cleans up old F-macro as it is no longer needed in 4-way part. Patch also does register macro renaming to reduce stack usage. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish-x86_64-asm_64.S | 198 +++++++++++++++---------------- 1 file changed, 98 insertions(+), 100 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 44eb23a..391d245 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -56,38 +56,32 @@ #define RT0 %rbp #define RT1 %rsi +#define RT2 %r8 +#define RT3 %r9 #define RT0d %ebp #define RT1d %esi +#define RT2d %r8d +#define RT3d %r9d -#define RK0 %r8 -#define RK1 %r9 -#define RK2 %r10 -#define RK3 %r11 - -#define RK0d %r8d -#define RK1d %r9d -#define RK2d %r10d -#define RK3d %r11d - -#define RKEY %r12 +#define RKEY %r10 /*********************************************************************** * 1-way blowfish ***********************************************************************/ -#define F(x, k) \ - rorq $16, x; \ - movzbl x ## bh, RT0d; \ - movzbl x ## bl, RT1d; \ - rolq $16, x; \ - movl s0(CTX,RT0,4), k ## d; \ - addl s1(CTX,RT1,4), k ## d; \ - movzbl x ## bh, RT0d; \ - movzbl x ## bl, RT1d; \ - rolq $32, x; \ - xorl s2(CTX,RT0,4), k ## d; \ - addl s3(CTX,RT1,4), k ## d; \ - xorq k, x; +#define F() \ + rorq $16, RX0; \ + movzbl RX0bh, RT0d; \ + movzbl RX0bl, RT1d; \ + rolq $16, RX0; \ + movl s0(CTX,RT0,4), RT0d; \ + addl s1(CTX,RT1,4), RT0d; \ + movzbl RX0bh, RT1d; \ + movzbl RX0bl, RT2d; \ + rolq $32, RX0; \ + xorl s2(CTX,RT1,4), RT0d; \ + addl s3(CTX,RT2,4), RT0d; \ + xorq RT0, RX0; #define add_roundkey_enc(n) \ xorq p+4*(n)(CTX), RX0; @@ -95,11 +89,8 @@ #define round_enc(n) \ add_roundkey_enc(n); \ \ - F(RX0, RK0); \ - F(RX0, RK0); - -#define round_final_enc(n) \ - xorq p+4*(n)(CTX), RX0; + F(); \ + F(); #define add_roundkey_dec(n) \ movq p+4*(n-1)(CTX), RT0; \ @@ -109,8 +100,8 @@ #define round_dec(n) \ add_roundkey_dec(n); \ \ - F(RX0, RK0); \ - F(RX0, RK0); \ + F(); \ + F(); \ #define read_block() \ movq (RIO), RX0; \ @@ -130,16 +121,15 @@ .type __blowfish_enc_blk,@function; __blowfish_enc_blk: - // input: - // %rdi: ctx, CTX - // %rsi: dst - // %rdx: src - // %rcx: bool xor - pushq %rbp; - pushq %rbx; - - pushq %rsi; - pushq %rcx; + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + movq %rbp, %r11; + + movq %rsi, %r10; movq %rdx, RIO; read_block(); @@ -154,38 +144,31 @@ __blowfish_enc_blk: round_enc(14); add_roundkey_enc(16); - popq %rbp; - popq RIO; + movq %r11, %rbp; - test %bpl, %bpl; + movq %r10, RIO; + test %cl, %cl; jnz __enc_xor; write_block(); - -__enc_ret: - popq %rbx; - popq %rbp; - ret; - __enc_xor: xor_block(); - - jmp __enc_ret; + ret; .align 8 .global blowfish_dec_blk .type blowfish_dec_blk,@function; blowfish_dec_blk: - // input: - // %rdi: ctx, CTX - // %rsi: dst - // %rdx: src - pushq %rbp; - pushq %rbx; - - pushq %rsi; + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + movq %rbp, %r11; + + movq %rsi, %r10; movq %rdx, RIO; read_block(); @@ -200,17 +183,33 @@ blowfish_dec_blk: round_dec(3); add_roundkey_dec(1); - popq RIO; + movq %r10, RIO; write_block(); - popq %rbx; - popq %rbp; + movq %r11, %rbp; ret; /********************************************************************** 4-way blowfish, four blocks parallel **********************************************************************/ + +/* F() for 4-way. Slower when used alone/1-way, but faster when used + * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330). + */ +#define F4(x) \ + movzbl x ## bh, RT1d; \ + movzbl x ## bl, RT3d; \ + rorq $16, x; \ + movzbl x ## bh, RT0d; \ + movzbl x ## bl, RT2d; \ + rorq $16, x; \ + movl s0(CTX,RT0,4), RT0d; \ + addl s1(CTX,RT2,4), RT0d; \ + xorl s2(CTX,RT1,4), RT0d; \ + addl s3(CTX,RT3,4), RT0d; \ + xorq RT0, x; + #define add_preloaded_roundkey4() \ xorq RKEY, RX0; \ xorq RKEY, RX1; \ @@ -227,15 +226,15 @@ blowfish_dec_blk: #define round_enc4(n) \ add_roundkey_enc4(n); \ \ - F(RX0, RK0); \ - F(RX1, RK1); \ - F(RX2, RK2); \ - F(RX3, RK3); \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); \ \ - F(RX0, RK0); \ - F(RX1, RK1); \ - F(RX2, RK2); \ - F(RX3, RK3); + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); #define preload_roundkey_dec(n) \ movq p+4*((n)-1)(CTX), RKEY; \ @@ -248,15 +247,15 @@ blowfish_dec_blk: #define round_dec4(n) \ add_roundkey_dec4(n); \ \ - F(RX0, RK0); \ - F(RX1, RK1); \ - F(RX2, RK2); \ - F(RX3, RK3); \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); \ \ - F(RX0, RK0); \ - F(RX1, RK1); \ - F(RX2, RK2); \ - F(RX3, RK3); + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); #define read_block4() \ movq (RIO), RX0; \ @@ -306,18 +305,19 @@ blowfish_dec_blk: .type __blowfish_enc_blk_4way,@function; __blowfish_enc_blk_4way: - // input: - // %rdi: ctx, CTX - // %rsi: dst - // %rdx: src - // %rcx: bool xor + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ pushq %rbp; pushq %rbx; - pushq RKEY; + pushq %rcx; + preload_roundkey_enc(0); - pushq %rsi; - pushq %rcx; + movq %rsi, %r11; movq %rdx, RIO; read_block4(); @@ -333,40 +333,39 @@ __blowfish_enc_blk_4way: add_preloaded_roundkey4(); popq %rbp; - popq RIO; + movq %r11, RIO; test %bpl, %bpl; jnz __enc_xor4; write_block4(); -__enc_ret4: - popq RKEY; popq %rbx; popq %rbp; - ret; __enc_xor4: xor_block4(); - jmp __enc_ret4; + popq %rbx; + popq %rbp; + ret; .align 8 .global blowfish_dec_blk_4way .type blowfish_dec_blk_4way,@function; blowfish_dec_blk_4way: - // input: - // %rdi: ctx, CTX - // %rsi: dst - // %rdx: src + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ pushq %rbp; pushq %rbx; - pushq RKEY; preload_roundkey_dec(17); - pushq %rsi; + movq %rsi, %r11; movq %rdx, RIO; read_block4(); @@ -381,10 +380,9 @@ blowfish_dec_blk_4way: round_dec4(3); add_preloaded_roundkey4(); - popq RIO; + movq %r11, RIO; write_block4(); - popq RKEY; popq %rbx; popq %rbp; -- cgit v1.1 From a071d06e34ff361c7a8d1ddf3ce8a95d782fa25a Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Fri, 23 Sep 2011 19:51:00 +0300 Subject: crypto: blowfish-x86_64 - add credits Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish_glue.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 40911ab..2568a7b 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -3,6 +3,11 @@ * * Copyright (c) 2011 Jussi Kivilinna * + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 2007 - Joy Latten + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or -- cgit v1.1 From 91d41f159d75d602f6001218eec64c5e761475a6 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 26 Sep 2011 16:47:20 +0300 Subject: crypto: twofish-x86-asm - make assembler functions use twofish_ctx instead of crypto_tfm This needed by 3-way twofish patch to be able to easily use one block assembler functions. As glue code is shared between i586/x86_64 apply change to i586 assembler too. Also export assembler functions for 3-way parallel twofish module. CC: Joachim Fritschi Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish-i586-asm_32.S | 10 +++++----- arch/x86/crypto/twofish-x86_64-asm_64.S | 6 ++---- arch/x86/crypto/twofish_glue.c | 12 ++++++++---- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S index 575331c..658af4b 100644 --- a/arch/x86/crypto/twofish-i586-asm_32.S +++ b/arch/x86/crypto/twofish-i586-asm_32.S @@ -26,7 +26,7 @@ #define in_blk 12 /* input byte array address parameter*/ #define out_blk 8 /* output byte array address parameter*/ -#define tfm 4 /* Twofish context structure */ +#define ctx 4 /* Twofish context structure */ #define a_offset 0 #define b_offset 4 @@ -229,8 +229,8 @@ twofish_enc_blk: push %esi push %edi - mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ - add $crypto_tfm_ctx_offset, %ebp /* ctx address */ + mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base + * pointer to the ctx address */ mov in_blk+16(%esp),%edi /* input address in edi */ mov (%edi), %eax @@ -285,8 +285,8 @@ twofish_dec_blk: push %edi - mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ - add $crypto_tfm_ctx_offset, %ebp /* ctx address */ + mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base + * pointer to the ctx address */ mov in_blk+16(%esp),%edi /* input address in edi */ mov (%edi), %eax diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 573aa10..7bcf3fc 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -221,10 +221,9 @@ twofish_enc_blk: pushq R1 - /* %rdi contains the crypto tfm address */ + /* %rdi contains the ctx address */ /* %rsi contains the output address */ /* %rdx contains the input address */ - add $crypto_tfm_ctx_offset, %rdi /* set ctx address */ /* ctx address is moved to free one non-rex register as target for the 8bit high operations */ mov %rdi, %r11 @@ -274,10 +273,9 @@ twofish_enc_blk: twofish_dec_blk: pushq R1 - /* %rdi contains the crypto tfm address */ + /* %rdi contains the ctx address */ /* %rsi contains the output address */ /* %rdx contains the input address */ - add $crypto_tfm_ctx_offset, %rdi /* set ctx address */ /* ctx address is moved to free one non-rex register as target for the 8bit high operations */ mov %rdi, %r11 diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index cefaf8b..dc6b3fb 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -44,17 +44,21 @@ #include #include -asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); +asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +EXPORT_SYMBOL_GPL(twofish_enc_blk); +asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +EXPORT_SYMBOL_GPL(twofish_dec_blk); static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { - twofish_enc_blk(tfm, dst, src); + twofish_enc_blk(crypto_tfm_ctx(tfm), dst, src); } static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { - twofish_dec_blk(tfm, dst, src); + twofish_dec_blk(crypto_tfm_ctx(tfm), dst, src); } static struct crypto_alg alg = { -- cgit v1.1 From 8280daad436edb7dd9e7e06fc13bcecb6b2a885c Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 26 Sep 2011 16:47:25 +0300 Subject: crypto: twofish - add 3-way parallel x86_64 assembler implemention Patch adds 3-way parallel x86_64 assembly implementation of twofish as new module. New assembler functions crypt data in three blocks chunks, improving cipher performance on out-of-order CPUs. Patch has been tested with tcrypt and automated filesystem tests. Summary of the tcrypt benchmarks: Twofish 3-way-asm vs twofish asm (128bit 8kb block ECB) encrypt: 1.3x speed decrypt: 1.3x speed Twofish 3-way-asm vs twofish asm (128bit 8kb block CBC) encrypt: 1.07x speed decrypt: 1.4x speed Twofish 3-way-asm vs twofish asm (128bit 8kb block CTR) encrypt: 1.4x speed Twofish 3-way-asm vs AES asm (128bit 8kb block ECB) encrypt: 1.0x speed decrypt: 1.0x speed Twofish 3-way-asm vs AES asm (128bit 8kb block CBC) encrypt: 0.84x speed decrypt: 1.09x speed Twofish 3-way-asm vs AES asm (128bit 8kb block CTR) encrypt: 1.15x speed Full output: http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-twofish-3way-asm-x86_64.txt http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-twofish-asm-x86_64.txt http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-aes-asm-x86_64.txt Tests were run on: vendor_id : AuthenticAMD cpu family : 16 model : 10 model name : AMD Phenom(tm) II X6 1055T Processor Also userspace test were run on: vendor_id : GenuineIntel cpu family : 6 model : 15 model name : Intel(R) Xeon(R) CPU E7330 @ 2.40GHz stepping : 11 Userspace test results: Encryption/decryption of twofish 3-way vs x86_64-asm on AMD Phenom II: encrypt: 1.27x decrypt: 1.25x Encryption/decryption of twofish 3-way vs x86_64-asm on Intel Xeon E7330: encrypt: 1.36x decrypt: 1.36x Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/twofish-x86_64-asm_64-3way.S | 316 ++++++++++++++++++ arch/x86/crypto/twofish_glue_3way.c | 472 +++++++++++++++++++++++++++ 3 files changed, 790 insertions(+) create mode 100644 arch/x86/crypto/twofish-x86_64-asm_64-3way.S create mode 100644 arch/x86/crypto/twofish_glue_3way.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 725addf..3537d4b 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o +obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -23,6 +24,7 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o +twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S new file mode 100644 index 0000000..5b012a2 --- /dev/null +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -0,0 +1,316 @@ +/* + * Twofish Cipher 3-way parallel algorithm (x86_64) + * + * Copyright (C) 2011 Jussi Kivilinna + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "twofish-x86_64-asm-3way.S" +.text + +/* structure of crypto context */ +#define s0 0 +#define s1 1024 +#define s2 2048 +#define s3 3072 +#define w 4096 +#define k 4128 + +/********************************************************************** + 3-way twofish + **********************************************************************/ +#define CTX %rdi +#define RIO %rdx + +#define RAB0 %rax +#define RAB1 %rbx +#define RAB2 %rcx + +#define RAB0d %eax +#define RAB1d %ebx +#define RAB2d %ecx + +#define RAB0bh %ah +#define RAB1bh %bh +#define RAB2bh %ch + +#define RAB0bl %al +#define RAB1bl %bl +#define RAB2bl %cl + +#define RCD0 %r8 +#define RCD1 %r9 +#define RCD2 %r10 + +#define RCD0d %r8d +#define RCD1d %r9d +#define RCD2d %r10d + +#define RX0 %rbp +#define RX1 %r11 +#define RX2 %r12 + +#define RX0d %ebp +#define RX1d %r11d +#define RX2d %r12d + +#define RY0 %r13 +#define RY1 %r14 +#define RY2 %r15 + +#define RY0d %r13d +#define RY1d %r14d +#define RY2d %r15d + +#define RT0 %rdx +#define RT1 %rsi + +#define RT0d %edx +#define RT1d %esi + +#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \ + movzbl ab ## bl, tmp2 ## d; \ + movzbl ab ## bh, tmp1 ## d; \ + rorq $(rot), ab; \ + op1##l T0(CTX, tmp2, 4), dst ## d; \ + op2##l T1(CTX, tmp1, 4), dst ## d; + +/* + * Combined G1 & G2 function. Reordered with help of rotates to have moves + * at begining. + */ +#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ + /* G1,1 && G2,1 */ \ + do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \ + do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \ + \ + do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \ + do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \ + \ + do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \ + do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \ + \ + /* G1,2 && G2,2 */ \ + do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ + do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ + xchgq cd ## 0, ab ## 0; \ + \ + do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ + do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ + xchgq cd ## 1, ab ## 1; \ + \ + do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ + do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ + xchgq cd ## 2, ab ## 2; + +#define enc_round_end(ab, x, y, n) \ + addl y ## d, x ## d; \ + addl x ## d, y ## d; \ + addl k+4*(2*(n))(CTX), x ## d; \ + xorl ab ## d, x ## d; \ + addl k+4*(2*(n)+1)(CTX), y ## d; \ + shrq $32, ab; \ + roll $1, ab ## d; \ + xorl y ## d, ab ## d; \ + shlq $32, ab; \ + rorl $1, x ## d; \ + orq x, ab; + +#define dec_round_end(ba, x, y, n) \ + addl y ## d, x ## d; \ + addl x ## d, y ## d; \ + addl k+4*(2*(n))(CTX), x ## d; \ + addl k+4*(2*(n)+1)(CTX), y ## d; \ + xorl ba ## d, y ## d; \ + shrq $32, ba; \ + roll $1, ba ## d; \ + xorl x ## d, ba ## d; \ + shlq $32, ba; \ + rorl $1, y ## d; \ + orq y, ba; + +#define encrypt_round3(ab, cd, n) \ + g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \ + \ + enc_round_end(ab ## 0, RX0, RY0, n); \ + enc_round_end(ab ## 1, RX1, RY1, n); \ + enc_round_end(ab ## 2, RX2, RY2, n); + +#define decrypt_round3(ba, dc, n) \ + g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \ + \ + dec_round_end(ba ## 0, RX0, RY0, n); \ + dec_round_end(ba ## 1, RX1, RY1, n); \ + dec_round_end(ba ## 2, RX2, RY2, n); + +#define encrypt_cycle3(ab, cd, n) \ + encrypt_round3(ab, cd, n*2); \ + encrypt_round3(ab, cd, (n*2)+1); + +#define decrypt_cycle3(ba, dc, n) \ + decrypt_round3(ba, dc, (n*2)+1); \ + decrypt_round3(ba, dc, (n*2)); + +#define inpack3(in, n, xy, m) \ + movq 4*(n)(in), xy ## 0; \ + xorq w+4*m(CTX), xy ## 0; \ + \ + movq 4*(4+(n))(in), xy ## 1; \ + xorq w+4*m(CTX), xy ## 1; \ + \ + movq 4*(8+(n))(in), xy ## 2; \ + xorq w+4*m(CTX), xy ## 2; + +#define outunpack3(op, out, n, xy, m) \ + xorq w+4*m(CTX), xy ## 0; \ + op ## q xy ## 0, 4*(n)(out); \ + \ + xorq w+4*m(CTX), xy ## 1; \ + op ## q xy ## 1, 4*(4+(n))(out); \ + \ + xorq w+4*m(CTX), xy ## 2; \ + op ## q xy ## 2, 4*(8+(n))(out); + +#define inpack_enc3() \ + inpack3(RIO, 0, RAB, 0); \ + inpack3(RIO, 2, RCD, 2); + +#define outunpack_enc3(op) \ + outunpack3(op, RIO, 2, RAB, 6); \ + outunpack3(op, RIO, 0, RCD, 4); + +#define inpack_dec3() \ + inpack3(RIO, 0, RAB, 4); \ + rorq $32, RAB0; \ + rorq $32, RAB1; \ + rorq $32, RAB2; \ + inpack3(RIO, 2, RCD, 6); \ + rorq $32, RCD0; \ + rorq $32, RCD1; \ + rorq $32, RCD2; + +#define outunpack_dec3() \ + rorq $32, RCD0; \ + rorq $32, RCD1; \ + rorq $32, RCD2; \ + outunpack3(mov, RIO, 0, RCD, 0); \ + rorq $32, RAB0; \ + rorq $32, RAB1; \ + rorq $32, RAB2; \ + outunpack3(mov, RIO, 2, RAB, 2); + +.align 8 +.global __twofish_enc_blk_3way +.type __twofish_enc_blk_3way,@function; + +__twofish_enc_blk_3way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src, RIO + * %rcx: bool, if true: xor output + */ + pushq %r15; + pushq %r14; + pushq %r13; + pushq %r12; + pushq %rbp; + pushq %rbx; + + pushq %rcx; /* bool xor */ + pushq %rsi; /* dst */ + + inpack_enc3(); + + encrypt_cycle3(RAB, RCD, 0); + encrypt_cycle3(RAB, RCD, 1); + encrypt_cycle3(RAB, RCD, 2); + encrypt_cycle3(RAB, RCD, 3); + encrypt_cycle3(RAB, RCD, 4); + encrypt_cycle3(RAB, RCD, 5); + encrypt_cycle3(RAB, RCD, 6); + encrypt_cycle3(RAB, RCD, 7); + + popq RIO; /* dst */ + popq %rbp; /* bool xor */ + + testb %bpl, %bpl; + jnz __enc_xor3; + + outunpack_enc3(mov); + + popq %rbx; + popq %rbp; + popq %r12; + popq %r13; + popq %r14; + popq %r15; + ret; + +__enc_xor3: + outunpack_enc3(xor); + + popq %rbx; + popq %rbp; + popq %r12; + popq %r13; + popq %r14; + popq %r15; + ret; + +.global twofish_dec_blk_3way +.type twofish_dec_blk_3way,@function; + +twofish_dec_blk_3way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src, RIO + */ + pushq %r15; + pushq %r14; + pushq %r13; + pushq %r12; + pushq %rbp; + pushq %rbx; + + pushq %rsi; /* dst */ + + inpack_dec3(); + + decrypt_cycle3(RAB, RCD, 7); + decrypt_cycle3(RAB, RCD, 6); + decrypt_cycle3(RAB, RCD, 5); + decrypt_cycle3(RAB, RCD, 4); + decrypt_cycle3(RAB, RCD, 3); + decrypt_cycle3(RAB, RCD, 2); + decrypt_cycle3(RAB, RCD, 1); + decrypt_cycle3(RAB, RCD, 0); + + popq RIO; /* dst */ + + outunpack_dec3(); + + popq %rbx; + popq %rbp; + popq %r12; + popq %r13; + popq %r14; + popq %r15; + ret; + diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c new file mode 100644 index 0000000..0cbf9fa --- /dev/null +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -0,0 +1,472 @@ +/* + * Glue Code for 3-way parallel assembler optimized version of Twofish + * + * Copyright (c) 2011 Jussi Kivilinna + * + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 2007 - Joy Latten + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include +#include + +/* regular block cipher functions from twofish_x86_64 module */ +asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +/* 3-way parallel cipher functions */ +asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + __twofish_enc_blk_3way(ctx, dst, src, false); +} + +static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + __twofish_enc_blk_3way(ctx, dst, src, true); +} + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + void (*fn)(struct twofish_ctx *, u8 *, const u8 *), + void (*fn_3way)(struct twofish_ctx *, u8 *, const u8 *)) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + /* Process three block batch */ + if (nbytes >= bsize * 3) { + do { + fn_3way(ctx, wdst, wsrc); + + wsrc += bsize * 3; + wdst += bsize * 3; + nbytes -= bsize * 3; + } while (nbytes >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + fn(ctx, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, twofish_enc_blk, twofish_enc_blk_3way); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way); +} + +static struct crypto_alg blk_ecb_alg = { + .cra_name = "ecb(twofish)", + .cra_driver_name = "ecb-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .setkey = twofish_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}; + +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 *iv = (u128 *)walk->iv; + + do { + u128_xor(dst, src, iv); + twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ivs[3 - 1]; + u128 last_iv; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process three block batch */ + if (nbytes >= bsize * 3) { + do { + nbytes -= bsize * (3 - 1); + src -= 3 - 1; + dst -= 3 - 1; + + ivs[0] = src[0]; + ivs[1] = src[1]; + + twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); + + u128_xor(dst + 1, dst + 1, ivs + 0); + u128_xor(dst + 2, dst + 2, ivs + 1); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + for (;;) { + twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } + +done: + u128_xor(dst, dst, (u128 *)walk->iv); + *(u128 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_decrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static struct crypto_alg blk_cbc_alg = { + .cra_name = "cbc(twofish)", + .cra_driver_name = "cbc-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = twofish_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}; + +static inline void u128_to_be128(be128 *dst, const u128 *src) +{ + dst->a = cpu_to_be64(src->a); + dst->b = cpu_to_be64(src->b); +} + +static inline void be128_to_u128(u128 *dst, const be128 *src) +{ + dst->a = be64_to_cpu(src->a); + dst->b = be64_to_cpu(src->b); +} + +static inline void u128_inc(u128 *i) +{ + i->b++; + if (!i->b) + i->a++; +} + +static void ctr_crypt_final(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 *ctrblk = walk->iv; + u8 keystream[TF_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + twofish_enc_blk(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + + crypto_inc(ctrblk, TF_BLOCK_SIZE); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ctrblk; + be128 ctrblocks[3]; + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + + /* Process three block batch */ + if (nbytes >= bsize * 3) { + do { + if (dst != src) { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + } + + /* create ctrblks for parallel encrypt */ + u128_to_be128(&ctrblocks[0], &ctrblk); + u128_inc(&ctrblk); + u128_to_be128(&ctrblocks[1], &ctrblk); + u128_inc(&ctrblk); + u128_to_be128(&ctrblocks[2], &ctrblk); + u128_inc(&ctrblk); + + twofish_enc_blk_xor_3way(ctx, (u8 *)dst, + (u8 *)ctrblocks); + + src += 3; + dst += 3; + nbytes -= bsize * 3; + } while (nbytes >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (dst != src) + *dst = *src; + + u128_to_be128(&ctrblocks[0], &ctrblk); + u128_inc(&ctrblk); + + twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); + u128_xor(dst, dst, (u128 *)ctrblocks); + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + u128_to_be128((be128 *)walk->iv, &ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE); + + while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) { + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + if (walk.nbytes) { + ctr_crypt_final(desc, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +static struct crypto_alg blk_ctr_alg = { + .cra_name = "ctr(twofish)", + .cra_driver_name = "ctr-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = twofish_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}; + +int __init init(void) +{ + int err; + + err = crypto_register_alg(&blk_ecb_alg); + if (err) + goto ecb_err; + err = crypto_register_alg(&blk_cbc_alg); + if (err) + goto cbc_err; + err = crypto_register_alg(&blk_ctr_alg); + if (err) + goto ctr_err; + + return 0; + +ctr_err: + crypto_unregister_alg(&blk_cbc_alg); +cbc_err: + crypto_unregister_alg(&blk_ecb_alg); +ecb_err: + return err; +} + +void __exit fini(void) +{ + crypto_unregister_alg(&blk_ctr_alg); + crypto_unregister_alg(&blk_cbc_alg); + crypto_unregister_alg(&blk_ecb_alg); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized"); +MODULE_ALIAS("twofish"); +MODULE_ALIAS("twofish-asm"); -- cgit v1.1 From a516ebafdf231702b7461cab562c7fe328ef972b Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 10 Oct 2011 12:32:15 +0300 Subject: crypto: blowfish-x86_64 - fix ctr blocksize to 1 Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 2568a7b..b05aa16 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -428,7 +428,7 @@ static struct crypto_alg blk_ctr_alg = { .cra_driver_name = "ctr-blowfish-asm", .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = BF_BLOCK_SIZE, + .cra_blocksize = 1, .cra_ctxsize = sizeof(struct bf_ctx), .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, -- cgit v1.1 From 906b2c9f2d9f395f5ca01b855b7c74b126517816 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 10 Oct 2011 12:33:02 +0300 Subject: crypto: twofish-x86_64-3way - fix ctr blocksize to 1 Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_glue_3way.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 0cbf9fa..5ede9c4 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -414,7 +414,7 @@ static struct crypto_alg blk_ctr_alg = { .cra_driver_name = "ctr-twofish-3way", .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, + .cra_blocksize = 1, .cra_ctxsize = sizeof(struct twofish_ctx), .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, -- cgit v1.1 From a1b60c1cd913c5ccfb38c717ba0bd22622425fa7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 6 Sep 2011 18:46:34 +0200 Subject: iommu/core: Convert iommu_found to iommu_present With per-bus iommu_ops the iommu_found function needs to work on a bus_type too. This patch adds a bus_type parameter to that function and converts all call-places. The function is also renamed to iommu_present because the function now checks if an iommu is present for a given bus and does not check for a global iommu anymore. Signed-off-by: Joerg Roedel --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84a28ea..73c6a42 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -2095,7 +2096,7 @@ int kvm_dev_ioctl_check_extension(long ext) r = 0; break; case KVM_CAP_IOMMU: - r = iommu_found(); + r = iommu_present(&pci_bus_type); break; case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; -- cgit v1.1 From 8548c84da2f47e71bbbe300f55edb768492575f7 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 23 Oct 2011 23:19:12 +0200 Subject: x86: Fix S4 regression Commit 4b239f458 ("x86-64, mm: Put early page table high") causes a S4 regression since 2.6.39, namely the machine reboots occasionally at S4 resume. It doesn't happen always, overall rate is about 1/20. But, like other bugs, once when this happens, it continues to happen. This patch fixes the problem by essentially reverting the memory assignment in the older way. Signed-off-by: Takashi Iwai Cc: Cc: Rafael J. Wysocki Cc: Yinghai Lu [ We'll hopefully find the real fix, but that's too late for 3.1 now ] Signed-off-by: Linus Torvalds --- arch/x86/mm/init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3032644..87488b9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -63,9 +63,8 @@ static void __init find_early_table_space(unsigned long end, int use_pse, #ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); - - good_end = max_pfn_mapped << PAGE_SHIFT; #endif + good_end = max_pfn_mapped << PAGE_SHIFT; base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); if (base == MEMBLOCK_ERROR) -- cgit v1.1 From 42c2544b2d7d4f287bfa239455af09b6408476e2 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 7 Sep 2011 16:06:51 +0300 Subject: x86, mrst: Some drivers need to known when an SCU is available Add a notifier so that drivers can hook into SCU availability in order to take actions post initialisation when/if the SCU becomes available. In the ideal world we wouldn't need this and we could avoid any init dependancies of this form, but in practice we can't do it for some cases. Signed-off-by: Alan Cox Signed-off-by: Samuel Ortiz --- arch/x86/include/asm/intel_scu_ipc.h | 22 ++++++++++++++++++++++ arch/x86/platform/mrst/mrst.c | 7 +++++++ 2 files changed, 29 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 29f6679..4420993 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_INTEL_SCU_IPC_H_ #define _ASM_X86_INTEL_SCU_IPC_H_ +#include + #define IPCMSG_VRTC 0xFA /* Set vRTC device */ /* Command id associated with message IPCMSG_VRTC */ @@ -44,4 +46,24 @@ int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data); /* Update FW version */ int intel_scu_ipc_fw_update(u8 *buffer, u32 length); +extern struct blocking_notifier_head intel_scu_notifier; + +static inline void intel_scu_notifier_add(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&intel_scu_notifier, nb); +} + +static inline void intel_scu_notifier_remove(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&intel_scu_notifier, nb); +} + +static inline int intel_scu_notifier_post(unsigned long v, void *p) +{ + return blocking_notifier_call_chain(&intel_scu_notifier, v, p); +} + +#define SCU_AVAILABLE 1 +#define SCU_DOWN 2 + #endif diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index fe73276..51ec016 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -555,6 +556,9 @@ static void __init intel_scu_i2c_device_register(int bus, i2c_devs[i2c_next_dev++] = new_dev; } +BLOCKING_NOTIFIER_HEAD(intel_scu_notifier); +EXPORT_SYMBOL_GPL(intel_scu_notifier); + /* Called by IPC driver */ void intel_scu_devices_create(void) { @@ -579,6 +583,7 @@ void intel_scu_devices_create(void) } else i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1); } + intel_scu_notifier_post(SCU_AVAILABLE, 0L); } EXPORT_SYMBOL_GPL(intel_scu_devices_create); @@ -587,6 +592,8 @@ void intel_scu_devices_destroy(void) { int i; + intel_scu_notifier_post(SCU_DOWN, 0L); + for (i = 0; i < ipc_next_dev; i++) platform_device_del(ipc_devs[i]); } -- cgit v1.1 From 360545c1fcdd458053ede5794c3255c44164cc4a Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 18 Oct 2011 12:41:22 +0300 Subject: x86, mrst: add platform support for MSIC MFD driver The MSIC MFD driver creates platform devices for MSIC device drivers so we don't need to create them in platform code anymore. This patch adds a new runtime check which determines whether we are running on a Medfield platform and enables the MSIC MFD driver accordingly. Signed-off-by: Mika Westerberg Signed-off-by: Samuel Ortiz --- arch/x86/platform/mrst/mrst.c | 169 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 154 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 51ec016..8a7cb32 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -481,6 +482,128 @@ static void __init *no_platform_data(void *info) return NULL; } +static struct resource msic_resources[] = { + { + .start = INTEL_MSIC_IRQ_PHYS_BASE, + .end = INTEL_MSIC_IRQ_PHYS_BASE + 64 - 1, + .flags = IORESOURCE_MEM, + }, +}; + +static struct intel_msic_platform_data msic_pdata; + +static struct platform_device msic_device = { + .name = "intel_msic", + .id = -1, + .dev = { + .platform_data = &msic_pdata, + }, + .num_resources = ARRAY_SIZE(msic_resources), + .resource = msic_resources, +}; + +static inline bool mrst_has_msic(void) +{ + return mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL; +} + +static int msic_scu_status_change(struct notifier_block *nb, + unsigned long code, void *data) +{ + if (code == SCU_DOWN) { + platform_device_unregister(&msic_device); + return 0; + } + + return platform_device_register(&msic_device); +} + +static int __init msic_init(void) +{ + static struct notifier_block msic_scu_notifier = { + .notifier_call = msic_scu_status_change, + }; + + /* + * We need to be sure that the SCU IPC is ready before MSIC device + * can be registered. + */ + if (mrst_has_msic()) + intel_scu_notifier_add(&msic_scu_notifier); + + return 0; +} +arch_initcall(msic_init); + +/* + * msic_generic_platform_data - sets generic platform data for the block + * @info: pointer to the SFI device table entry for this block + * @block: MSIC block + * + * Function sets IRQ number from the SFI table entry for given device to + * the MSIC platform data. + */ +static void *msic_generic_platform_data(void *info, enum intel_msic_block block) +{ + struct sfi_device_table_entry *entry = info; + + BUG_ON(block < 0 || block >= INTEL_MSIC_BLOCK_LAST); + msic_pdata.irq[block] = entry->irq; + + return no_platform_data(info); +} + +static void *msic_battery_platform_data(void *info) +{ + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_BATTERY); +} + +static void *msic_gpio_platform_data(void *info) +{ + static struct intel_msic_gpio_pdata pdata; + int gpio = get_gpio_by_name("msic_gpio_base"); + + if (gpio < 0) + return NULL; + + pdata.gpio_base = gpio; + msic_pdata.gpio = &pdata; + + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_GPIO); +} + +static void *msic_audio_platform_data(void *info) +{ + struct platform_device *pdev; + + pdev = platform_device_register_simple("sst-platform", -1, NULL, 0); + if (IS_ERR(pdev)) { + pr_err("failed to create audio platform device\n"); + return NULL; + } + + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_AUDIO); +} + +static void *msic_power_btn_platform_data(void *info) +{ + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_POWER_BTN); +} + +static void *msic_ocd_platform_data(void *info) +{ + static struct intel_msic_ocd_pdata pdata; + int gpio = get_gpio_by_name("ocd_gpio"); + + if (gpio < 0) + return NULL; + + pdata.gpio = gpio; + msic_pdata.ocd = &pdata; + + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD); +} + static const struct devs_id __initconst device_ids[] = { {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data}, @@ -489,7 +612,14 @@ static const struct devs_id __initconst device_ids[] = { {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data}, {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data}, {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, - {"msic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, + + /* MSIC subdevices */ + {"msic_battery", SFI_DEV_TYPE_IPC, 1, &msic_battery_platform_data}, + {"msic_gpio", SFI_DEV_TYPE_IPC, 1, &msic_gpio_platform_data}, + {"msic_audio", SFI_DEV_TYPE_IPC, 1, &msic_audio_platform_data}, + {"msic_power_btn", SFI_DEV_TYPE_IPC, 1, &msic_power_btn_platform_data}, + {"msic_ocd", SFI_DEV_TYPE_IPC, 1, &msic_ocd_platform_data}, + {}, }; @@ -610,19 +740,37 @@ static void __init install_irq_resource(struct platform_device *pdev, int irq) platform_device_add_resources(pdev, &res, 1); } -static void __init sfi_handle_ipc_dev(struct platform_device *pdev) +static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry) { const struct devs_id *dev = device_ids; + struct platform_device *pdev; void *pdata = NULL; while (dev->name[0]) { if (dev->type == SFI_DEV_TYPE_IPC && - !strncmp(dev->name, pdev->name, SFI_NAME_LEN)) { - pdata = dev->get_platform_data(pdev); + !strncmp(dev->name, entry->name, SFI_NAME_LEN)) { + pdata = dev->get_platform_data(entry); break; } dev++; } + + /* + * On Medfield the platform device creation is handled by the MSIC + * MFD driver so we don't need to do it here. + */ + if (mrst_has_msic()) + return; + + /* ID as IRQ is a hack that will go away */ + pdev = platform_device_alloc(entry->name, entry->irq); + if (pdev == NULL) { + pr_err("out of memory for SFI platform device '%s'.\n", + entry->name); + return; + } + install_irq_resource(pdev, entry->irq); + pdev->dev.platform_data = pdata; intel_scu_device_register(pdev); } @@ -675,7 +823,6 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) struct sfi_device_table_entry *pentry; struct spi_board_info spi_info; struct i2c_board_info i2c_info; - struct platform_device *pdev; int num, i, bus; int ioapic; struct io_apic_irq_attr irq_attr; @@ -703,17 +850,9 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) switch (pentry->type) { case SFI_DEV_TYPE_IPC: - /* ID as IRQ is a hack that will go away */ - pdev = platform_device_alloc(pentry->name, irq); - if (pdev == NULL) { - pr_err("out of memory for SFI platform device '%s'.\n", - pentry->name); - continue; - } - install_irq_resource(pdev, irq); pr_debug("info[%2d]: IPC bus, name = %16.16s, " - "irq = 0x%2x\n", i, pentry->name, irq); - sfi_handle_ipc_dev(pdev); + "irq = 0x%2x\n", i, pentry->name, pentry->irq); + sfi_handle_ipc_dev(pentry); break; case SFI_DEV_TYPE_SPI: memset(&spi_info, 0, sizeof(spi_info)); -- cgit v1.1 From 315eb8a2a1b7f335d40ceeeb11b9e067475eb881 Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Oct 2011 10:15:51 -0700 Subject: x86: Fix compilation bug in kprobes' twobyte_is_boostable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling an i386_defconfig kernel with gcc-4.6.1-9.fc15.i686, I noticed a warning about the asm operand for test_bit in kprobes' can_boost. I discovered that this caused only the first long of twobyte_is_boostable[] to be output. Jakub filed and fixed gcc PR50571 to correct the warning and this output issue. But to solve it for less current gcc, we can make kprobes' twobyte_is_boostable[] non-const, and it won't be optimized out. Before: CC arch/x86/kernel/kprobes.o In file included from include/linux/bitops.h:22:0, from include/linux/kernel.h:17, from [...]/arch/x86/include/asm/percpu.h:44, from [...]/arch/x86/include/asm/current.h:5, from [...]/arch/x86/include/asm/processor.h:15, from [...]/arch/x86/include/asm/atomic.h:6, from include/linux/atomic.h:4, from include/linux/mutex.h:18, from include/linux/notifier.h:13, from include/linux/kprobes.h:34, from arch/x86/kernel/kprobes.c:43: [...]/arch/x86/include/asm/bitops.h: In function ‘can_boost.part.1’: [...]/arch/x86/include/asm/bitops.h:319:2: warning: use of memory input without lvalue in asm operand 1 is deprecated [enabled by default] $ objdump -rd arch/x86/kernel/kprobes.o | grep -A1 -w bt 551: 0f a3 05 00 00 00 00 bt %eax,0x0 554: R_386_32 .rodata.cst4 $ objdump -s -j .rodata.cst4 -j .data arch/x86/kernel/kprobes.o arch/x86/kernel/kprobes.o: file format elf32-i386 Contents of section .data: 0000 48000000 00000000 00000000 00000000 H............... Contents of section .rodata.cst4: 0000 4c030000 L... Only a single long of twobyte_is_boostable[] is in the object file. After, without the const on twobyte_is_boostable: $ objdump -rd arch/x86/kernel/kprobes.o | grep -A1 -w bt 551: 0f a3 05 20 00 00 00 bt %eax,0x20 554: R_386_32 .data $ objdump -s -j .rodata.cst4 -j .data arch/x86/kernel/kprobes.o arch/x86/kernel/kprobes.o: file format elf32-i386 Contents of section .data: 0000 48000000 00000000 00000000 00000000 H............... 0010 00000000 00000000 00000000 00000000 ................ 0020 4c030000 0f000200 ffff0000 ffcff0c0 L............... 0030 0000ffff 3bbbfff8 03ff2ebb 26bb2e77 ....;.......&..w Now all 32 bytes are output into .data instead. Signed-off-by: Josh Stone Cc: Masami Hiramatsu Cc: Jakub Jelinek Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/kprobes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index f1a6244d..794bc95 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -75,8 +75,10 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); /* * Undefined/reserved opcodes, conditional jump, Opcode Extension * Groups, and some special opcodes can not boost. + * This is non-const to keep gcc from statically optimizing it out, as + * variable_test_bit makes gcc think only *(unsigned long*) is used. */ -static const u32 twobyte_is_boostable[256 / 32] = { +static u32 twobyte_is_boostable[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ -- cgit v1.1 From b7e3155818830a358be2b736e1087217c43db9eb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 29 Sep 2011 11:11:09 -0700 Subject: x86/jump_label: drop arch_jump_label_text_poke_early() It is no longer used. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jason Baron Acked-by: Peter Zijlstra --- arch/x86/kernel/jump_label.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 3fee346..2ad0298 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -42,10 +42,4 @@ void arch_jump_label_transform(struct jump_entry *entry, put_online_cpus(); } -void arch_jump_label_text_poke_early(jump_label_t addr) -{ - text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5], - JUMP_LABEL_NOP_SIZE); -} - #endif -- cgit v1.1 From e71a5be15e47a73a2964712967fe93ee8ccf551b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 29 Sep 2011 11:11:09 -0700 Subject: x86/jump_label: add arch_jump_label_transform_static() This allows jump-label entries to be cheaply updated on code which is not yet live. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jason Baron Acked-by: Peter Zijlstra --- arch/x86/kernel/jump_label.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 2ad0298..ea9d5f2f 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -24,8 +24,9 @@ union jump_code_union { } __attribute__((packed)); }; -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +static void __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + void *(*poker)(void *, const void *, size_t)) { union jump_code_union code; @@ -35,11 +36,24 @@ void arch_jump_label_transform(struct jump_entry *entry, (entry->code + JUMP_LABEL_NOP_SIZE); } else memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); + + (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); +} + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ get_online_cpus(); mutex_lock(&text_mutex); - text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + __jump_label_transform(entry, type, text_poke_smp); mutex_unlock(&text_mutex); put_online_cpus(); } +void arch_jump_label_transform_static(struct jump_entry *entry, + enum jump_label_type type) +{ + __jump_label_transform(entry, type, text_poke_early); +} + #endif -- cgit v1.1 From 89cfc99177c9270c5c6d429f6c5177ab3428ad57 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 27 Oct 2011 10:56:17 +1030 Subject: lguest: don't allow KVM-detection cpuid. Host might be running under KVM, but we shouldn't allow Guest to think it can use KVM hypercalls (it can't, and it will embarrass itself if it tries). Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 13ee258..f63da5e 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -70,6 +70,7 @@ #include #include #include /* for struct machine_ops */ +#include /*G:010 * Welcome to the Guest! @@ -455,6 +456,15 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, *ax &= 0xFFFFF0FF; *ax |= 0x00000500; break; + + /* + * This is used to detect if we're running under KVM. We might be, + * but that's a Host matter, not us. So say we're not. + */ + case KVM_CPUID_SIGNATURE: + *bx = *cx = *dx = 0; + break; + /* * 0x80000000 returns the highest Extended Function, so we futureproof * like we do above by limiting it to known fields. -- cgit v1.1 From 1448c721e4fa2faf742029a0403b4b787fccb7fa Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 17 Oct 2011 13:40:02 -0700 Subject: compat: sync compat_stats with statfs. This was found by inspection while tracking a similar bug in compat_statfs64, that has been fixed in mainline since decemeber. - This fixes a bug where not all of the f_spare fields were cleared on mips and s390. - Add the f_flags field to struct compat_statfs - Copy f_flags to userspace in case someone cares. - Use __clear_user to copy the f_spare field to userspace to ensure that all of the elements of f_spare are cleared. On some architectures f_spare is has 5 ints and on some architectures f_spare only has 4 ints. Which makes the previous technique of clearing each int individually broken. I don't expect anyone actually uses the old statfs system call anymore but if they do let them benefit from having the compat and the native version working the same. Signed-off-by: Eric W. Biederman Signed-off-by: Christoph Hellwig --- arch/x86/include/asm/compat.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 1d9cd27..30d737e 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -108,7 +108,8 @@ struct compat_statfs { compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; #define COMPAT_RLIM_OLD_INFINITY 0x7fffffff -- cgit v1.1 From f1c1da2bde712812a3e0f9a7a7ebe7a916a4b5f4 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 18 Oct 2011 18:23:11 +0200 Subject: KVM: SVM: Keep intercepting task switching with NPT enabled AMD processors apparently have a bug in the hardware task switching support when NPT is enabled. If the task switch triggers a NPF, we can get wrong EXITINTINFO along with that fault. On resume, spurious exceptions may then be injected into the guest. We were able to reproduce this bug when our guest triggered #SS and the handler were supposed to run over a separate task with not yet touched stack pages. Work around the issue by continuing to emulate task switches even in NPT mode. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e7ed4b1..e32243e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1084,7 +1084,6 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl = 1; - clr_intercept(svm, INTERCEPT_TASK_SWITCH); clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); -- cgit v1.1 From f0cb54524366654e72c87e0a1f87c0b3ff36deb3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 18 Jul 2011 11:24:45 -0300 Subject: x86, MCE: Use notifier chain only for MCE decoding Drop the edac_mce custom hook in favor of the generic notifier mechanism. Also, do not log the error to mcelog if the notified agent was able to decode it. Signed-off-by: Borislav Petkov Acked-by: Ingo Molnar Signed-off-by: Mauro Carvalho Chehab --- arch/x86/kernel/cpu/mcheck/mce.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 08363b0..019786a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -144,23 +144,20 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + int ret = 0; /* Emit the trace record: */ trace_mce_record(mce); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + if (ret == NOTIFY_STOP) + return; + mce->finished = 0; wmb(); for (;;) { entry = rcu_dereference_check_mce(mcelog.next); for (;;) { - /* - * If edac_mce is enabled, it will check the error type - * and will process it, if it is a known error. - * Otherwise, the error will be sent through mcelog - * interface - */ - if (edac_mce_parse(mce)) - return; /* * When the buffer fills up discard new entries. @@ -551,10 +548,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { + if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) mce_log(&m); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); - } /* * Clear state for this bank. -- cgit v1.1 From 4f4d7a9ba0b7cdbc518cc04930323b15411eee14 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Mon, 24 Oct 2011 13:42:34 +0200 Subject: x86: drop unused Kconfig symbol Signed-off-by: Paul Bolle Acked-by: David Rientjes Signed-off-by: Michal Marek --- arch/x86/Kconfig | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a47bb2..18f74ed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -192,9 +192,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y -config HAVE_CPUMASK_OF_CPU_MAP - def_bool X86_64_SMP - config ARCH_HIBERNATION_POSSIBLE def_bool y -- cgit v1.1 From 2957402297ed3a59a82aab3aa162e8c8e5ebf5e4 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 26 May 2011 12:33:18 -0400 Subject: x86: fix implicit include of in vsyscall_64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In removing the presence of from some of the more common files, this implict include of was uncovered. CC arch/x86/kernel/vsyscall_64.o arch/x86/kernel/vsyscall_64.c: In function ‘vsyscall_set_cpu’: arch/x86/kernel/vsyscall_64.c:259: error: implicit declaration of function ‘cpu_to_node’ Explicitly call it out so the cleanup can take place. Signed-off-by: Paul Gortmaker --- arch/x86/kernel/vsyscall_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index b56c65de..e4d4a22 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include -- cgit v1.1 From 69c60c88eeb364ebf58432f9bc38033522d58767 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 26 May 2011 12:22:53 -0400 Subject: x86: Fix files explicitly requiring export.h for EXPORT_SYMBOL/THIS_MODULE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These files were implicitly getting EXPORT_SYMBOL via device.h which was including module.h, but that will be fixed up shortly. By fixing these now, we can avoid seeing things like: arch/x86/kernel/rtc.c:29: warning: type defaults to ‘int’ in declaration of ‘EXPORT_SYMBOL’ arch/x86/kernel/pci-dma.c:20: warning: type defaults to ‘int’ in declaration of ‘EXPORT_SYMBOL’ arch/x86/kernel/e820.c:69: warning: type defaults to ‘int’ in declaration of ‘EXPORT_SYMBOL_GPL’ [ with input from Randy Dunlap and also from Stephen Rothwell ] Signed-off-by: Paul Gortmaker --- arch/x86/kernel/cpu/amd.c | 1 + arch/x86/kernel/cpu/mcheck/mce-apei.c | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 1 + arch/x86/kernel/cpu/mcheck/therm_throt.c | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 1 + arch/x86/kernel/devicetree.c | 1 + arch/x86/kernel/e820.c | 1 + arch/x86/kernel/hpet.c | 1 + arch/x86/kernel/irq.c | 1 + arch/x86/kernel/nmi.c | 1 + arch/x86/kernel/pci-dma.c | 1 + arch/x86/kernel/probe_roms.c | 4 ++-- arch/x86/kernel/rtc.c | 1 + arch/x86/kernel/smp.c | 1 + arch/x86/kernel/tboot.c | 1 + arch/x86/kernel/time.c | 1 + arch/x86/kernel/topology.c | 1 + arch/x86/pci/i386.c | 1 + arch/x86/pci/legacy.c | 1 + arch/x86/platform/efi/efi.c | 1 + arch/x86/platform/mrst/vrtc.c | 1 + arch/x86/platform/olpc/olpc-xo1-pm.c | 1 + arch/x86/platform/uv/bios_uv.c | 1 + arch/x86/power/cpu.c | 1 + 24 files changed, 25 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 46ae4f6..c7e46cb 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1,3 +1,4 @@ +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 83930de..507ea58 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -28,6 +28,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7b5063a..537c89e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 27c6251..787e06c 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index e09ca20..2be5ebe 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index a621f34..5282179 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -2,6 +2,7 @@ * Architecture specific OF callbacks. */ #include +#include #include #include #include diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 3e2ef84..303a0e4 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 4aecc54..b946a9e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 6c0802e..429e0c9 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 7ec5bd1..b9c8628 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -17,6 +17,7 @@ #include #include #include +#include #include diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 6228720..80dc793 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 6322803..34e06e8 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -10,9 +10,9 @@ #include #include #include -#include - +#include +#include #include #include #include diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index ccdbc16..348ce01 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 013e7eb..16204dc 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e07a2fc..e2410e2 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 5a64d05..dd5fbf4 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 8927486..76ee977 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -26,6 +26,7 @@ * Send feedback to */ #include +#include #include #include #include diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 494f2e7e..794b092 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -26,6 +26,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index c89266b..2c2aeab 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -2,6 +2,7 @@ * legacy.c - traditional, old school PCI bus probing */ #include +#include #include #include diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 3ae4128..37718f0 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index 6d5dbcd..a8ac6f1 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index 6f3855a..0ce8616c 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 8bc57ba..7666121 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -20,6 +20,7 @@ */ #include +#include #include #include #include diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 87bb35e..f10c0af 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -9,6 +9,7 @@ */ #include +#include #include #include -- cgit v1.1 From 7c52d55170ce84ddf9c0ad4e020ef1d7a97975a7 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 27 May 2011 12:33:10 -0400 Subject: x86: fix up files really needing to include module.h These files aren't just exporting symbols -- they are also defining a MODULE_LICENSE etc. so give them the full module.h file. Signed-off-by: Paul Gortmaker --- arch/x86/crypto/aes_glue.c | 1 + arch/x86/crypto/aesni-intel_glue.c | 1 + arch/x86/video/fbdev.c | 1 + 3 files changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index 49ae9fe..bdce3ee 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -3,6 +3,7 @@ * */ +#include #include asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index feee8ff..545d0ce 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c index 6952768..c5ffb6a 100644 --- a/arch/x86/video/fbdev.c +++ b/arch/x86/video/fbdev.c @@ -8,6 +8,7 @@ */ #include #include +#include int fb_is_primary_device(struct fb_info *info) { -- cgit v1.1 From 783ac47c2aad07ff68f9432968d2f0829ef81543 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 27 May 2011 14:46:07 -0400 Subject: x86: efi_32.c is implicitly getting asm/desc.h via module.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to clean up the chain of includes stumbling through module.h, and when we do that, we'll see: CC arch/x86/platform/efi/efi_32.o efi/efi_32.c: In function ‘efi_call_phys_prelog’: efi/efi_32.c:80: error: implicit declaration of function ‘get_cpu_gdt_table’ efi/efi_32.c:82: error: implicit declaration of function ‘load_gdt’ make[4]: *** [arch/x86/platform/efi/efi_32.o] Error 1 Include asm/desc.h so that there are no implicit include assumptions. Signed-off-by: Paul Gortmaker --- arch/x86/platform/efi/efi_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 5cab48e..e36bf71 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include -- cgit v1.1 From 39a0e33da0189c99ed3cea6945cda1bc9f4b7b83 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 21 Jul 2011 13:03:20 -0400 Subject: lguest: add export.h to lguest files for THIS_MODULE/EXPORT_SYMBOL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need this in advance of the module.h cleanup, or we'll get compile errors like this: CC drivers/lguest/lguest_device.o drivers/lguest/lguest_device.c: In function ‘lguest_devices_init’: drivers/lguest/lguest_device.c:490: error: ‘THIS_MODULE’ undeclared (first use in this function) Signed-off-by: Paul Gortmaker --- arch/x86/lguest/boot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f63da5e..cf4603b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include -- cgit v1.1 From fcf634098c00dd9cd247447368495f0b79be12d1 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Mon, 31 Oct 2011 17:06:39 -0700 Subject: Cross Memory Attach The basic idea behind cross memory attach is to allow MPI programs doing intra-node communication to do a single copy of the message rather than a double copy of the message via shared memory. The following patch attempts to achieve this by allowing a destination process, given an address and size from a source process, to copy memory directly from the source process into its own address space via a system call. There is also a symmetrical ability to copy from the current process's address space into a destination process's address space. - Use of /proc/pid/mem has been considered, but there are issues with using it: - Does not allow for specifying iovecs for both src and dest, assuming preadv or pwritev was implemented either the area read from or written to would need to be contiguous. - Currently mem_read allows only processes who are currently ptrace'ing the target and are still able to ptrace the target to read from the target. This check could possibly be moved to the open call, but its not clear exactly what race this restriction is stopping (reason appears to have been lost) - Having to send the fd of /proc/self/mem via SCM_RIGHTS on unix domain socket is a bit ugly from a userspace point of view, especially when you may have hundreds if not (eventually) thousands of processes that all need to do this with each other - Doesn't allow for some future use of the interface we would like to consider adding in the future (see below) - Interestingly reading from /proc/pid/mem currently actually involves two copies! (But this could be fixed pretty easily) As mentioned previously use of vmsplice instead was considered, but has problems. Since you need the reader and writer working co-operatively if the pipe is not drained then you block. Which requires some wrapping to do non blocking on the send side or polling on the receive. In all to all communication it requires ordering otherwise you can deadlock. And in the example of many MPI tasks writing to one MPI task vmsplice serialises the copying. There are some cases of MPI collectives where even a single copy interface does not get us the performance gain we could. For example in an MPI_Reduce rather than copy the data from the source we would like to instead use it directly in a mathops (say the reduce is doing a sum) as this would save us doing a copy. We don't need to keep a copy of the data from the source. I haven't implemented this, but I think this interface could in the future do all this through the use of the flags - eg could specify the math operation and type and the kernel rather than just copying the data would apply the specified operation between the source and destination and store it in the destination. Although we don't have a "second user" of the interface (though I've had some nibbles from people who may be interested in using it for intra process messaging which is not MPI). This interface is something which hardware vendors are already doing for their custom drivers to implement fast local communication. And so in addition to this being useful for OpenMPI it would mean the driver maintainers don't have to fix things up when the mm changes. There was some discussion about how much faster a true zero copy would go. Here's a link back to the email with some testing I did on that: http://marc.info/?l=linux-mm&m=130105930902915&w=2 There is a basic man page for the proposed interface here: http://ozlabs.org/~cyeoh/cma/process_vm_readv.txt This has been implemented for x86 and powerpc, other architecture should mainly (I think) just need to add syscall numbers for the process_vm_readv and process_vm_writev. There are 32 bit compatibility versions for 64-bit kernels. For arch maintainers there are some simple tests to be able to quickly verify that the syscalls are working correctly here: http://ozlabs.org/~cyeoh/cma/cma-test-20110718.tgz Signed-off-by: Chris Yeoh Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: David Howells Cc: James Morris Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 ++ arch/x86/include/asm/unistd_32.h | 4 +++- arch/x86/include/asm/unistd_64.h | 4 ++++ arch/x86/kernel/syscall_table_32.S | 2 ++ 4 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 54edb207..a6253ec 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -850,4 +850,6 @@ ia32_sys_call_table: .quad sys_syncfs .quad compat_sys_sendmmsg /* 345 */ .quad sys_setns + .quad compat_sys_process_vm_readv + .quad compat_sys_process_vm_writev ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 593485b3..599c77d 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -352,10 +352,12 @@ #define __NR_syncfs 344 #define __NR_sendmmsg 345 #define __NR_setns 346 +#define __NR_process_vm_readv 347 +#define __NR_process_vm_writev 348 #ifdef __KERNEL__ -#define NR_syscalls 347 +#define NR_syscalls 349 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 0a6ba33..0431f19 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -682,6 +682,10 @@ __SYSCALL(__NR_sendmmsg, sys_sendmmsg) __SYSCALL(__NR_setns, sys_setns) #define __NR_getcpu 309 __SYSCALL(__NR_getcpu, sys_getcpu) +#define __NR_process_vm_readv 310 +__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv) +#define __NR_process_vm_writev 311 +__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index bc19be3..9a0e312 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -346,3 +346,5 @@ ENTRY(sys_call_table) .long sys_syncfs .long sys_sendmmsg /* 345 */ .long sys_setns + .long sys_process_vm_readv + .long sys_process_vm_writev -- cgit v1.1 From 4140c54266365e4267a2dbc5765101bba3b42896 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 18 Jul 2011 11:24:46 -0300 Subject: i7core_edac: Drop the edac_mce facility Remove edac_mce pieces and use the normal MCE decoder notifier chain by retaining the same functionality with considerably less code. Signed-off-by: Borislav Petkov Signed-off-by: Mauro Carvalho Chehab --- arch/x86/kernel/cpu/mcheck/mce.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 019786a..63aad27 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include -- cgit v1.1 From 5c48b108ecbf6505d929e64d50dace13ac2bdf34 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:06:39 +0100 Subject: um: take arch/um/sys-x86 to arch/x86/um Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/Makefile.um | 61 ++++ arch/x86/um/Kconfig | 70 ++++ arch/x86/um/Makefile | 45 +++ arch/x86/um/asm/arch_hweight.h | 6 + arch/x86/um/asm/archparam.h | 20 ++ arch/x86/um/asm/checksum.h | 10 + arch/x86/um/asm/checksum_32.h | 201 ++++++++++++ arch/x86/um/asm/checksum_64.h | 144 +++++++++ arch/x86/um/asm/elf.h | 221 +++++++++++++ arch/x86/um/asm/module.h | 23 ++ arch/x86/um/asm/processor.h | 15 + arch/x86/um/asm/processor_32.h | 73 +++++ arch/x86/um/asm/processor_64.h | 51 +++ arch/x86/um/asm/ptrace.h | 5 + arch/x86/um/asm/ptrace_32.h | 51 +++ arch/x86/um/asm/ptrace_64.h | 72 +++++ arch/x86/um/asm/system.h | 133 ++++++++ arch/x86/um/asm/vm-flags.h | 25 ++ arch/x86/um/bug.c | 21 ++ arch/x86/um/bugs_32.c | 74 +++++ arch/x86/um/bugs_64.c | 15 + arch/x86/um/checksum_32.S | 458 ++++++++++++++++++++++++++ arch/x86/um/delay_32.c | 60 ++++ arch/x86/um/delay_64.c | 60 ++++ arch/x86/um/elfcore.c | 83 +++++ arch/x86/um/fault.c | 28 ++ arch/x86/um/ksyms.c | 13 + arch/x86/um/ldt.c | 502 +++++++++++++++++++++++++++++ arch/x86/um/mem_32.c | 62 ++++ arch/x86/um/mem_64.c | 26 ++ arch/x86/um/os-Linux/Makefile | 13 + arch/x86/um/os-Linux/mcontext.c | 31 ++ arch/x86/um/os-Linux/prctl.c | 12 + arch/x86/um/os-Linux/registers.c | 111 +++++++ arch/x86/um/os-Linux/task_size.c | 150 +++++++++ arch/x86/um/os-Linux/tls.c | 35 ++ arch/x86/um/ptrace_32.c | 273 ++++++++++++++++ arch/x86/um/ptrace_64.c | 271 ++++++++++++++++ arch/x86/um/ptrace_user.c | 21 ++ arch/x86/um/setjmp_32.S | 58 ++++ arch/x86/um/setjmp_64.S | 54 ++++ arch/x86/um/shared/sysdep/archsetjmp.h | 5 + arch/x86/um/shared/sysdep/archsetjmp_32.h | 22 ++ arch/x86/um/shared/sysdep/archsetjmp_64.h | 24 ++ arch/x86/um/shared/sysdep/faultinfo.h | 5 + arch/x86/um/shared/sysdep/faultinfo_32.h | 35 ++ arch/x86/um/shared/sysdep/faultinfo_64.h | 35 ++ arch/x86/um/shared/sysdep/host_ldt.h | 5 + arch/x86/um/shared/sysdep/host_ldt_32.h | 34 ++ arch/x86/um/shared/sysdep/host_ldt_64.h | 38 +++ arch/x86/um/shared/sysdep/kernel-offsets.h | 21 ++ arch/x86/um/shared/sysdep/mcontext.h | 31 ++ arch/x86/um/shared/sysdep/ptrace.h | 5 + arch/x86/um/shared/sysdep/ptrace_32.h | 117 +++++++ arch/x86/um/shared/sysdep/ptrace_64.h | 160 +++++++++ arch/x86/um/shared/sysdep/ptrace_user.h | 5 + arch/x86/um/shared/sysdep/ptrace_user_32.h | 26 ++ arch/x86/um/shared/sysdep/ptrace_user_64.h | 38 +++ arch/x86/um/shared/sysdep/skas_ptrace.h | 22 ++ arch/x86/um/shared/sysdep/stub.h | 14 + arch/x86/um/shared/sysdep/stub_32.h | 93 ++++++ arch/x86/um/shared/sysdep/stub_64.h | 99 ++++++ arch/x86/um/shared/sysdep/syscalls.h | 5 + arch/x86/um/shared/sysdep/syscalls_32.h | 20 ++ arch/x86/um/shared/sysdep/syscalls_64.h | 32 ++ arch/x86/um/shared/sysdep/tls.h | 5 + arch/x86/um/shared/sysdep/tls_32.h | 32 ++ arch/x86/um/shared/sysdep/tls_64.h | 29 ++ arch/x86/um/signal_32.c | 498 ++++++++++++++++++++++++++++ arch/x86/um/signal_64.c | 255 +++++++++++++++ arch/x86/um/stub_32.S | 51 +++ arch/x86/um/stub_64.S | 66 ++++ arch/x86/um/stub_segv.c | 19 ++ arch/x86/um/sys_call_table_32.S | 28 ++ arch/x86/um/sys_call_table_64.c | 64 ++++ arch/x86/um/syscalls_32.c | 66 ++++ arch/x86/um/syscalls_64.c | 102 ++++++ arch/x86/um/sysrq_32.c | 101 ++++++ arch/x86/um/sysrq_64.c | 41 +++ arch/x86/um/tls_32.c | 396 +++++++++++++++++++++++ arch/x86/um/tls_64.c | 17 + arch/x86/um/user-offsets.c | 79 +++++ arch/x86/um/vdso/Makefile | 90 ++++++ arch/x86/um/vdso/checkundef.sh | 10 + arch/x86/um/vdso/um_vdso.c | 71 ++++ arch/x86/um/vdso/vdso-layout.lds.S | 64 ++++ arch/x86/um/vdso/vdso-note.S | 12 + arch/x86/um/vdso/vdso.S | 10 + arch/x86/um/vdso/vdso.lds.S | 32 ++ arch/x86/um/vdso/vma.c | 74 +++++ 90 files changed, 6760 insertions(+) create mode 100644 arch/x86/Makefile.um create mode 100644 arch/x86/um/Kconfig create mode 100644 arch/x86/um/Makefile create mode 100644 arch/x86/um/asm/arch_hweight.h create mode 100644 arch/x86/um/asm/archparam.h create mode 100644 arch/x86/um/asm/checksum.h create mode 100644 arch/x86/um/asm/checksum_32.h create mode 100644 arch/x86/um/asm/checksum_64.h create mode 100644 arch/x86/um/asm/elf.h create mode 100644 arch/x86/um/asm/module.h create mode 100644 arch/x86/um/asm/processor.h create mode 100644 arch/x86/um/asm/processor_32.h create mode 100644 arch/x86/um/asm/processor_64.h create mode 100644 arch/x86/um/asm/ptrace.h create mode 100644 arch/x86/um/asm/ptrace_32.h create mode 100644 arch/x86/um/asm/ptrace_64.h create mode 100644 arch/x86/um/asm/system.h create mode 100644 arch/x86/um/asm/vm-flags.h create mode 100644 arch/x86/um/bug.c create mode 100644 arch/x86/um/bugs_32.c create mode 100644 arch/x86/um/bugs_64.c create mode 100644 arch/x86/um/checksum_32.S create mode 100644 arch/x86/um/delay_32.c create mode 100644 arch/x86/um/delay_64.c create mode 100644 arch/x86/um/elfcore.c create mode 100644 arch/x86/um/fault.c create mode 100644 arch/x86/um/ksyms.c create mode 100644 arch/x86/um/ldt.c create mode 100644 arch/x86/um/mem_32.c create mode 100644 arch/x86/um/mem_64.c create mode 100644 arch/x86/um/os-Linux/Makefile create mode 100644 arch/x86/um/os-Linux/mcontext.c create mode 100644 arch/x86/um/os-Linux/prctl.c create mode 100644 arch/x86/um/os-Linux/registers.c create mode 100644 arch/x86/um/os-Linux/task_size.c create mode 100644 arch/x86/um/os-Linux/tls.c create mode 100644 arch/x86/um/ptrace_32.c create mode 100644 arch/x86/um/ptrace_64.c create mode 100644 arch/x86/um/ptrace_user.c create mode 100644 arch/x86/um/setjmp_32.S create mode 100644 arch/x86/um/setjmp_64.S create mode 100644 arch/x86/um/shared/sysdep/archsetjmp.h create mode 100644 arch/x86/um/shared/sysdep/archsetjmp_32.h create mode 100644 arch/x86/um/shared/sysdep/archsetjmp_64.h create mode 100644 arch/x86/um/shared/sysdep/faultinfo.h create mode 100644 arch/x86/um/shared/sysdep/faultinfo_32.h create mode 100644 arch/x86/um/shared/sysdep/faultinfo_64.h create mode 100644 arch/x86/um/shared/sysdep/host_ldt.h create mode 100644 arch/x86/um/shared/sysdep/host_ldt_32.h create mode 100644 arch/x86/um/shared/sysdep/host_ldt_64.h create mode 100644 arch/x86/um/shared/sysdep/kernel-offsets.h create mode 100644 arch/x86/um/shared/sysdep/mcontext.h create mode 100644 arch/x86/um/shared/sysdep/ptrace.h create mode 100644 arch/x86/um/shared/sysdep/ptrace_32.h create mode 100644 arch/x86/um/shared/sysdep/ptrace_64.h create mode 100644 arch/x86/um/shared/sysdep/ptrace_user.h create mode 100644 arch/x86/um/shared/sysdep/ptrace_user_32.h create mode 100644 arch/x86/um/shared/sysdep/ptrace_user_64.h create mode 100644 arch/x86/um/shared/sysdep/skas_ptrace.h create mode 100644 arch/x86/um/shared/sysdep/stub.h create mode 100644 arch/x86/um/shared/sysdep/stub_32.h create mode 100644 arch/x86/um/shared/sysdep/stub_64.h create mode 100644 arch/x86/um/shared/sysdep/syscalls.h create mode 100644 arch/x86/um/shared/sysdep/syscalls_32.h create mode 100644 arch/x86/um/shared/sysdep/syscalls_64.h create mode 100644 arch/x86/um/shared/sysdep/tls.h create mode 100644 arch/x86/um/shared/sysdep/tls_32.h create mode 100644 arch/x86/um/shared/sysdep/tls_64.h create mode 100644 arch/x86/um/signal_32.c create mode 100644 arch/x86/um/signal_64.c create mode 100644 arch/x86/um/stub_32.S create mode 100644 arch/x86/um/stub_64.S create mode 100644 arch/x86/um/stub_segv.c create mode 100644 arch/x86/um/sys_call_table_32.S create mode 100644 arch/x86/um/sys_call_table_64.c create mode 100644 arch/x86/um/syscalls_32.c create mode 100644 arch/x86/um/syscalls_64.c create mode 100644 arch/x86/um/sysrq_32.c create mode 100644 arch/x86/um/sysrq_64.c create mode 100644 arch/x86/um/tls_32.c create mode 100644 arch/x86/um/tls_64.c create mode 100644 arch/x86/um/user-offsets.c create mode 100644 arch/x86/um/vdso/Makefile create mode 100644 arch/x86/um/vdso/checkundef.sh create mode 100644 arch/x86/um/vdso/um_vdso.c create mode 100644 arch/x86/um/vdso/vdso-layout.lds.S create mode 100644 arch/x86/um/vdso/vdso-note.S create mode 100644 arch/x86/um/vdso/vdso.S create mode 100644 arch/x86/um/vdso/vdso.lds.S create mode 100644 arch/x86/um/vdso/vma.c (limited to 'arch/x86') diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um new file mode 100644 index 0000000..36ddec6 --- /dev/null +++ b/arch/x86/Makefile.um @@ -0,0 +1,61 @@ +core-y += arch/x86/crypto/ + +ifeq ($(CONFIG_X86_32),y) +START := 0x8048000 + +LDFLAGS += -m elf_i386 +ELF_ARCH := i386 +ELF_FORMAT := elf32-i386 +CHECKFLAGS += -D__i386__ + +ifeq ("$(origin SUBARCH)", "command line") +ifneq ("$(shell uname -m | sed -e s/i.86/i386/)", "$(SUBARCH)") +KBUILD_CFLAGS += $(call cc-option,-m32) +KBUILD_AFLAGS += $(call cc-option,-m32) +LINK-y += $(call cc-option,-m32) + +export LDFLAGS +endif +endif + +# First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y. +include $(srctree)/arch/x86/Makefile_32.cpu + +# prevent gcc from keeping the stack 16 byte aligned. Taken from i386. +cflags-y += $(call cc-option,-mpreferred-stack-boundary=2) + +# Prevent sprintf in nfsd from being converted to strcpy and resulting in +# an unresolved reference. +cflags-y += -ffreestanding + +# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use +# a lot more stack due to the lack of sharing of stacklots. Also, gcc +# 4.3.0 needs -funit-at-a-time for extern inline functions. +KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ + echo $(call cc-option,-fno-unit-at-a-time); \ + else echo $(call cc-option,-funit-at-a-time); fi ;) + +KBUILD_CFLAGS += $(cflags-y) + +else + +START := 0x60000000 + +KBUILD_CFLAGS += -fno-builtin -m64 + +CHECKFLAGS += -m64 -D__x86_64__ +KBUILD_AFLAGS += -m64 +LDFLAGS += -m elf_x86_64 +KBUILD_CPPFLAGS += -m64 + +ELF_ARCH := i386:x86-64 +ELF_FORMAT := elf64-x86-64 + +# Not on all 64-bit distros /lib is a symlink to /lib64. PLD is an example. + +LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib64 +LINK-y += -m64 + +# Do unit-at-a-time unconditionally on x86_64, following the host +KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time) +endif diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig new file mode 100644 index 0000000..21bebe6 --- /dev/null +++ b/arch/x86/um/Kconfig @@ -0,0 +1,70 @@ +mainmenu "User Mode Linux/$SUBARCH $KERNELVERSION Kernel Configuration" + +source "arch/um/Kconfig.common" + +menu "UML-specific options" + +menu "Host processor type and features" + +config CMPXCHG_LOCAL + bool + default n + +config CMPXCHG_DOUBLE + bool + default n + +source "arch/x86/Kconfig.cpu" + +endmenu + +config UML_X86 + def_bool y + select GENERIC_FIND_FIRST_BIT + +config 64BIT + bool + default SUBARCH = "x86_64" + +config X86_32 + def_bool !64BIT + select HAVE_AOUT + +config X86_64 + def_bool 64BIT + +config RWSEM_XCHGADD_ALGORITHM + def_bool X86_XADD && 64BIT + +config RWSEM_GENERIC_SPINLOCK + def_bool !RWSEM_XCHGADD_ALGORITHM + +config 3_LEVEL_PGTABLES + bool "Three-level pagetables (EXPERIMENTAL)" if !64BIT + default 64BIT + depends on EXPERIMENTAL + help + Three-level pagetables will let UML have more than 4G of physical + memory. All the memory that can't be mapped directly will be treated + as high memory. + + However, this it experimental on 32-bit architectures, so if unsure say + N (on x86-64 it's automatically enabled, instead, as it's safe there). + +config ARCH_HAS_SC_SIGNALS + def_bool !64BIT + +config ARCH_REUSE_HOST_VSYSCALL_AREA + def_bool !64BIT + +config SMP_BROKEN + def_bool 64BIT + +config GENERIC_HWEIGHT + def_bool y + +source "arch/um/Kconfig.um" + +endmenu + +source "arch/um/Kconfig.rest" diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile new file mode 100644 index 0000000..df41989 --- /dev/null +++ b/arch/x86/um/Makefile @@ -0,0 +1,45 @@ +# +# Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) +# + +ifeq ($(CONFIG_X86_32),y) + BITS := 32 +else + BITS := 64 +endif + +obj-y = bug.o bugs_$(BITS).o delay_$(BITS).o fault.o ksyms.o ldt.o \ + ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal_$(BITS).o \ + stub_$(BITS).o stub_segv.o syscalls_$(BITS).o \ + sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ + mem_$(BITS).o subarch.o os-$(OS)/ + +ifeq ($(CONFIG_X86_32),y) + +obj-y += checksum_32.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o + +subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o +subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o +subarch-$(CONFIG_HIGHMEM) += ../mm/highmem_32.o + +else + +obj-y += vdso/ + +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../lib/thunk_64.o \ + ../lib/rwsem.o + +endif + +subarch-$(CONFIG_MODULES) += ../kernel/module.o + +USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o + +extra-y += user-offsets.s +$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) + +UNPROFILE_OBJS := stub_segv.o +CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) + +include arch/um/scripts/Makefile.rules diff --git a/arch/x86/um/asm/arch_hweight.h b/arch/x86/um/asm/arch_hweight.h new file mode 100644 index 0000000..c656cf4 --- /dev/null +++ b/arch/x86/um/asm/arch_hweight.h @@ -0,0 +1,6 @@ +#ifndef _ASM_UM_HWEIGHT_H +#define _ASM_UM_HWEIGHT_H + +#include + +#endif diff --git a/arch/x86/um/asm/archparam.h b/arch/x86/um/asm/archparam.h new file mode 100644 index 0000000..c17cf68 --- /dev/null +++ b/arch/x86/um/asm/archparam.h @@ -0,0 +1,20 @@ +/* + * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com) + * Copyright 2003 PathScale, Inc. + * Licensed under the GPL + */ + +#ifndef __UM_ARCHPARAM_H +#define __UM_ARCHPARAM_H + +#ifdef CONFIG_X86_32 + +#ifdef CONFIG_X86_PAE +#define LAST_PKMAP 512 +#else +#define LAST_PKMAP 1024 +#endif + +#endif + +#endif diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h new file mode 100644 index 0000000..b6efe23 --- /dev/null +++ b/arch/x86/um/asm/checksum.h @@ -0,0 +1,10 @@ +#ifndef __UM_CHECKSUM_H +#define __UM_CHECKSUM_H + +#ifdef CONFIG_X86_32 +# include "checksum_32.h" +#else +# include "checksum_64.h" +#endif + +#endif diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h new file mode 100644 index 0000000..caab742 --- /dev/null +++ b/arch/x86/um/asm/checksum_32.h @@ -0,0 +1,201 @@ +/* + * Licensed under the GPL + */ + +#ifndef __UM_SYSDEP_CHECKSUM_H +#define __UM_SYSDEP_CHECKSUM_H + +#include "linux/in6.h" +#include "linux/string.h" + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +__wsum csum_partial(const void *buff, int len, __wsum sum); + +/* + * Note: when you get a NULL pointer exception here this means someone + * passed in an incorrect kernel address to one of these functions. + * + * If you use these functions directly please don't forget the + * access_ok(). + */ + +static __inline__ +__wsum csum_partial_copy_nocheck(const void *src, void *dst, + int len, __wsum sum) +{ + memcpy(dst, src, len); + return csum_partial(dst, len, sum); +} + +/* + * the same as csum_partial, but copies from src while it + * checksums, and handles user-space pointer exceptions correctly, when needed. + * + * here even more important to align src and dst on a 32-bit (or even + * better 64-bit) boundary + */ + +static __inline__ +__wsum csum_partial_copy_from_user(const void __user *src, void *dst, + int len, __wsum sum, int *err_ptr) +{ + if (copy_from_user(dst, src, len)) { + *err_ptr = -EFAULT; + return (__force __wsum)-1; + } + + return csum_partial(dst, len, sum); +} + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. + * + * By Jorge Cwik , adapted for linux by + * Arnt Gulbrandsen. + */ +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + unsigned int sum; + + __asm__ __volatile__( + "movl (%1), %0 ;\n" + "subl $4, %2 ;\n" + "jbe 2f ;\n" + "addl 4(%1), %0 ;\n" + "adcl 8(%1), %0 ;\n" + "adcl 12(%1), %0 ;\n" +"1: adcl 16(%1), %0 ;\n" + "lea 4(%1), %1 ;\n" + "decl %2 ;\n" + "jne 1b ;\n" + "adcl $0, %0 ;\n" + "movl %0, %2 ;\n" + "shrl $16, %0 ;\n" + "addw %w2, %w0 ;\n" + "adcl $0, %0 ;\n" + "notl %0 ;\n" +"2: ;\n" + /* Since the input registers which are loaded with iph and ipl + are modified, we must also specify them as outputs, or gcc + will assume they contain their original values. */ + : "=r" (sum), "=r" (iph), "=r" (ihl) + : "1" (iph), "2" (ihl) + : "memory"); + return (__force __sum16)sum; +} + +/* + * Fold a partial checksum + */ + +static inline __sum16 csum_fold(__wsum sum) +{ + __asm__( + "addl %1, %0 ;\n" + "adcl $0xffff, %0 ;\n" + : "=r" (sum) + : "r" ((__force u32)sum << 16), + "0" ((__force u32)sum & 0xffff0000) + ); + return (__force __sum16)(~(__force u32)sum >> 16); +} + +static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, + unsigned short len, + unsigned short proto, + __wsum sum) +{ + __asm__( + "addl %1, %0 ;\n" + "adcl %2, %0 ;\n" + "adcl %3, %0 ;\n" + "adcl $0, %0 ;\n" + : "=r" (sum) + : "g" (daddr), "g"(saddr), "g"((len + proto) << 8), "0"(sum)); + return sum; +} + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented + */ +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, + unsigned short len, + unsigned short proto, + __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum)); +} + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ + +static inline __sum16 ip_compute_csum(const void *buff, int len) +{ + return csum_fold (csum_partial(buff, len, 0)); +} + +#define _HAVE_ARCH_IPV6_CSUM +static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, unsigned short proto, + __wsum sum) +{ + __asm__( + "addl 0(%1), %0 ;\n" + "adcl 4(%1), %0 ;\n" + "adcl 8(%1), %0 ;\n" + "adcl 12(%1), %0 ;\n" + "adcl 0(%2), %0 ;\n" + "adcl 4(%2), %0 ;\n" + "adcl 8(%2), %0 ;\n" + "adcl 12(%2), %0 ;\n" + "adcl %3, %0 ;\n" + "adcl %4, %0 ;\n" + "adcl $0, %0 ;\n" + : "=&r" (sum) + : "r" (saddr), "r" (daddr), + "r"(htonl(len)), "r"(htonl(proto)), "0"(sum)); + + return csum_fold(sum); +} + +/* + * Copy and checksum to user + */ +#define HAVE_CSUM_COPY_USER +static __inline__ __wsum csum_and_copy_to_user(const void *src, + void __user *dst, + int len, __wsum sum, int *err_ptr) +{ + if (access_ok(VERIFY_WRITE, dst, len)) { + if (copy_to_user(dst, src, len)) { + *err_ptr = -EFAULT; + return (__force __wsum)-1; + } + + return csum_partial(src, len, sum); + } + + if (len) + *err_ptr = -EFAULT; + + return (__force __wsum)-1; /* invalid checksum */ +} + +#endif + diff --git a/arch/x86/um/asm/checksum_64.h b/arch/x86/um/asm/checksum_64.h new file mode 100644 index 0000000..a5be903 --- /dev/null +++ b/arch/x86/um/asm/checksum_64.h @@ -0,0 +1,144 @@ +/* + * Licensed under the GPL + */ + +#ifndef __UM_SYSDEP_CHECKSUM_H +#define __UM_SYSDEP_CHECKSUM_H + +#include "linux/string.h" +#include "linux/in6.h" +#include "asm/uaccess.h" + +extern __wsum csum_partial(const void *buff, int len, __wsum sum); + +/* + * Note: when you get a NULL pointer exception here this means someone + * passed in an incorrect kernel address to one of these functions. + * + * If you use these functions directly please don't forget the + * access_ok(). + */ + +static __inline__ +__wsum csum_partial_copy_nocheck(const void *src, void *dst, + int len, __wsum sum) +{ + memcpy(dst, src, len); + return(csum_partial(dst, len, sum)); +} + +static __inline__ +__wsum csum_partial_copy_from_user(const void __user *src, + void *dst, int len, __wsum sum, + int *err_ptr) +{ + if (copy_from_user(dst, src, len)) { + *err_ptr = -EFAULT; + return (__force __wsum)-1; + } + return csum_partial(dst, len, sum); +} + +/** + * csum_fold - Fold and invert a 32bit checksum. + * sum: 32bit unfolded sum + * + * Fold a 32bit running checksum to 16bit and invert it. This is usually + * the last step before putting a checksum into a packet. + * Make sure not to mix with 64bit checksums. + */ +static inline __sum16 csum_fold(__wsum sum) +{ + __asm__( + " addl %1,%0\n" + " adcl $0xffff,%0" + : "=r" (sum) + : "r" ((__force u32)sum << 16), + "0" ((__force u32)sum & 0xffff0000) + ); + return (__force __sum16)(~(__force u32)sum >> 16); +} + +/** + * csum_tcpup_nofold - Compute an IPv4 pseudo header checksum. + * @saddr: source address + * @daddr: destination address + * @len: length of packet + * @proto: ip protocol of packet + * @sum: initial sum to be added in (32bit unfolded) + * + * Returns the pseudo header checksum the input data. Result is + * 32bit unfolded. + */ +static inline __wsum +csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, + unsigned short proto, __wsum sum) +{ + asm(" addl %1, %0\n" + " adcl %2, %0\n" + " adcl %3, %0\n" + " adcl $0, %0\n" + : "=r" (sum) + : "g" (daddr), "g" (saddr), "g" ((len + proto) << 8), "0" (sum)); + return sum; +} + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented + */ +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, + unsigned short len, + unsigned short proto, + __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum)); +} + +/** + * ip_fast_csum - Compute the IPv4 header checksum efficiently. + * iph: ipv4 header + * ihl: length of header / 4 + */ +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + unsigned int sum; + + asm( " movl (%1), %0\n" + " subl $4, %2\n" + " jbe 2f\n" + " addl 4(%1), %0\n" + " adcl 8(%1), %0\n" + " adcl 12(%1), %0\n" + "1: adcl 16(%1), %0\n" + " lea 4(%1), %1\n" + " decl %2\n" + " jne 1b\n" + " adcl $0, %0\n" + " movl %0, %2\n" + " shrl $16, %0\n" + " addw %w2, %w0\n" + " adcl $0, %0\n" + " notl %0\n" + "2:" + /* Since the input registers which are loaded with iph and ipl + are modified, we must also specify them as outputs, or gcc + will assume they contain their original values. */ + : "=r" (sum), "=r" (iph), "=r" (ihl) + : "1" (iph), "2" (ihl) + : "memory"); + return (__force __sum16)sum; +} + +static inline unsigned add32_with_carry(unsigned a, unsigned b) +{ + asm("addl %2,%0\n\t" + "adcl $0,%0" + : "=r" (a) + : "0" (a), "r" (b)); + return a; +} + +extern __sum16 ip_compute_csum(const void *buff, int len); + +#endif diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h new file mode 100644 index 0000000..f3b0633 --- /dev/null +++ b/arch/x86/um/asm/elf.h @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ +#ifndef __UM_ELF_X86_H +#define __UM_ELF_X86_H + +#include +#include "skas.h" + +#ifdef CONFIG_X86_32 + +#define R_386_NONE 0 +#define R_386_32 1 +#define R_386_PC32 2 +#define R_386_GOT32 3 +#define R_386_PLT32 4 +#define R_386_COPY 5 +#define R_386_GLOB_DAT 6 +#define R_386_JMP_SLOT 7 +#define R_386_RELATIVE 8 +#define R_386_GOTOFF 9 +#define R_386_GOTPC 10 +#define R_386_NUM 11 + +/* + * This is used to ensure we don't load something for the wrong architecture. + */ +#define elf_check_arch(x) \ + (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486)) + +#define ELF_CLASS ELFCLASS32 +#define ELF_DATA ELFDATA2LSB +#define ELF_ARCH EM_386 + +#define ELF_PLAT_INIT(regs, load_addr) do { \ + PT_REGS_EBX(regs) = 0; \ + PT_REGS_ECX(regs) = 0; \ + PT_REGS_EDX(regs) = 0; \ + PT_REGS_ESI(regs) = 0; \ + PT_REGS_EDI(regs) = 0; \ + PT_REGS_EBP(regs) = 0; \ + PT_REGS_EAX(regs) = 0; \ +} while (0) + +/* Shamelessly stolen from include/asm-i386/elf.h */ + +#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \ + pr_reg[0] = PT_REGS_EBX(regs); \ + pr_reg[1] = PT_REGS_ECX(regs); \ + pr_reg[2] = PT_REGS_EDX(regs); \ + pr_reg[3] = PT_REGS_ESI(regs); \ + pr_reg[4] = PT_REGS_EDI(regs); \ + pr_reg[5] = PT_REGS_EBP(regs); \ + pr_reg[6] = PT_REGS_EAX(regs); \ + pr_reg[7] = PT_REGS_DS(regs); \ + pr_reg[8] = PT_REGS_ES(regs); \ + /* fake once used fs and gs selectors? */ \ + pr_reg[9] = PT_REGS_DS(regs); \ + pr_reg[10] = PT_REGS_DS(regs); \ + pr_reg[11] = PT_REGS_SYSCALL_NR(regs); \ + pr_reg[12] = PT_REGS_IP(regs); \ + pr_reg[13] = PT_REGS_CS(regs); \ + pr_reg[14] = PT_REGS_EFLAGS(regs); \ + pr_reg[15] = PT_REGS_SP(regs); \ + pr_reg[16] = PT_REGS_SS(regs); \ +} while (0); + +extern char * elf_aux_platform; +#define ELF_PLATFORM (elf_aux_platform) + +extern unsigned long vsyscall_ehdr; +extern unsigned long vsyscall_end; +extern unsigned long __kernel_vsyscall; + +/* + * This is the range that is readable by user mode, and things + * acting like user mode such as get_user_pages. + */ +#define FIXADDR_USER_START vsyscall_ehdr +#define FIXADDR_USER_END vsyscall_end + + +/* + * Architecture-neutral AT_ values in 0-17, leave some room + * for more of them, start the x86-specific ones at 32. + */ +#define AT_SYSINFO 32 +#define AT_SYSINFO_EHDR 33 + +#define ARCH_DLINFO \ +do { \ + if ( vsyscall_ehdr ) { \ + NEW_AUX_ENT(AT_SYSINFO, __kernel_vsyscall); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, vsyscall_ehdr); \ + } \ +} while (0) + +#else + +/* x86-64 relocation types, taken from asm-x86_64/elf.h */ +#define R_X86_64_NONE 0 /* No reloc */ +#define R_X86_64_64 1 /* Direct 64 bit */ +#define R_X86_64_PC32 2 /* PC relative 32 bit signed */ +#define R_X86_64_GOT32 3 /* 32 bit GOT entry */ +#define R_X86_64_PLT32 4 /* 32 bit PLT address */ +#define R_X86_64_COPY 5 /* Copy symbol at runtime */ +#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */ +#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */ +#define R_X86_64_RELATIVE 8 /* Adjust by program base */ +#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative + offset to GOT */ +#define R_X86_64_32 10 /* Direct 32 bit zero extended */ +#define R_X86_64_32S 11 /* Direct 32 bit sign extended */ +#define R_X86_64_16 12 /* Direct 16 bit zero extended */ +#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */ +#define R_X86_64_8 14 /* Direct 8 bit sign extended */ +#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */ + +#define R_X86_64_NUM 16 + +/* + * This is used to ensure we don't load something for the wrong architecture. + */ +#define elf_check_arch(x) \ + ((x)->e_machine == EM_X86_64) + +#define ELF_CLASS ELFCLASS64 +#define ELF_DATA ELFDATA2LSB +#define ELF_ARCH EM_X86_64 + +#define ELF_PLAT_INIT(regs, load_addr) do { \ + PT_REGS_RBX(regs) = 0; \ + PT_REGS_RCX(regs) = 0; \ + PT_REGS_RDX(regs) = 0; \ + PT_REGS_RSI(regs) = 0; \ + PT_REGS_RDI(regs) = 0; \ + PT_REGS_RBP(regs) = 0; \ + PT_REGS_RAX(regs) = 0; \ + PT_REGS_R8(regs) = 0; \ + PT_REGS_R9(regs) = 0; \ + PT_REGS_R10(regs) = 0; \ + PT_REGS_R11(regs) = 0; \ + PT_REGS_R12(regs) = 0; \ + PT_REGS_R13(regs) = 0; \ + PT_REGS_R14(regs) = 0; \ + PT_REGS_R15(regs) = 0; \ +} while (0) + +#define ELF_CORE_COPY_REGS(pr_reg, _regs) \ + (pr_reg)[0] = (_regs)->regs.gp[0]; \ + (pr_reg)[1] = (_regs)->regs.gp[1]; \ + (pr_reg)[2] = (_regs)->regs.gp[2]; \ + (pr_reg)[3] = (_regs)->regs.gp[3]; \ + (pr_reg)[4] = (_regs)->regs.gp[4]; \ + (pr_reg)[5] = (_regs)->regs.gp[5]; \ + (pr_reg)[6] = (_regs)->regs.gp[6]; \ + (pr_reg)[7] = (_regs)->regs.gp[7]; \ + (pr_reg)[8] = (_regs)->regs.gp[8]; \ + (pr_reg)[9] = (_regs)->regs.gp[9]; \ + (pr_reg)[10] = (_regs)->regs.gp[10]; \ + (pr_reg)[11] = (_regs)->regs.gp[11]; \ + (pr_reg)[12] = (_regs)->regs.gp[12]; \ + (pr_reg)[13] = (_regs)->regs.gp[13]; \ + (pr_reg)[14] = (_regs)->regs.gp[14]; \ + (pr_reg)[15] = (_regs)->regs.gp[15]; \ + (pr_reg)[16] = (_regs)->regs.gp[16]; \ + (pr_reg)[17] = (_regs)->regs.gp[17]; \ + (pr_reg)[18] = (_regs)->regs.gp[18]; \ + (pr_reg)[19] = (_regs)->regs.gp[19]; \ + (pr_reg)[20] = (_regs)->regs.gp[20]; \ + (pr_reg)[21] = current->thread.arch.fs; \ + (pr_reg)[22] = 0; \ + (pr_reg)[23] = 0; \ + (pr_reg)[24] = 0; \ + (pr_reg)[25] = 0; \ + (pr_reg)[26] = 0; + +#define ELF_PLATFORM "x86_64" + +/* No user-accessible fixmap addresses, i.e. vsyscall */ +#define FIXADDR_USER_START 0 +#define FIXADDR_USER_END 0 + +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 +struct linux_binprm; +extern int arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp); + +extern unsigned long um_vdso_addr; +#define AT_SYSINFO_EHDR 33 +#define ARCH_DLINFO NEW_AUX_ENT(AT_SYSINFO_EHDR, um_vdso_addr) + +#endif + +typedef unsigned long elf_greg_t; + +#define ELF_NGREG (sizeof (struct user_regs_struct) / sizeof(elf_greg_t)) +typedef elf_greg_t elf_gregset_t[ELF_NGREG]; + +typedef struct user_i387_struct elf_fpregset_t; + +#define task_pt_regs(t) (&(t)->thread.regs) + +struct task_struct; + +extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); + +#define ELF_CORE_COPY_FPREGS(t, fpu) elf_core_copy_fpregs(t, fpu) + +#define ELF_EXEC_PAGESIZE 4096 + +#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) + +extern long elf_aux_hwcap; +#define ELF_HWCAP (elf_aux_hwcap) + +#define SET_PERSONALITY(ex) do ; while(0) +#define __HAVE_ARCH_GATE_AREA 1 + +#endif diff --git a/arch/x86/um/asm/module.h b/arch/x86/um/asm/module.h new file mode 100644 index 0000000..61af80e --- /dev/null +++ b/arch/x86/um/asm/module.h @@ -0,0 +1,23 @@ +#ifndef __UM_MODULE_H +#define __UM_MODULE_H + +/* UML is simple */ +struct mod_arch_specific +{ +}; + +#ifdef CONFIG_X86_32 + +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define Elf_Ehdr Elf32_Ehdr + +#else + +#define Elf_Shdr Elf64_Shdr +#define Elf_Sym Elf64_Sym +#define Elf_Ehdr Elf64_Ehdr + +#endif + +#endif diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h new file mode 100644 index 0000000..d3ac1ce --- /dev/null +++ b/arch/x86/um/asm/processor.h @@ -0,0 +1,15 @@ +#ifndef __UM_PROCESSOR_H +#define __UM_PROCESSOR_H + +/* include faultinfo structure */ +#include + +#ifdef CONFIG_X86_32 +# include "processor_32.h" +#else +# include "processor_64.h" +#endif + +#include + +#endif diff --git a/arch/x86/um/asm/processor_32.h b/arch/x86/um/asm/processor_32.h new file mode 100644 index 0000000..ae0d189 --- /dev/null +++ b/arch/x86/um/asm/processor_32.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#ifndef __UM_PROCESSOR_I386_H +#define __UM_PROCESSOR_I386_H + +#include +#include +#include + +extern int host_has_cmov; + +struct uml_tls_struct { + struct user_desc tls; + unsigned flushed:1; + unsigned present:1; +}; + +struct arch_thread { + struct uml_tls_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + unsigned long debugregs[8]; + int debugregs_seq; + struct faultinfo faultinfo; +}; + +#define INIT_ARCH_THREAD { \ + .tls_array = { [ 0 ... GDT_ENTRY_TLS_ENTRIES - 1 ] = \ + { .present = 0, .flushed = 0 } }, \ + .debugregs = { [ 0 ... 7 ] = 0 }, \ + .debugregs_seq = 0, \ + .faultinfo = { 0, 0, 0 } \ +} + +static inline void arch_flush_thread(struct arch_thread *thread) +{ + /* Clear any TLS still hanging */ + memset(&thread->tls_array, 0, sizeof(thread->tls_array)); +} + +static inline void arch_copy_thread(struct arch_thread *from, + struct arch_thread *to) +{ + memcpy(&to->tls_array, &from->tls_array, sizeof(from->tls_array)); +} + +#include + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + __asm__ __volatile__("rep;nop": : :"memory"); +} + +#define cpu_relax() rep_nop() + +/* + * Default implementation of macro that returns current + * instruction pointer ("program counter"). Stolen + * from asm-i386/processor.h + */ +#define current_text_addr() \ + ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; }) + +#define ARCH_IS_STACKGROW(address) \ + (address + 32 >= UPT_SP(¤t->thread.regs.regs)) + +#define KSTK_EIP(tsk) KSTK_REG(tsk, EIP) +#define KSTK_ESP(tsk) KSTK_REG(tsk, UESP) +#define KSTK_EBP(tsk) KSTK_REG(tsk, EBP) + +#endif diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h new file mode 100644 index 0000000..6db812b --- /dev/null +++ b/arch/x86/um/asm/processor_64.h @@ -0,0 +1,51 @@ +/* + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#ifndef __UM_PROCESSOR_X86_64_H +#define __UM_PROCESSOR_X86_64_H + +struct arch_thread { + unsigned long debugregs[8]; + int debugregs_seq; + unsigned long fs; + struct faultinfo faultinfo; +}; + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + __asm__ __volatile__("rep;nop": : :"memory"); +} + +#define cpu_relax() rep_nop() + +#define INIT_ARCH_THREAD { .debugregs = { [ 0 ... 7 ] = 0 }, \ + .debugregs_seq = 0, \ + .fs = 0, \ + .faultinfo = { 0, 0, 0 } } + +static inline void arch_flush_thread(struct arch_thread *thread) +{ +} + +static inline void arch_copy_thread(struct arch_thread *from, + struct arch_thread *to) +{ + to->fs = from->fs; +} + +#include + +#define current_text_addr() \ + ({ void *pc; __asm__("movq $1f,%0\n1:":"=g" (pc)); pc; }) + +#define ARCH_IS_STACKGROW(address) \ + (address + 128 >= UPT_SP(¤t->thread.regs.regs)) + +#define KSTK_EIP(tsk) KSTK_REG(tsk, RIP) +#define KSTK_ESP(tsk) KSTK_REG(tsk, RSP) + +#endif diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h new file mode 100644 index 0000000..c8aca8c --- /dev/null +++ b/arch/x86/um/asm/ptrace.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "ptrace_32.h" +#else +# include "ptrace_64.h" +#endif diff --git a/arch/x86/um/asm/ptrace_32.h b/arch/x86/um/asm/ptrace_32.h new file mode 100644 index 0000000..5d2a591 --- /dev/null +++ b/arch/x86/um/asm/ptrace_32.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#ifndef __UM_PTRACE_I386_H +#define __UM_PTRACE_I386_H + +#define HOST_AUDIT_ARCH AUDIT_ARCH_I386 + +#include "linux/compiler.h" +#include "asm/ptrace-generic.h" + +#define PT_REGS_EAX(r) UPT_EAX(&(r)->regs) +#define PT_REGS_EBX(r) UPT_EBX(&(r)->regs) +#define PT_REGS_ECX(r) UPT_ECX(&(r)->regs) +#define PT_REGS_EDX(r) UPT_EDX(&(r)->regs) +#define PT_REGS_ESI(r) UPT_ESI(&(r)->regs) +#define PT_REGS_EDI(r) UPT_EDI(&(r)->regs) +#define PT_REGS_EBP(r) UPT_EBP(&(r)->regs) + +#define PT_REGS_CS(r) UPT_CS(&(r)->regs) +#define PT_REGS_SS(r) UPT_SS(&(r)->regs) +#define PT_REGS_DS(r) UPT_DS(&(r)->regs) +#define PT_REGS_ES(r) UPT_ES(&(r)->regs) +#define PT_REGS_FS(r) UPT_FS(&(r)->regs) +#define PT_REGS_GS(r) UPT_GS(&(r)->regs) + +#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) + +#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_EAX(r) +#define PT_REGS_SYSCALL_RET(r) PT_REGS_EAX(r) +#define PT_FIX_EXEC_STACK(sp) do ; while(0) + +#define profile_pc(regs) PT_REGS_IP(regs) + +#define user_mode(r) UPT_IS_USER(&(r)->regs) + +/* + * Forward declaration to avoid including sysdep/tls.h, which causes a + * circular include, and compilation failures. + */ +struct user_desc; + +extern int ptrace_get_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc); + +extern int ptrace_set_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc); + +#endif diff --git a/arch/x86/um/asm/ptrace_64.h b/arch/x86/um/asm/ptrace_64.h new file mode 100644 index 0000000..83d8c47 --- /dev/null +++ b/arch/x86/um/asm/ptrace_64.h @@ -0,0 +1,72 @@ +/* + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#ifndef __UM_PTRACE_X86_64_H +#define __UM_PTRACE_X86_64_H + +#include "linux/compiler.h" +#include "asm/errno.h" + +#define __FRAME_OFFSETS /* Needed to get the R* macros */ +#include "asm/ptrace-generic.h" + +#define HOST_AUDIT_ARCH AUDIT_ARCH_X86_64 + +#define PT_REGS_RBX(r) UPT_RBX(&(r)->regs) +#define PT_REGS_RCX(r) UPT_RCX(&(r)->regs) +#define PT_REGS_RDX(r) UPT_RDX(&(r)->regs) +#define PT_REGS_RSI(r) UPT_RSI(&(r)->regs) +#define PT_REGS_RDI(r) UPT_RDI(&(r)->regs) +#define PT_REGS_RBP(r) UPT_RBP(&(r)->regs) +#define PT_REGS_RAX(r) UPT_RAX(&(r)->regs) +#define PT_REGS_R8(r) UPT_R8(&(r)->regs) +#define PT_REGS_R9(r) UPT_R9(&(r)->regs) +#define PT_REGS_R10(r) UPT_R10(&(r)->regs) +#define PT_REGS_R11(r) UPT_R11(&(r)->regs) +#define PT_REGS_R12(r) UPT_R12(&(r)->regs) +#define PT_REGS_R13(r) UPT_R13(&(r)->regs) +#define PT_REGS_R14(r) UPT_R14(&(r)->regs) +#define PT_REGS_R15(r) UPT_R15(&(r)->regs) + +#define PT_REGS_FS(r) UPT_FS(&(r)->regs) +#define PT_REGS_GS(r) UPT_GS(&(r)->regs) +#define PT_REGS_DS(r) UPT_DS(&(r)->regs) +#define PT_REGS_ES(r) UPT_ES(&(r)->regs) +#define PT_REGS_SS(r) UPT_SS(&(r)->regs) +#define PT_REGS_CS(r) UPT_CS(&(r)->regs) + +#define PT_REGS_ORIG_RAX(r) UPT_ORIG_RAX(&(r)->regs) +#define PT_REGS_RIP(r) UPT_IP(&(r)->regs) +#define PT_REGS_RSP(r) UPT_SP(&(r)->regs) + +#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) + +/* XXX */ +#define user_mode(r) UPT_IS_USER(&(r)->regs) +#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_RAX(r) +#define PT_REGS_SYSCALL_RET(r) PT_REGS_RAX(r) + +#define PT_FIX_EXEC_STACK(sp) do ; while(0) + +#define profile_pc(regs) PT_REGS_IP(regs) + +struct user_desc; + +static inline int ptrace_get_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc) +{ + return -ENOSYS; +} + +static inline int ptrace_set_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc) +{ + return -ENOSYS; +} + +extern long arch_prctl(struct task_struct *task, int code, + unsigned long __user *addr); +#endif diff --git a/arch/x86/um/asm/system.h b/arch/x86/um/asm/system.h new file mode 100644 index 0000000..a89113b --- /dev/null +++ b/arch/x86/um/asm/system.h @@ -0,0 +1,133 @@ +#ifndef _ASM_X86_SYSTEM_H_ +#define _ASM_X86_SYSTEM_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* entries in ARCH_DLINFO: */ +#ifdef CONFIG_IA32_EMULATION +# define AT_VECTOR_SIZE_ARCH 2 +#else +# define AT_VECTOR_SIZE_ARCH 1 +#endif + +extern unsigned long arch_align_stack(unsigned long sp); + +void default_idle(void); + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ +#ifdef CONFIG_X86_32 +/* + * Some non-Intel clones support out of order store. wmb() ceases to be a + * nop for these. + */ +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#endif + +/** + * read_barrier_depends - Flush all pending reads that subsequents reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data return by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than is + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information. + * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). However, + * the following code, with the same initial values for "a" and "b": + * + * + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. + **/ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() +#else +# define smp_rmb() barrier() +#endif +#ifdef CONFIG_X86_OOSTORE +# define smp_wmb() wmb() +#else +# define smp_wmb() barrier() +#endif +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +#endif diff --git a/arch/x86/um/asm/vm-flags.h b/arch/x86/um/asm/vm-flags.h new file mode 100644 index 0000000..7c297e9 --- /dev/null +++ b/arch/x86/um/asm/vm-flags.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) + * Copyright 2003 PathScale, Inc. + * Licensed under the GPL + */ + +#ifndef __VM_FLAGS_X86_H +#define __VM_FLAGS_X86_H + +#ifdef CONFIG_X86_32 + +#define VM_DATA_DEFAULT_FLAGS \ + (VM_READ | VM_WRITE | \ + ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#else + +#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_STACK_DEFAULT_FLAGS (VM_GROWSDOWN | VM_READ | VM_WRITE | \ + VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#endif +#endif diff --git a/arch/x86/um/bug.c b/arch/x86/um/bug.c new file mode 100644 index 0000000..e8034e3 --- /dev/null +++ b/arch/x86/um/bug.c @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2006 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL V2 + */ + +#include + +/* + * Mostly copied from i386/x86_86 - eliminated the eip < PAGE_OFFSET because + * that's not relevant in skas mode. + */ + +int is_valid_bugaddr(unsigned long eip) +{ + unsigned short ud2; + + if (probe_kernel_address((unsigned short __user *)eip, ud2)) + return 0; + + return ud2 == 0x0b0f; +} diff --git a/arch/x86/um/bugs_32.c b/arch/x86/um/bugs_32.c new file mode 100644 index 0000000..7058e1f --- /dev/null +++ b/arch/x86/um/bugs_32.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include "kern_util.h" +#include "longjmp.h" +#include "task.h" +#include "sysdep/ptrace.h" + +/* Set during early boot */ +static int host_has_cmov = 1; +static jmp_buf cmov_test_return; + +static void cmov_sigill_test_handler(int sig) +{ + host_has_cmov = 0; + longjmp(cmov_test_return, 1); +} + +void arch_check_bugs(void) +{ + struct sigaction old, new; + + printk(UM_KERN_INFO "Checking for host processor cmov support..."); + new.sa_handler = cmov_sigill_test_handler; + + /* Make sure that SIGILL is enabled after the handler longjmps back */ + new.sa_flags = SA_NODEFER; + sigemptyset(&new.sa_mask); + sigaction(SIGILL, &new, &old); + + if (setjmp(cmov_test_return) == 0) { + unsigned long foo = 0; + __asm__ __volatile__("cmovz %0, %1" : "=r" (foo) : "0" (foo)); + printk(UM_KERN_CONT "Yes\n"); + } else + printk(UM_KERN_CONT "No\n"); + + sigaction(SIGILL, &old, &new); +} + +void arch_examine_signal(int sig, struct uml_pt_regs *regs) +{ + unsigned char tmp[2]; + + /* + * This is testing for a cmov (0x0f 0x4x) instruction causing a + * SIGILL in init. + */ + if ((sig != SIGILL) || (TASK_PID(get_current()) != 1)) + return; + + if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) { + printk(UM_KERN_ERR "SIGILL in init, could not read " + "instructions!\n"); + return; + } + + if ((tmp[0] != 0x0f) || ((tmp[1] & 0xf0) != 0x40)) + return; + + if (host_has_cmov == 0) + printk(UM_KERN_ERR "SIGILL caused by cmov, which this " + "processor doesn't implement. Boot a filesystem " + "compiled for older processors"); + else if (host_has_cmov == 1) + printk(UM_KERN_ERR "SIGILL caused by cmov, which this " + "processor claims to implement"); + else + printk(UM_KERN_ERR "Bad value for host_has_cmov (%d)", + host_has_cmov); +} diff --git a/arch/x86/um/bugs_64.c b/arch/x86/um/bugs_64.c new file mode 100644 index 0000000..44e02ba --- /dev/null +++ b/arch/x86/um/bugs_64.c @@ -0,0 +1,15 @@ +/* + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#include "sysdep/ptrace.h" + +void arch_check_bugs(void) +{ +} + +void arch_examine_signal(int sig, struct uml_pt_regs *regs) +{ +} diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S new file mode 100644 index 0000000..f058d2f --- /dev/null +++ b/arch/x86/um/checksum_32.S @@ -0,0 +1,458 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, + * Arnt Gulbrandsen, + * Tom May, + * Pentium Pro/II routines: + * Alexander Kjeldaas + * Finn Arne Gangstad + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error + * converted to pure assembler + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) + */ + +.text +.align 4 +.globl csum_partial + +#ifndef CONFIG_X86_USE_PPRO_CHECKSUM + + /* + * Experiments with Ethernet and SLIP connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. We get at + * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. + * Fortunately, it is easy to convert 2-byte alignment to 4-byte + * alignment for the unrolled loop. + */ +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: unsigned char *buff + testl $2, %esi # Check alignment. + jz 2f # Jump if alignment is ok. + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. + jmp 4f +1: movw (%esi), %bx + addl $2, %esi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, %edx + shrl $5, %ecx + jz 2f + testl %esi, %esi +1: movl (%esi), %ebx + adcl %ebx, %eax + movl 4(%esi), %ebx + adcl %ebx, %eax + movl 8(%esi), %ebx + adcl %ebx, %eax + movl 12(%esi), %ebx + adcl %ebx, %eax + movl 16(%esi), %ebx + adcl %ebx, %eax + movl 20(%esi), %ebx + adcl %ebx, %eax + movl 24(%esi), %ebx + adcl %ebx, %eax + movl 28(%esi), %ebx + adcl %ebx, %eax + lea 32(%esi), %esi + dec %ecx + jne 1b + adcl $0, %eax +2: movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +3: adcl (%esi), %eax + lea 4(%esi), %esi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f + movw (%esi),%cx + leal 2(%esi),%esi + je 6f + shll $16,%ecx +5: movb (%esi),%cl +6: addl %ecx,%eax + adcl $0, %eax +7: + popl %ebx + popl %esi + ret + +#else + +/* Version for PentiumII/PPro */ + +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: const unsigned char *buf + + testl $2, %esi + jnz 30f +10: + movl %ecx, %edx + movl %ecx, %ebx + andl $0x7c, %ebx + shrl $7, %ecx + addl %ebx,%esi + shrl $2, %ebx + negl %ebx + lea 45f(%ebx,%ebx,2), %ebx + testl %esi, %esi + jmp *%ebx + + # Handle 2-byte-aligned regions +20: addw (%esi), %ax + lea 2(%esi), %esi + adcl $0, %eax + jmp 10b + +30: subl $2, %ecx + ja 20b + je 32f + movzbl (%esi),%ebx # csumming 1 byte, 2-aligned + addl %ebx, %eax + adcl $0, %eax + jmp 80f +32: + addw (%esi), %ax # csumming 2 bytes, 2-aligned + adcl $0, %eax + jmp 80f + +40: + addl -128(%esi), %eax + adcl -124(%esi), %eax + adcl -120(%esi), %eax + adcl -116(%esi), %eax + adcl -112(%esi), %eax + adcl -108(%esi), %eax + adcl -104(%esi), %eax + adcl -100(%esi), %eax + adcl -96(%esi), %eax + adcl -92(%esi), %eax + adcl -88(%esi), %eax + adcl -84(%esi), %eax + adcl -80(%esi), %eax + adcl -76(%esi), %eax + adcl -72(%esi), %eax + adcl -68(%esi), %eax + adcl -64(%esi), %eax + adcl -60(%esi), %eax + adcl -56(%esi), %eax + adcl -52(%esi), %eax + adcl -48(%esi), %eax + adcl -44(%esi), %eax + adcl -40(%esi), %eax + adcl -36(%esi), %eax + adcl -32(%esi), %eax + adcl -28(%esi), %eax + adcl -24(%esi), %eax + adcl -20(%esi), %eax + adcl -16(%esi), %eax + adcl -12(%esi), %eax + adcl -8(%esi), %eax + adcl -4(%esi), %eax +45: + lea 128(%esi), %esi + adcl $0, %eax + dec %ecx + jge 40b + movl %edx, %ecx +50: andl $3, %ecx + jz 80f + + # Handle the last 1-3 bytes without jumping + notl %ecx # 1->2, 2->1, 3->0, higher bits are masked + movl $0xffffff,%ebx # by the shll and shrl instructions + shll $3,%ecx + shrl %cl,%ebx + andl -128(%esi),%ebx # esi is 4-aligned so should be ok + addl %ebx,%eax + adcl $0,%eax +80: + popl %ebx + popl %esi + ret + +#endif + +/* +unsigned int csum_partial_copy_generic (const char *src, char *dst, + int len, int sum, int *src_err_ptr, int *dst_err_ptr) + */ + +/* + * Copy from ds while checksumming, otherwise like csum_partial + * + * The macros SRC and DST specify the type of access for the instruction. + * thus we can call a custom exception handler for all access types. + * + * FIXME: could someone double-check whether I haven't mixed up some SRC and + * DST definitions? It's damn hard to trigger all cases. I hope I got + * them all but there's no guarantee. + */ + +#define SRC(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6001f ; \ + .previous + +#define DST(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6002f ; \ + .previous + +.align 4 + +#ifndef CONFIG_X86_USE_PPRO_CHECKSUM + +#define ARGBASE 16 +#define FP 12 + +csum_partial_copy_generic_i386: + subl $4,%esp + pushl %edi + pushl %esi + pushl %ebx + movl ARGBASE+16(%esp),%eax # sum + movl ARGBASE+12(%esp),%ecx # len + movl ARGBASE+4(%esp),%esi # src + movl ARGBASE+8(%esp),%edi # dst + + testl $2, %edi # Check alignment. + jz 2f # Jump if alignment is ok. + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. + jmp 4f +SRC(1: movw (%esi), %bx ) + addl $2, %esi +DST( movw %bx, (%edi) ) + addl $2, %edi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, FP(%esp) + shrl $5, %ecx + jz 2f + testl %esi, %esi +SRC(1: movl (%esi), %ebx ) +SRC( movl 4(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + adcl %edx, %eax +DST( movl %edx, 4(%edi) ) + +SRC( movl 8(%esi), %ebx ) +SRC( movl 12(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 8(%edi) ) + adcl %edx, %eax +DST( movl %edx, 12(%edi) ) + +SRC( movl 16(%esi), %ebx ) +SRC( movl 20(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 16(%edi) ) + adcl %edx, %eax +DST( movl %edx, 20(%edi) ) + +SRC( movl 24(%esi), %ebx ) +SRC( movl 28(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 24(%edi) ) + adcl %edx, %eax +DST( movl %edx, 28(%edi) ) + + lea 32(%esi), %esi + lea 32(%edi), %edi + dec %ecx + jne 1b + adcl $0, %eax +2: movl FP(%esp), %edx + movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +SRC(3: movl (%esi), %ebx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f +SRC( movw (%esi), %cx ) + leal 2(%esi), %esi +DST( movw %cx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%ecx +SRC(5: movb (%esi), %cl ) +DST( movb %cl, (%edi) ) +6: addl %ecx, %eax + adcl $0, %eax +7: +5000: + +# Exception handler: +.section .fixup, "ax" + +6001: + movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + + # zero the complete destination - computing the rest + # is too much work + movl ARGBASE+8(%esp), %edi # dst + movl ARGBASE+12(%esp), %ecx # len + xorl %eax,%eax + rep ; stosb + + jmp 5000b + +6002: + movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT,(%ebx) + jmp 5000b + +.previous + + popl %ebx + popl %esi + popl %edi + popl %ecx # equivalent to addl $4,%esp + ret + +#else + +/* Version for PentiumII/PPro */ + +#define ROUND1(x) \ + SRC(movl x(%esi), %ebx ) ; \ + addl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ROUND(x) \ + SRC(movl x(%esi), %ebx ) ; \ + adcl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ARGBASE 12 + +csum_partial_copy_generic_i386: + pushl %ebx + pushl %edi + pushl %esi + movl ARGBASE+4(%esp),%esi #src + movl ARGBASE+8(%esp),%edi #dst + movl ARGBASE+12(%esp),%ecx #len + movl ARGBASE+16(%esp),%eax #sum +# movl %ecx, %edx + movl %ecx, %ebx + movl %esi, %edx + shrl $6, %ecx + andl $0x3c, %ebx + negl %ebx + subl %ebx, %esi + subl %ebx, %edi + lea -1(%esi),%edx + andl $-32,%edx + lea 3f(%ebx,%ebx), %ebx + testl %esi, %esi + jmp *%ebx +1: addl $64,%esi + addl $64,%edi + SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) + ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) + ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) + ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) + ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) +3: adcl $0,%eax + addl $64, %edx + dec %ecx + jge 1b +4: movl ARGBASE+12(%esp),%edx #len + andl $3, %edx + jz 7f + cmpl $2, %edx + jb 5f +SRC( movw (%esi), %dx ) + leal 2(%esi), %esi +DST( movw %dx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%edx +5: +SRC( movb (%esi), %dl ) +DST( movb %dl, (%edi) ) +6: addl %edx, %eax + adcl $0, %eax +7: +.section .fixup, "ax" +6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + # zero the complete destination (computing the rest is too much work) + movl ARGBASE+8(%esp),%edi # dst + movl ARGBASE+12(%esp),%ecx # len + xorl %eax,%eax + rep; stosb + jmp 7b +6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT, (%ebx) + jmp 7b +.previous + + popl %esi + popl %edi + popl %ebx + ret + +#undef ROUND +#undef ROUND1 + +#endif diff --git a/arch/x86/um/delay_32.c b/arch/x86/um/delay_32.c new file mode 100644 index 0000000..f3fe1a6 --- /dev/null +++ b/arch/x86/um/delay_32.c @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2011 Richard Weinberger + * Mostly copied from arch/x86/lib/delay.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +void __delay(unsigned long loops) +{ + asm volatile( + "test %0,%0\n" + "jz 3f\n" + "jmp 1f\n" + + ".align 16\n" + "1: jmp 2f\n" + + ".align 16\n" + "2: dec %0\n" + " jnz 2b\n" + "3: dec %0\n" + + : /* we don't need output */ + : "a" (loops) + ); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ + int d0; + + xloops *= 4; + asm("mull %%edx" + : "=d" (xloops), "=&a" (d0) + : "1" (xloops), "0" + (loops_per_jiffy * (HZ/4))); + + __delay(++xloops); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/um/delay_64.c b/arch/x86/um/delay_64.c new file mode 100644 index 0000000..f3fe1a6 --- /dev/null +++ b/arch/x86/um/delay_64.c @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2011 Richard Weinberger + * Mostly copied from arch/x86/lib/delay.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +void __delay(unsigned long loops) +{ + asm volatile( + "test %0,%0\n" + "jz 3f\n" + "jmp 1f\n" + + ".align 16\n" + "1: jmp 2f\n" + + ".align 16\n" + "2: dec %0\n" + " jnz 2b\n" + "3: dec %0\n" + + : /* we don't need output */ + : "a" (loops) + ); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ + int d0; + + xloops *= 4; + asm("mull %%edx" + : "=d" (xloops), "=&a" (d0) + : "1" (xloops), "0" + (loops_per_jiffy * (HZ/4))); + + __delay(++xloops); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/um/elfcore.c b/arch/x86/um/elfcore.c new file mode 100644 index 0000000..6bb49b6 --- /dev/null +++ b/arch/x86/um/elfcore.c @@ -0,0 +1,83 @@ +#include +#include +#include +#include + +#include + + +Elf32_Half elf_core_extra_phdrs(void) +{ + return vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0; +} + +int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *) vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + Elf32_Off ofs = 0; + + for (i = 0; i < ehdrp->e_phnum; ++i) { + struct elf_phdr phdr = phdrp[i]; + + if (phdr.p_type == PT_LOAD) { + ofs = phdr.p_offset = offset; + offset += phdr.p_filesz; + } else { + phdr.p_offset += ofs; + } + phdr.p_paddr = 0; /* match other core phdrs */ + *size += sizeof(phdr); + if (*size > limit + || !dump_write(file, &phdr, sizeof(phdr))) + return 0; + } + } + return 1; +} + +int elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *) vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + + for (i = 0; i < ehdrp->e_phnum; ++i) { + if (phdrp[i].p_type == PT_LOAD) { + void *addr = (void *) phdrp[i].p_vaddr; + size_t filesz = phdrp[i].p_filesz; + + *size += filesz; + if (*size > limit + || !dump_write(file, addr, filesz)) + return 0; + } + } + } + return 1; +} + +size_t elf_core_extra_data_size(void) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *)vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + + for (i = 0; i < ehdrp->e_phnum; ++i) + if (phdrp[i].p_type == PT_LOAD) + return (size_t) phdrp[i].p_filesz; + } + return 0; +} diff --git a/arch/x86/um/fault.c b/arch/x86/um/fault.c new file mode 100644 index 0000000..d670f68 --- /dev/null +++ b/arch/x86/um/fault.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include "sysdep/ptrace.h" + +/* These two are from asm-um/uaccess.h and linux/module.h, check them. */ +struct exception_table_entry +{ + unsigned long insn; + unsigned long fixup; +}; + +const struct exception_table_entry *search_exception_tables(unsigned long add); + +/* Compare this to arch/i386/mm/extable.c:fixup_exception() */ +int arch_fixup(unsigned long address, struct uml_pt_regs *regs) +{ + const struct exception_table_entry *fixup; + + fixup = search_exception_tables(address); + if (fixup != 0) { + UPT_IP(regs) = fixup->fixup; + return 1; + } + return 0; +} diff --git a/arch/x86/um/ksyms.c b/arch/x86/um/ksyms.c new file mode 100644 index 0000000..2e8f43e --- /dev/null +++ b/arch/x86/um/ksyms.c @@ -0,0 +1,13 @@ +#include +#include +#include + +#ifndef CONFIG_X86_32 +/*XXX: we need them because they would be exported by x86_64 */ +#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 +EXPORT_SYMBOL(memcpy); +#else +EXPORT_SYMBOL(__memcpy); +#endif +#endif +EXPORT_SYMBOL(csum_partial); diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c new file mode 100644 index 0000000..3f2bf20 --- /dev/null +++ b/arch/x86/um/ldt.c @@ -0,0 +1,502 @@ +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include "os.h" +#include "proc_mm.h" +#include "skas.h" +#include "skas_ptrace.h" +#include "sysdep/tls.h" + +extern int modify_ldt(int func, void *ptr, unsigned long bytecount); + +static long write_ldt_entry(struct mm_id *mm_idp, int func, + struct user_desc *desc, void **addr, int done) +{ + long res; + + if (proc_mm) { + /* + * This is a special handling for the case, that the mm to + * modify isn't current->active_mm. + * If this is called directly by modify_ldt, + * (current->active_mm->context.skas.u == mm_idp) + * will be true. So no call to __switch_mm(mm_idp) is done. + * If this is called in case of init_new_ldt or PTRACE_LDT, + * mm_idp won't belong to current->active_mm, but child->mm. + * So we need to switch child's mm into our userspace, then + * later switch back. + * + * Note: I'm unsure: should interrupts be disabled here? + */ + if (!current->active_mm || current->active_mm == &init_mm || + mm_idp != ¤t->active_mm->context.id) + __switch_mm(mm_idp); + } + + if (ptrace_ldt) { + struct ptrace_ldt ldt_op = (struct ptrace_ldt) { + .func = func, + .ptr = desc, + .bytecount = sizeof(*desc)}; + u32 cpu; + int pid; + + if (!proc_mm) + pid = mm_idp->u.pid; + else { + cpu = get_cpu(); + pid = userspace_pid[cpu]; + } + + res = os_ptrace_ldt(pid, 0, (unsigned long) &ldt_op); + + if (proc_mm) + put_cpu(); + } + else { + void *stub_addr; + res = syscall_stub_data(mm_idp, (unsigned long *)desc, + (sizeof(*desc) + sizeof(long) - 1) & + ~(sizeof(long) - 1), + addr, &stub_addr); + if (!res) { + unsigned long args[] = { func, + (unsigned long)stub_addr, + sizeof(*desc), + 0, 0, 0 }; + res = run_syscall_stub(mm_idp, __NR_modify_ldt, args, + 0, addr, done); + } + } + + if (proc_mm) { + /* + * This is the second part of special handling, that makes + * PTRACE_LDT possible to implement. + */ + if (current->active_mm && current->active_mm != &init_mm && + mm_idp != ¤t->active_mm->context.id) + __switch_mm(¤t->active_mm->context.id); + } + + return res; +} + +static long read_ldt_from_host(void __user * ptr, unsigned long bytecount) +{ + int res, n; + struct ptrace_ldt ptrace_ldt = (struct ptrace_ldt) { + .func = 0, + .bytecount = bytecount, + .ptr = kmalloc(bytecount, GFP_KERNEL)}; + u32 cpu; + + if (ptrace_ldt.ptr == NULL) + return -ENOMEM; + + /* + * This is called from sys_modify_ldt only, so userspace_pid gives + * us the right number + */ + + cpu = get_cpu(); + res = os_ptrace_ldt(userspace_pid[cpu], 0, (unsigned long) &ptrace_ldt); + put_cpu(); + if (res < 0) + goto out; + + n = copy_to_user(ptr, ptrace_ldt.ptr, res); + if (n != 0) + res = -EFAULT; + + out: + kfree(ptrace_ldt.ptr); + + return res; +} + +/* + * In skas mode, we hold our own ldt data in UML. + * Thus, the code implementing sys_modify_ldt_skas + * is very similar to (and mostly stolen from) sys_modify_ldt + * for arch/i386/kernel/ldt.c + * The routines copied and modified in part are: + * - read_ldt + * - read_default_ldt + * - write_ldt + * - sys_modify_ldt_skas + */ + +static int read_ldt(void __user * ptr, unsigned long bytecount) +{ + int i, err = 0; + unsigned long size; + uml_ldt_t * ldt = ¤t->mm->context.ldt; + + if (!ldt->entry_count) + goto out; + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + err = bytecount; + + if (ptrace_ldt) + return read_ldt_from_host(ptr, bytecount); + + mutex_lock(&ldt->lock); + if (ldt->entry_count <= LDT_DIRECT_ENTRIES) { + size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES; + if (size > bytecount) + size = bytecount; + if (copy_to_user(ptr, ldt->u.entries, size)) + err = -EFAULT; + bytecount -= size; + ptr += size; + } + else { + for (i=0; ientry_count/LDT_ENTRIES_PER_PAGE && bytecount; + i++) { + size = PAGE_SIZE; + if (size > bytecount) + size = bytecount; + if (copy_to_user(ptr, ldt->u.pages[i], size)) { + err = -EFAULT; + break; + } + bytecount -= size; + ptr += size; + } + } + mutex_unlock(&ldt->lock); + + if (bytecount == 0 || err == -EFAULT) + goto out; + + if (clear_user(ptr, bytecount)) + err = -EFAULT; + +out: + return err; +} + +static int read_default_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + + if (bytecount > 5*LDT_ENTRY_SIZE) + bytecount = 5*LDT_ENTRY_SIZE; + + err = bytecount; + /* + * UML doesn't support lcall7 and lcall27. + * So, we don't really have a default ldt, but emulate + * an empty ldt of common host default ldt size. + */ + if (clear_user(ptr, bytecount)) + err = -EFAULT; + + return err; +} + +static int write_ldt(void __user * ptr, unsigned long bytecount, int func) +{ + uml_ldt_t * ldt = ¤t->mm->context.ldt; + struct mm_id * mm_idp = ¤t->mm->context.id; + int i, err; + struct user_desc ldt_info; + struct ldt_entry entry0, *ldt_p; + void *addr = NULL; + + err = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + err = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + err = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (func == 1) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + if (!ptrace_ldt) + mutex_lock(&ldt->lock); + + err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1); + if (err) + goto out_unlock; + else if (ptrace_ldt) { + /* With PTRACE_LDT available, this is used as a flag only */ + ldt->entry_count = 1; + goto out; + } + + if (ldt_info.entry_number >= ldt->entry_count && + ldt_info.entry_number >= LDT_DIRECT_ENTRIES) { + for (i=ldt->entry_count/LDT_ENTRIES_PER_PAGE; + i*LDT_ENTRIES_PER_PAGE <= ldt_info.entry_number; + i++) { + if (i == 0) + memcpy(&entry0, ldt->u.entries, + sizeof(entry0)); + ldt->u.pages[i] = (struct ldt_entry *) + __get_free_page(GFP_KERNEL|__GFP_ZERO); + if (!ldt->u.pages[i]) { + err = -ENOMEM; + /* Undo the change in host */ + memset(&ldt_info, 0, sizeof(ldt_info)); + write_ldt_entry(mm_idp, 1, &ldt_info, &addr, 1); + goto out_unlock; + } + if (i == 0) { + memcpy(ldt->u.pages[0], &entry0, + sizeof(entry0)); + memcpy(ldt->u.pages[0]+1, ldt->u.entries+1, + sizeof(entry0)*(LDT_DIRECT_ENTRIES-1)); + } + ldt->entry_count = (i + 1) * LDT_ENTRIES_PER_PAGE; + } + } + if (ldt->entry_count <= ldt_info.entry_number) + ldt->entry_count = ldt_info.entry_number + 1; + + if (ldt->entry_count <= LDT_DIRECT_ENTRIES) + ldt_p = ldt->u.entries + ldt_info.entry_number; + else + ldt_p = ldt->u.pages[ldt_info.entry_number/LDT_ENTRIES_PER_PAGE] + + ldt_info.entry_number%LDT_ENTRIES_PER_PAGE; + + if (ldt_info.base_addr == 0 && ldt_info.limit == 0 && + (func == 1 || LDT_empty(&ldt_info))) { + ldt_p->a = 0; + ldt_p->b = 0; + } + else{ + if (func == 1) + ldt_info.useable = 0; + ldt_p->a = LDT_entry_a(&ldt_info); + ldt_p->b = LDT_entry_b(&ldt_info); + } + err = 0; + +out_unlock: + mutex_unlock(&ldt->lock); +out: + return err; +} + +static long do_modify_ldt_skas(int func, void __user *ptr, + unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + case 0x11: + ret = write_ldt(ptr, bytecount, func); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + } + return ret; +} + +static DEFINE_SPINLOCK(host_ldt_lock); +static short dummy_list[9] = {0, -1}; +static short * host_ldt_entries = NULL; + +static void ldt_get_host_info(void) +{ + long ret; + struct ldt_entry * ldt; + short *tmp; + int i, size, k, order; + + spin_lock(&host_ldt_lock); + + if (host_ldt_entries != NULL) { + spin_unlock(&host_ldt_lock); + return; + } + host_ldt_entries = dummy_list+1; + + spin_unlock(&host_ldt_lock); + + for (i = LDT_PAGES_MAX-1, order=0; i; i>>=1, order++) + ; + + ldt = (struct ldt_entry *) + __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + if (ldt == NULL) { + printk(KERN_ERR "ldt_get_host_info: couldn't allocate buffer " + "for host ldt\n"); + return; + } + + ret = modify_ldt(0, ldt, (1<ldt.lock); + + if (!from_mm) { + memset(&desc, 0, sizeof(desc)); + /* + * We have to initialize a clean ldt. + */ + if (proc_mm) { + /* + * If the new mm was created using proc_mm, host's + * default-ldt currently is assigned, which normally + * contains the call-gates for lcall7 and lcall27. + * To remove these gates, we simply write an empty + * entry as number 0 to the host. + */ + err = write_ldt_entry(&new_mm->id, 1, &desc, &addr, 1); + } + else{ + /* + * Now we try to retrieve info about the ldt, we + * inherited from the host. All ldt-entries found + * will be reset in the following loop + */ + ldt_get_host_info(); + for (num_p=host_ldt_entries; *num_p != -1; num_p++) { + desc.entry_number = *num_p; + err = write_ldt_entry(&new_mm->id, 1, &desc, + &addr, *(num_p + 1) == -1); + if (err) + break; + } + } + new_mm->ldt.entry_count = 0; + + goto out; + } + + if (proc_mm) { + /* + * We have a valid from_mm, so we now have to copy the LDT of + * from_mm to new_mm, because using proc_mm an new mm with + * an empty/default LDT was created in new_mm() + */ + copy = ((struct proc_mm_op) { .op = MM_COPY_SEGMENTS, + .u = + { .copy_segments = + from_mm->id.u.mm_fd } } ); + i = os_write_file(new_mm->id.u.mm_fd, ©, sizeof(copy)); + if (i != sizeof(copy)) + printk(KERN_ERR "new_mm : /proc/mm copy_segments " + "failed, err = %d\n", -i); + } + + if (!ptrace_ldt) { + /* + * Our local LDT is used to supply the data for + * modify_ldt(READLDT), if PTRACE_LDT isn't available, + * i.e., we have to use the stub for modify_ldt, which + * can't handle the big read buffer of up to 64kB. + */ + mutex_lock(&from_mm->ldt.lock); + if (from_mm->ldt.entry_count <= LDT_DIRECT_ENTRIES) + memcpy(new_mm->ldt.u.entries, from_mm->ldt.u.entries, + sizeof(new_mm->ldt.u.entries)); + else { + i = from_mm->ldt.entry_count / LDT_ENTRIES_PER_PAGE; + while (i-->0) { + page = __get_free_page(GFP_KERNEL|__GFP_ZERO); + if (!page) { + err = -ENOMEM; + break; + } + new_mm->ldt.u.pages[i] = + (struct ldt_entry *) page; + memcpy(new_mm->ldt.u.pages[i], + from_mm->ldt.u.pages[i], PAGE_SIZE); + } + } + new_mm->ldt.entry_count = from_mm->ldt.entry_count; + mutex_unlock(&from_mm->ldt.lock); + } + + out: + return err; +} + + +void free_ldt(struct mm_context *mm) +{ + int i; + + if (!ptrace_ldt && mm->ldt.entry_count > LDT_DIRECT_ENTRIES) { + i = mm->ldt.entry_count / LDT_ENTRIES_PER_PAGE; + while (i-- > 0) + free_page((long) mm->ldt.u.pages[i]); + } + mm->ldt.entry_count = 0; +} + +int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +{ + return do_modify_ldt_skas(func, ptr, bytecount); +} diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c new file mode 100644 index 0000000..639900a --- /dev/null +++ b/arch/x86/um/mem_32.c @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2011 Richard Weinberger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + if (!FIXADDR_USER_START) + return 0; + + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; + + return 0; +} +__initcall(gate_vma_init); + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ + return FIXADDR_USER_START ? &gate_vma : NULL; +} + +int in_gate_area_no_mm(unsigned long addr) +{ + if (!FIXADDR_USER_START) + return 0; + + if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) + return 1; + + return 0; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; + + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c new file mode 100644 index 0000000..5465187 --- /dev/null +++ b/arch/x86/um/mem_64.c @@ -0,0 +1,26 @@ +#include "linux/mm.h" +#include "asm/page.h" +#include "asm/mman.h" + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == um_vdso_addr) + return "[vdso]"; + + return NULL; +} + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ + return NULL; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + return 0; +} + +int in_gate_area_no_mm(unsigned long addr) +{ + return 0; +} diff --git a/arch/x86/um/os-Linux/Makefile b/arch/x86/um/os-Linux/Makefile new file mode 100644 index 0000000..253bfb8 --- /dev/null +++ b/arch/x86/um/os-Linux/Makefile @@ -0,0 +1,13 @@ +# +# Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) +# Licensed under the GPL +# + +obj-y = registers.o task_size.o mcontext.o + +obj-$(CONFIG_X86_32) += tls.o +obj-$(CONFIG_64BIT) += prctl.o + +USER_OBJS := $(obj-y) + +include arch/um/scripts/Makefile.rules diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c new file mode 100644 index 0000000..1d33d72 --- /dev/null +++ b/arch/x86/um/os-Linux/mcontext.c @@ -0,0 +1,31 @@ +#include +#define __FRAME_OFFSETS +#include +#include + +void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) +{ +#ifdef __i386__ +#define COPY2(X,Y) regs->gp[X] = mc->gregs[REG_##Y] +#define COPY(X) regs->gp[X] = mc->gregs[REG_##X] +#define COPY_SEG(X) regs->gp[X] = mc->gregs[REG_##X] & 0xffff; +#define COPY_SEG_CPL3(X) regs->gp[X] = (mc->gregs[REG_##X] & 0xffff) | 3; + COPY_SEG(GS); COPY_SEG(FS); COPY_SEG(ES); COPY_SEG(DS); + COPY(EDI); COPY(ESI); COPY(EBP); + COPY2(UESP, ESP); /* sic */ + COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX); + COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS); +#else +#define COPY2(X,Y) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##Y] +#define COPY(X) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##X] + COPY(R8); COPY(R9); COPY(R10); COPY(R11); + COPY(R12); COPY(R13); COPY(R14); COPY(R15); + COPY(RDI); COPY(RSI); COPY(RBP); COPY(RBX); + COPY(RDX); COPY(RAX); COPY(RCX); COPY(RSP); + COPY(RIP); + COPY2(EFLAGS, EFL); + COPY2(CS, CSGSFS); + regs->gp[CS / sizeof(unsigned long)] &= 0xffff; + regs->gp[CS / sizeof(unsigned long)] |= 3; +#endif +} diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c new file mode 100644 index 0000000..9d34edd --- /dev/null +++ b/arch/x86/um/os-Linux/prctl.c @@ -0,0 +1,12 @@ +/* + * Copyright (C) 2007 Jeff Dike (jdike@{addtoit.com,linux.intel.com}) + * Licensed under the GPL + */ + +#include +#include + +int os_arch_prctl(int pid, int code, unsigned long *addr) +{ + return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) addr, code); +} diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c new file mode 100644 index 0000000..3a9b624 --- /dev/null +++ b/arch/x86/um/os-Linux/registers.c @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2004 PathScale, Inc + * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include "longjmp.h" +#include "sysdep/ptrace_user.h" + +int save_fp_registers(int pid, unsigned long *fp_regs) +{ + if (ptrace(PTRACE_GETFPREGS, pid, 0, fp_regs) < 0) + return -errno; + return 0; +} + +int restore_fp_registers(int pid, unsigned long *fp_regs) +{ + if (ptrace(PTRACE_SETFPREGS, pid, 0, fp_regs) < 0) + return -errno; + return 0; +} + +#ifdef __i386__ +int have_fpx_regs = 1; +int save_fpx_registers(int pid, unsigned long *fp_regs) +{ + if (ptrace(PTRACE_GETFPXREGS, pid, 0, fp_regs) < 0) + return -errno; + return 0; +} + +int restore_fpx_registers(int pid, unsigned long *fp_regs) +{ + if (ptrace(PTRACE_SETFPXREGS, pid, 0, fp_regs) < 0) + return -errno; + return 0; +} + +int get_fp_registers(int pid, unsigned long *regs) +{ + if (have_fpx_regs) + return save_fpx_registers(pid, regs); + else + return save_fp_registers(pid, regs); +} + +int put_fp_registers(int pid, unsigned long *regs) +{ + if (have_fpx_regs) + return restore_fpx_registers(pid, regs); + else + return restore_fp_registers(pid, regs); +} + +void arch_init_registers(int pid) +{ + struct user_fpxregs_struct fpx_regs; + int err; + + err = ptrace(PTRACE_GETFPXREGS, pid, 0, &fpx_regs); + if (!err) + return; + + if (errno != EIO) + panic("check_ptrace : PTRACE_GETFPXREGS failed, errno = %d", + errno); + + have_fpx_regs = 0; +} +#else + +int get_fp_registers(int pid, unsigned long *regs) +{ + return save_fp_registers(pid, regs); +} + +int put_fp_registers(int pid, unsigned long *regs) +{ + return restore_fp_registers(pid, regs); +} + +#endif + +unsigned long get_thread_reg(int reg, jmp_buf *buf) +{ + switch (reg) { +#ifdef __i386__ + case EIP: + return buf[0]->__eip; + case UESP: + return buf[0]->__esp; + case EBP: + return buf[0]->__ebp; +#else + case RIP: + return buf[0]->__rip; + case RSP: + return buf[0]->__rsp; + case RBP: + return buf[0]->__rbp; +#endif + default: + printk(UM_KERN_ERR "get_thread_regs - unknown register %d\n", + reg); + return 0; + } +} diff --git a/arch/x86/um/os-Linux/task_size.c b/arch/x86/um/os-Linux/task_size.c new file mode 100644 index 0000000..efb16c5 --- /dev/null +++ b/arch/x86/um/os-Linux/task_size.c @@ -0,0 +1,150 @@ +#include +#include +#include +#include +#include "longjmp.h" + +#ifdef __i386__ + +static jmp_buf buf; + +static void segfault(int sig) +{ + longjmp(buf, 1); +} + +static int page_ok(unsigned long page) +{ + unsigned long *address = (unsigned long *) (page << UM_KERN_PAGE_SHIFT); + unsigned long n = ~0UL; + void *mapped = NULL; + int ok = 0; + + /* + * First see if the page is readable. If it is, it may still + * be a VDSO, so we go on to see if it's writable. If not + * then try mapping memory there. If that fails, then we're + * still in the kernel area. As a sanity check, we'll fail if + * the mmap succeeds, but gives us an address different from + * what we wanted. + */ + if (setjmp(buf) == 0) + n = *address; + else { + mapped = mmap(address, UM_KERN_PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mapped == MAP_FAILED) + return 0; + if (mapped != address) + goto out; + } + + /* + * Now, is it writeable? If so, then we're in user address + * space. If not, then try mprotecting it and try the write + * again. + */ + if (setjmp(buf) == 0) { + *address = n; + ok = 1; + goto out; + } else if (mprotect(address, UM_KERN_PAGE_SIZE, + PROT_READ | PROT_WRITE) != 0) + goto out; + + if (setjmp(buf) == 0) { + *address = n; + ok = 1; + } + + out: + if (mapped != NULL) + munmap(mapped, UM_KERN_PAGE_SIZE); + return ok; +} + +unsigned long os_get_top_address(void) +{ + struct sigaction sa, old; + unsigned long bottom = 0; + /* + * A 32-bit UML on a 64-bit host gets confused about the VDSO at + * 0xffffe000. It is mapped, is readable, can be reprotected writeable + * and written. However, exec discovers later that it can't be + * unmapped. So, just set the highest address to be checked to just + * below it. This might waste some address space on 4G/4G 32-bit + * hosts, but shouldn't hurt otherwise. + */ + unsigned long top = 0xffffd000 >> UM_KERN_PAGE_SHIFT; + unsigned long test, original; + + printf("Locating the bottom of the address space ... "); + fflush(stdout); + + /* + * We're going to be longjmping out of the signal handler, so + * SA_DEFER needs to be set. + */ + sa.sa_handler = segfault; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_NODEFER; + if (sigaction(SIGSEGV, &sa, &old)) { + perror("os_get_top_address"); + exit(1); + } + + /* Manually scan the address space, bottom-up, until we find + * the first valid page (or run out of them). + */ + for (bottom = 0; bottom < top; bottom++) { + if (page_ok(bottom)) + break; + } + + /* If we've got this far, we ran out of pages. */ + if (bottom == top) { + fprintf(stderr, "Unable to determine bottom of address " + "space.\n"); + exit(1); + } + + printf("0x%x\n", bottom << UM_KERN_PAGE_SHIFT); + printf("Locating the top of the address space ... "); + fflush(stdout); + + original = bottom; + + /* This could happen with a 4G/4G split */ + if (page_ok(top)) + goto out; + + do { + test = bottom + (top - bottom) / 2; + if (page_ok(test)) + bottom = test; + else + top = test; + } while (top - bottom > 1); + +out: + /* Restore the old SIGSEGV handling */ + if (sigaction(SIGSEGV, &old, NULL)) { + perror("os_get_top_address"); + exit(1); + } + top <<= UM_KERN_PAGE_SHIFT; + printf("0x%x\n", top); + + return top; +} + +#else + +unsigned long os_get_top_address(void) +{ + /* The old value of CONFIG_TOP_ADDR */ + return 0x7fc0000000; +} + +#endif diff --git a/arch/x86/um/os-Linux/tls.c b/arch/x86/um/os-Linux/tls.c new file mode 100644 index 0000000..281e83e --- /dev/null +++ b/arch/x86/um/os-Linux/tls.c @@ -0,0 +1,35 @@ +#include +#include + +#include +#include + +#include "sysdep/tls.h" + +/* Checks whether host supports TLS, and sets *tls_min according to the value + * valid on the host. + * i386 host have it == 6; x86_64 host have it == 12, for i386 emulation. */ +void check_host_supports_tls(int *supports_tls, int *tls_min) { + /* Values for x86 and x86_64.*/ + int val[] = {GDT_ENTRY_TLS_MIN_I386, GDT_ENTRY_TLS_MIN_X86_64}; + int i; + + for (i = 0; i < ARRAY_SIZE(val); i++) { + user_desc_t info; + info.entry_number = val[i]; + + if (syscall(__NR_get_thread_area, &info) == 0) { + *tls_min = val[i]; + *supports_tls = 1; + return; + } else { + if (errno == EINVAL) + continue; + else if (errno == ENOSYS) + *supports_tls = 0; + return; + } + } + + *supports_tls = 0; +} diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c new file mode 100644 index 0000000..a174fde --- /dev/null +++ b/arch/x86/um/ptrace_32.c @@ -0,0 +1,273 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include "linux/mm.h" +#include "linux/sched.h" +#include "asm/uaccess.h" +#include "skas.h" + +extern int arch_switch_tls(struct task_struct *to); + +void arch_switch_to(struct task_struct *to) +{ + int err = arch_switch_tls(to); + if (!err) + return; + + if (err != -EINVAL) + printk(KERN_WARNING "arch_switch_tls failed, errno %d, " + "not EINVAL\n", -err); + else + printk(KERN_WARNING "arch_switch_tls failed, errno = EINVAL\n"); +} + +int is_syscall(unsigned long addr) +{ + unsigned short instr; + int n; + + n = copy_from_user(&instr, (void __user *) addr, sizeof(instr)); + if (n) { + /* access_process_vm() grants access to vsyscall and stub, + * while copy_from_user doesn't. Maybe access_process_vm is + * slow, but that doesn't matter, since it will be called only + * in case of singlestepping, if copy_from_user failed. + */ + n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + if (n != sizeof(instr)) { + printk(KERN_ERR "is_syscall : failed to read " + "instruction from 0x%lx\n", addr); + return 1; + } + } + /* int 0x80 or sysenter */ + return (instr == 0x80cd) || (instr == 0x340f); +} + +/* determines which flags the user has access to. */ +/* 1 = access 0 = no access */ +#define FLAG_MASK 0x00044dd5 + +static const int reg_offsets[] = { + [EBX] = HOST_EBX, + [ECX] = HOST_ECX, + [EDX] = HOST_EDX, + [ESI] = HOST_ESI, + [EDI] = HOST_EDI, + [EBP] = HOST_EBP, + [EAX] = HOST_EAX, + [DS] = HOST_DS, + [ES] = HOST_ES, + [FS] = HOST_FS, + [GS] = HOST_GS, + [EIP] = HOST_IP, + [CS] = HOST_CS, + [EFL] = HOST_EFLAGS, + [UESP] = HOST_SP, + [SS] = HOST_SS, +}; + +int putreg(struct task_struct *child, int regno, unsigned long value) +{ + regno >>= 2; + switch (regno) { + case EBX: + case ECX: + case EDX: + case ESI: + case EDI: + case EBP: + case EAX: + case EIP: + case UESP: + break; + case FS: + if (value && (value & 3) != 3) + return -EIO; + break; + case GS: + if (value && (value & 3) != 3) + return -EIO; + break; + case DS: + case ES: + if (value && (value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + case SS: + case CS: + if ((value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + case EFL: + value &= FLAG_MASK; + child->thread.regs.regs.gp[HOST_EFLAGS] |= value; + return 0; + case ORIG_EAX: + child->thread.regs.regs.syscall = value; + return 0; + default : + panic("Bad register in putreg() : %d\n", regno); + } + child->thread.regs.regs.gp[reg_offsets[regno]] = value; + return 0; +} + +int poke_user(struct task_struct *child, long addr, long data) +{ + if ((addr & 3) || addr < 0) + return -EIO; + + if (addr < MAX_REG_OFFSET) + return putreg(child, addr, data); + else if ((addr >= offsetof(struct user, u_debugreg[0])) && + (addr <= offsetof(struct user, u_debugreg[7]))) { + addr -= offsetof(struct user, u_debugreg[0]); + addr = addr >> 2; + if ((addr == 4) || (addr == 5)) + return -EIO; + child->thread.arch.debugregs[addr] = data; + return 0; + } + return -EIO; +} + +unsigned long getreg(struct task_struct *child, int regno) +{ + unsigned long mask = ~0UL; + + regno >>= 2; + switch (regno) { + case ORIG_EAX: + return child->thread.regs.regs.syscall; + case FS: + case GS: + case DS: + case ES: + case SS: + case CS: + mask = 0xffff; + break; + case EIP: + case UESP: + case EAX: + case EBX: + case ECX: + case EDX: + case ESI: + case EDI: + case EBP: + case EFL: + break; + default: + panic("Bad register in getreg() : %d\n", regno); + } + return mask & child->thread.regs.regs.gp[reg_offsets[regno]]; +} + +/* read the word at location addr in the USER area. */ +int peek_user(struct task_struct *child, long addr, long data) +{ + unsigned long tmp; + + if ((addr & 3) || addr < 0) + return -EIO; + + tmp = 0; /* Default return condition */ + if (addr < MAX_REG_OFFSET) { + tmp = getreg(child, addr); + } + else if ((addr >= offsetof(struct user, u_debugreg[0])) && + (addr <= offsetof(struct user, u_debugreg[7]))) { + addr -= offsetof(struct user, u_debugreg[0]); + addr = addr >> 2; + tmp = child->thread.arch.debugregs[addr]; + } + return put_user(tmp, (unsigned long __user *) data); +} + +static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) +{ + int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + struct user_i387_struct fpregs; + + err = save_fp_registers(userspace_pid[cpu], (unsigned long *) &fpregs); + if (err) + return err; + + n = copy_to_user(buf, &fpregs, sizeof(fpregs)); + if(n > 0) + return -EFAULT; + + return n; +} + +static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) +{ + int n, cpu = ((struct thread_info *) child->stack)->cpu; + struct user_i387_struct fpregs; + + n = copy_from_user(&fpregs, buf, sizeof(fpregs)); + if (n > 0) + return -EFAULT; + + return restore_fp_registers(userspace_pid[cpu], + (unsigned long *) &fpregs); +} + +static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) +{ + int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + struct user_fxsr_struct fpregs; + + err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs); + if (err) + return err; + + n = copy_to_user(buf, &fpregs, sizeof(fpregs)); + if(n > 0) + return -EFAULT; + + return n; +} + +static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) +{ + int n, cpu = ((struct thread_info *) child->stack)->cpu; + struct user_fxsr_struct fpregs; + + n = copy_from_user(&fpregs, buf, sizeof(fpregs)); + if (n > 0) + return -EFAULT; + + return restore_fpx_registers(userspace_pid[cpu], + (unsigned long *) &fpregs); +} + +long subarch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret = -EIO; + void __user *datap = (void __user *) data; + switch (request) { + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + ret = get_fpregs(datap, child); + break; + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + ret = set_fpregs(datap, child); + break; + case PTRACE_GETFPXREGS: /* Get the child FPU state. */ + ret = get_fpxregs(datap, child); + break; + case PTRACE_SETFPXREGS: /* Set the child FPU state. */ + ret = set_fpxregs(datap, child); + break; + default: + ret = -EIO; + } + return ret; +} diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c new file mode 100644 index 0000000..44e68e0 --- /dev/null +++ b/arch/x86/um/ptrace_64.c @@ -0,0 +1,271 @@ +/* + * Copyright 2003 PathScale, Inc. + * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * + * Licensed under the GPL + */ + +#include +#include +#include +#define __FRAME_OFFSETS +#include +#include + +/* + * determines which flags the user has access to. + * 1 = access 0 = no access + */ +#define FLAG_MASK 0x44dd5UL + +static const int reg_offsets[] = +{ + [R8 >> 3] = HOST_R8, + [R9 >> 3] = HOST_R9, + [R10 >> 3] = HOST_R10, + [R11 >> 3] = HOST_R11, + [R12 >> 3] = HOST_R12, + [R13 >> 3] = HOST_R13, + [R14 >> 3] = HOST_R14, + [R15 >> 3] = HOST_R15, + [RIP >> 3] = HOST_IP, + [RSP >> 3] = HOST_SP, + [RAX >> 3] = HOST_RAX, + [RBX >> 3] = HOST_RBX, + [RCX >> 3] = HOST_RCX, + [RDX >> 3] = HOST_RDX, + [RSI >> 3] = HOST_RSI, + [RDI >> 3] = HOST_RDI, + [RBP >> 3] = HOST_RBP, + [CS >> 3] = HOST_CS, + [SS >> 3] = HOST_SS, + [FS_BASE >> 3] = HOST_FS_BASE, + [GS_BASE >> 3] = HOST_GS_BASE, + [DS >> 3] = HOST_DS, + [ES >> 3] = HOST_ES, + [FS >> 3] = HOST_FS, + [GS >> 3] = HOST_GS, + [EFLAGS >> 3] = HOST_EFLAGS, + [ORIG_RAX >> 3] = HOST_ORIG_RAX, +}; + +int putreg(struct task_struct *child, int regno, unsigned long value) +{ +#ifdef TIF_IA32 + /* + * Some code in the 64bit emulation may not be 64bit clean. + * Don't take any chances. + */ + if (test_tsk_thread_flag(child, TIF_IA32)) + value &= 0xffffffff; +#endif + switch (regno) { + case R8: + case R9: + case R10: + case R11: + case R12: + case R13: + case R14: + case R15: + case RIP: + case RSP: + case RAX: + case RBX: + case RCX: + case RDX: + case RSI: + case RDI: + case RBP: + case ORIG_RAX: + break; + + case FS: + case GS: + case DS: + case ES: + case SS: + case CS: + if (value && (value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + + case FS_BASE: + case GS_BASE: + if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) + return -EIO; + break; + + case EFLAGS: + value &= FLAG_MASK; + child->thread.regs.regs.gp[HOST_EFLAGS] |= value; + return 0; + + default: + panic("Bad register in putreg(): %d\n", regno); + } + + child->thread.regs.regs.gp[reg_offsets[regno >> 3]] = value; + return 0; +} + +int poke_user(struct task_struct *child, long addr, long data) +{ + if ((addr & 3) || addr < 0) + return -EIO; + + if (addr < MAX_REG_OFFSET) + return putreg(child, addr, data); + else if ((addr >= offsetof(struct user, u_debugreg[0])) && + (addr <= offsetof(struct user, u_debugreg[7]))) { + addr -= offsetof(struct user, u_debugreg[0]); + addr = addr >> 2; + if ((addr == 4) || (addr == 5)) + return -EIO; + child->thread.arch.debugregs[addr] = data; + return 0; + } + return -EIO; +} + +unsigned long getreg(struct task_struct *child, int regno) +{ + unsigned long mask = ~0UL; +#ifdef TIF_IA32 + if (test_tsk_thread_flag(child, TIF_IA32)) + mask = 0xffffffff; +#endif + switch (regno) { + case R8: + case R9: + case R10: + case R11: + case R12: + case R13: + case R14: + case R15: + case RIP: + case RSP: + case RAX: + case RBX: + case RCX: + case RDX: + case RSI: + case RDI: + case RBP: + case ORIG_RAX: + case EFLAGS: + case FS_BASE: + case GS_BASE: + break; + case FS: + case GS: + case DS: + case ES: + case SS: + case CS: + mask = 0xffff; + break; + default: + panic("Bad register in getreg: %d\n", regno); + } + return mask & child->thread.regs.regs.gp[reg_offsets[regno >> 3]]; +} + +int peek_user(struct task_struct *child, long addr, long data) +{ + /* read the word at location addr in the USER area. */ + unsigned long tmp; + + if ((addr & 3) || addr < 0) + return -EIO; + + tmp = 0; /* Default return condition */ + if (addr < MAX_REG_OFFSET) + tmp = getreg(child, addr); + else if ((addr >= offsetof(struct user, u_debugreg[0])) && + (addr <= offsetof(struct user, u_debugreg[7]))) { + addr -= offsetof(struct user, u_debugreg[0]); + addr = addr >> 2; + tmp = child->thread.arch.debugregs[addr]; + } + return put_user(tmp, (unsigned long *) data); +} + +/* XXX Mostly copied from sys-i386 */ +int is_syscall(unsigned long addr) +{ + unsigned short instr; + int n; + + n = copy_from_user(&instr, (void __user *) addr, sizeof(instr)); + if (n) { + /* + * access_process_vm() grants access to vsyscall and stub, + * while copy_from_user doesn't. Maybe access_process_vm is + * slow, but that doesn't matter, since it will be called only + * in case of singlestepping, if copy_from_user failed. + */ + n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + if (n != sizeof(instr)) { + printk("is_syscall : failed to read instruction from " + "0x%lx\n", addr); + return 1; + } + } + /* sysenter */ + return instr == 0x050f; +} + +static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) +{ + int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + long fpregs[HOST_FP_SIZE]; + + BUG_ON(sizeof(*buf) != sizeof(fpregs)); + err = save_fp_registers(userspace_pid[cpu], fpregs); + if (err) + return err; + + n = copy_to_user(buf, fpregs, sizeof(fpregs)); + if (n > 0) + return -EFAULT; + + return n; +} + +static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) +{ + int n, cpu = ((struct thread_info *) child->stack)->cpu; + long fpregs[HOST_FP_SIZE]; + + BUG_ON(sizeof(*buf) != sizeof(fpregs)); + n = copy_from_user(fpregs, buf, sizeof(fpregs)); + if (n > 0) + return -EFAULT; + + return restore_fp_registers(userspace_pid[cpu], fpregs); +} + +long subarch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret = -EIO; + void __user *datap = (void __user *) data; + + switch (request) { + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + ret = get_fpregs(datap, child); + break; + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + ret = set_fpregs(datap, child); + break; + case PTRACE_ARCH_PRCTL: + /* XXX Calls ptrace on the host - needs some SMP thinking */ + ret = arch_prctl(child, data, (void __user *) addr); + break; + } + + return ret; +} diff --git a/arch/x86/um/ptrace_user.c b/arch/x86/um/ptrace_user.c new file mode 100644 index 0000000..3960ca1 --- /dev/null +++ b/arch/x86/um/ptrace_user.c @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include "ptrace_user.h" + +int ptrace_getregs(long pid, unsigned long *regs_out) +{ + if (ptrace(PTRACE_GETREGS, pid, 0, regs_out) < 0) + return -errno; + return 0; +} + +int ptrace_setregs(long pid, unsigned long *regs) +{ + if (ptrace(PTRACE_SETREGS, pid, 0, regs) < 0) + return -errno; + return 0; +} diff --git a/arch/x86/um/setjmp_32.S b/arch/x86/um/setjmp_32.S new file mode 100644 index 0000000..b766792 --- /dev/null +++ b/arch/x86/um/setjmp_32.S @@ -0,0 +1,58 @@ +# +# arch/i386/setjmp.S +# +# setjmp/longjmp for the i386 architecture +# + +# +# The jmp_buf is assumed to contain the following, in order: +# %ebx +# %esp +# %ebp +# %esi +# %edi +# +# + + .text + .align 4 + .globl setjmp + .type setjmp, @function +setjmp: +#ifdef _REGPARM + movl %eax,%edx +#else + movl 4(%esp),%edx +#endif + popl %ecx # Return address, and adjust the stack + xorl %eax,%eax # Return value + movl %ebx,(%edx) + movl %esp,4(%edx) # Post-return %esp! + pushl %ecx # Make the call/return stack happy + movl %ebp,8(%edx) + movl %esi,12(%edx) + movl %edi,16(%edx) + movl %ecx,20(%edx) # Return address + ret + + .size setjmp,.-setjmp + + .text + .align 4 + .globl longjmp + .type longjmp, @function +longjmp: +#ifdef _REGPARM + xchgl %eax,%edx +#else + movl 4(%esp),%edx # jmp_ptr address + movl 8(%esp),%eax # Return value +#endif + movl (%edx),%ebx + movl 4(%edx),%esp + movl 8(%edx),%ebp + movl 12(%edx),%esi + movl 16(%edx),%edi + jmp *20(%edx) + + .size longjmp,.-longjmp diff --git a/arch/x86/um/setjmp_64.S b/arch/x86/um/setjmp_64.S new file mode 100644 index 0000000..45f547b --- /dev/null +++ b/arch/x86/um/setjmp_64.S @@ -0,0 +1,54 @@ +# +# arch/x86_64/setjmp.S +# +# setjmp/longjmp for the x86-64 architecture +# + +# +# The jmp_buf is assumed to contain the following, in order: +# %rbx +# %rsp (post-return) +# %rbp +# %r12 +# %r13 +# %r14 +# %r15 +# +# + + .text + .align 4 + .globl setjmp + .type setjmp, @function +setjmp: + pop %rsi # Return address, and adjust the stack + xorl %eax,%eax # Return value + movq %rbx,(%rdi) + movq %rsp,8(%rdi) # Post-return %rsp! + push %rsi # Make the call/return stack happy + movq %rbp,16(%rdi) + movq %r12,24(%rdi) + movq %r13,32(%rdi) + movq %r14,40(%rdi) + movq %r15,48(%rdi) + movq %rsi,56(%rdi) # Return address + ret + + .size setjmp,.-setjmp + + .text + .align 4 + .globl longjmp + .type longjmp, @function +longjmp: + movl %esi,%eax # Return value (int) + movq (%rdi),%rbx + movq 8(%rdi),%rsp + movq 16(%rdi),%rbp + movq 24(%rdi),%r12 + movq 32(%rdi),%r13 + movq 40(%rdi),%r14 + movq 48(%rdi),%r15 + jmp *56(%rdi) + + .size longjmp,.-longjmp diff --git a/arch/x86/um/shared/sysdep/archsetjmp.h b/arch/x86/um/shared/sysdep/archsetjmp.h new file mode 100644 index 0000000..ff7766d --- /dev/null +++ b/arch/x86/um/shared/sysdep/archsetjmp.h @@ -0,0 +1,5 @@ +#ifdef __i386__ +#include "archsetjmp_32.h" +#else +#include "archsetjmp_64.h" +#endif diff --git a/arch/x86/um/shared/sysdep/archsetjmp_32.h b/arch/x86/um/shared/sysdep/archsetjmp_32.h new file mode 100644 index 0000000..0f31208 --- /dev/null +++ b/arch/x86/um/shared/sysdep/archsetjmp_32.h @@ -0,0 +1,22 @@ +/* + * arch/um/include/sysdep-i386/archsetjmp.h + */ + +#ifndef _KLIBC_ARCHSETJMP_H +#define _KLIBC_ARCHSETJMP_H + +struct __jmp_buf { + unsigned int __ebx; + unsigned int __esp; + unsigned int __ebp; + unsigned int __esi; + unsigned int __edi; + unsigned int __eip; +}; + +typedef struct __jmp_buf jmp_buf[1]; + +#define JB_IP __eip +#define JB_SP __esp + +#endif /* _SETJMP_H */ diff --git a/arch/x86/um/shared/sysdep/archsetjmp_64.h b/arch/x86/um/shared/sysdep/archsetjmp_64.h new file mode 100644 index 0000000..2af8f12 --- /dev/null +++ b/arch/x86/um/shared/sysdep/archsetjmp_64.h @@ -0,0 +1,24 @@ +/* + * arch/um/include/sysdep-x86_64/archsetjmp.h + */ + +#ifndef _KLIBC_ARCHSETJMP_H +#define _KLIBC_ARCHSETJMP_H + +struct __jmp_buf { + unsigned long __rbx; + unsigned long __rsp; + unsigned long __rbp; + unsigned long __r12; + unsigned long __r13; + unsigned long __r14; + unsigned long __r15; + unsigned long __rip; +}; + +typedef struct __jmp_buf jmp_buf[1]; + +#define JB_IP __rip +#define JB_SP __rsp + +#endif /* _SETJMP_H */ diff --git a/arch/x86/um/shared/sysdep/faultinfo.h b/arch/x86/um/shared/sysdep/faultinfo.h new file mode 100644 index 0000000..862ecb1 --- /dev/null +++ b/arch/x86/um/shared/sysdep/faultinfo.h @@ -0,0 +1,5 @@ +#ifdef __i386__ +#include "faultinfo_32.h" +#else +#include "faultinfo_64.h" +#endif diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h new file mode 100644 index 0000000..a26086b --- /dev/null +++ b/arch/x86/um/shared/sysdep/faultinfo_32.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2004 Fujitsu Siemens Computers GmbH + * Author: Bodo Stroesser + * Licensed under the GPL + */ + +#ifndef __FAULTINFO_I386_H +#define __FAULTINFO_I386_H + +/* this structure contains the full arch-specific faultinfo + * from the traps. + * On i386, ptrace_faultinfo unfortunately doesn't provide + * all the info, since trap_no is missing. + * All common elements are defined at the same position in + * both structures, thus making it easy to copy the + * contents without knowledge about the structure elements. + */ +struct faultinfo { + int error_code; /* in ptrace_faultinfo misleadingly called is_write */ + unsigned long cr2; /* in ptrace_faultinfo called addr */ + int trap_no; /* missing in ptrace_faultinfo */ +}; + +#define FAULT_WRITE(fi) ((fi).error_code & 2) +#define FAULT_ADDRESS(fi) ((fi).cr2) + +/* This is Page Fault */ +#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) + +/* SKAS3 has no trap_no on i386, but get_skas_faultinfo() sets it to 0. */ +#define SEGV_MAYBE_FIXABLE(fi) ((fi)->trap_no == 0 && ptrace_faultinfo) + +#define PTRACE_FULL_FAULTINFO 0 + +#endif diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h new file mode 100644 index 0000000..f811cbe --- /dev/null +++ b/arch/x86/um/shared/sysdep/faultinfo_64.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2004 Fujitsu Siemens Computers GmbH + * Author: Bodo Stroesser + * Licensed under the GPL + */ + +#ifndef __FAULTINFO_X86_64_H +#define __FAULTINFO_X86_64_H + +/* this structure contains the full arch-specific faultinfo + * from the traps. + * On i386, ptrace_faultinfo unfortunately doesn't provide + * all the info, since trap_no is missing. + * All common elements are defined at the same position in + * both structures, thus making it easy to copy the + * contents without knowledge about the structure elements. + */ +struct faultinfo { + int error_code; /* in ptrace_faultinfo misleadingly called is_write */ + unsigned long cr2; /* in ptrace_faultinfo called addr */ + int trap_no; /* missing in ptrace_faultinfo */ +}; + +#define FAULT_WRITE(fi) ((fi).error_code & 2) +#define FAULT_ADDRESS(fi) ((fi).cr2) + +/* This is Page Fault */ +#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) + +/* No broken SKAS API, which doesn't pass trap_no, here. */ +#define SEGV_MAYBE_FIXABLE(fi) 0 + +#define PTRACE_FULL_FAULTINFO 1 + +#endif diff --git a/arch/x86/um/shared/sysdep/host_ldt.h b/arch/x86/um/shared/sysdep/host_ldt.h new file mode 100644 index 0000000..94518b3 --- /dev/null +++ b/arch/x86/um/shared/sysdep/host_ldt.h @@ -0,0 +1,5 @@ +#ifdef __i386__ +#include "host_ldt_32.h" +#else +#include "host_ldt_64.h" +#endif diff --git a/arch/x86/um/shared/sysdep/host_ldt_32.h b/arch/x86/um/shared/sysdep/host_ldt_32.h new file mode 100644 index 0000000..0953cc4 --- /dev/null +++ b/arch/x86/um/shared/sysdep/host_ldt_32.h @@ -0,0 +1,34 @@ +#ifndef __ASM_HOST_LDT_I386_H +#define __ASM_HOST_LDT_I386_H + +#include + +/* + * macros stolen from include/asm-i386/desc.h + */ +#define LDT_entry_a(info) \ + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) + +#define LDT_entry_b(info) \ + (((info)->base_addr & 0xff000000) | \ + (((info)->base_addr & 0x00ff0000) >> 16) | \ + ((info)->limit & 0xf0000) | \ + (((info)->read_exec_only ^ 1) << 9) | \ + ((info)->contents << 10) | \ + (((info)->seg_not_present ^ 1) << 15) | \ + ((info)->seg_32bit << 22) | \ + ((info)->limit_in_pages << 23) | \ + ((info)->useable << 20) | \ + 0x7000) + +#define LDT_empty(info) (\ + (info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0 ) + +#endif diff --git a/arch/x86/um/shared/sysdep/host_ldt_64.h b/arch/x86/um/shared/sysdep/host_ldt_64.h new file mode 100644 index 0000000..e8b1be1 --- /dev/null +++ b/arch/x86/um/shared/sysdep/host_ldt_64.h @@ -0,0 +1,38 @@ +#ifndef __ASM_HOST_LDT_X86_64_H +#define __ASM_HOST_LDT_X86_64_H + +#include + +/* + * macros stolen from include/asm-x86_64/desc.h + */ +#define LDT_entry_a(info) \ + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) + +/* Don't allow setting of the lm bit. It is useless anyways because + * 64bit system calls require __USER_CS. */ +#define LDT_entry_b(info) \ + (((info)->base_addr & 0xff000000) | \ + (((info)->base_addr & 0x00ff0000) >> 16) | \ + ((info)->limit & 0xf0000) | \ + (((info)->read_exec_only ^ 1) << 9) | \ + ((info)->contents << 10) | \ + (((info)->seg_not_present ^ 1) << 15) | \ + ((info)->seg_32bit << 22) | \ + ((info)->limit_in_pages << 23) | \ + ((info)->useable << 20) | \ + /* ((info)->lm << 21) | */ \ + 0x7000) + +#define LDT_empty(info) (\ + (info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0 && \ + (info)->lm == 0) + +#endif diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h new file mode 100644 index 0000000..5868526 --- /dev/null +++ b/arch/x86/um/shared/sysdep/kernel-offsets.h @@ -0,0 +1,21 @@ +#include +#include +#include +#include +#include + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define STR(x) #x +#define DEFINE_STR(sym, val) asm volatile("\n->" #sym " " STR(val) " " #val: : ) + +#define BLANK() asm volatile("\n->" : : ) + +#define OFFSET(sym, str, mem) \ + DEFINE(sym, offsetof(struct str, mem)); + +void foo(void) +{ +#include +} diff --git a/arch/x86/um/shared/sysdep/mcontext.h b/arch/x86/um/shared/sysdep/mcontext.h new file mode 100644 index 0000000..b724c54 --- /dev/null +++ b/arch/x86/um/shared/sysdep/mcontext.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#ifndef __SYS_SIGCONTEXT_X86_H +#define __SYS_SIGCONTEXT_X86_H + +extern void get_regs_from_mc(struct uml_pt_regs *, mcontext_t *); + +#ifdef __i386__ + +#define GET_FAULTINFO_FROM_MC(fi, mc) \ + { \ + (fi).cr2 = (mc)->cr2; \ + (fi).error_code = (mc)->gregs[REG_ERR]; \ + (fi).trap_no = (mc)->gregs[REG_TRAPNO]; \ + } + +#else + +#define GET_FAULTINFO_FROM_MC(fi, mc) \ + { \ + (fi).cr2 = (mc)->gregs[REG_CR2]; \ + (fi).error_code = (mc)->gregs[REG_ERR]; \ + (fi).trap_no = (mc)->gregs[REG_TRAPNO]; \ + } + +#endif + +#endif diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h new file mode 100644 index 0000000..711b162 --- /dev/null +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -0,0 +1,5 @@ +#ifdef __i386__ +#include "ptrace_32.h" +#else +#include "ptrace_64.h" +#endif diff --git a/arch/x86/um/shared/sysdep/ptrace_32.h b/arch/x86/um/shared/sysdep/ptrace_32.h new file mode 100644 index 0000000..ce77fa1 --- /dev/null +++ b/arch/x86/um/shared/sysdep/ptrace_32.h @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#ifndef __SYSDEP_I386_PTRACE_H +#define __SYSDEP_I386_PTRACE_H + +#include +#include "sysdep/faultinfo.h" + +#define MAX_REG_NR (UM_FRAME_SIZE / sizeof(unsigned long)) +#define MAX_REG_OFFSET (UM_FRAME_SIZE) + +static inline void update_debugregs(int seq) {} + +/* syscall emulation path in ptrace */ + +#ifndef PTRACE_SYSEMU +#define PTRACE_SYSEMU 31 +#endif + +void set_using_sysemu(int value); +int get_using_sysemu(void); +extern int sysemu_supported; + +#define REGS_IP(r) ((r)[HOST_IP]) +#define REGS_SP(r) ((r)[HOST_SP]) +#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) +#define REGS_EAX(r) ((r)[HOST_EAX]) +#define REGS_EBX(r) ((r)[HOST_EBX]) +#define REGS_ECX(r) ((r)[HOST_ECX]) +#define REGS_EDX(r) ((r)[HOST_EDX]) +#define REGS_ESI(r) ((r)[HOST_ESI]) +#define REGS_EDI(r) ((r)[HOST_EDI]) +#define REGS_EBP(r) ((r)[HOST_EBP]) +#define REGS_CS(r) ((r)[HOST_CS]) +#define REGS_SS(r) ((r)[HOST_SS]) +#define REGS_DS(r) ((r)[HOST_DS]) +#define REGS_ES(r) ((r)[HOST_ES]) +#define REGS_FS(r) ((r)[HOST_FS]) +#define REGS_GS(r) ((r)[HOST_GS]) + +#define REGS_SET_SYSCALL_RETURN(r, res) REGS_EAX(r) = (res) + +#define IP_RESTART_SYSCALL(ip) ((ip) -= 2) +#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r)) + +#ifndef PTRACE_SYSEMU_SINGLESTEP +#define PTRACE_SYSEMU_SINGLESTEP 32 +#endif + +struct uml_pt_regs { + unsigned long gp[MAX_REG_NR]; + unsigned long fp[HOST_FPX_SIZE]; + struct faultinfo faultinfo; + long syscall; + int is_user; +}; + +#define EMPTY_UML_PT_REGS { } + +#define UPT_IP(r) REGS_IP((r)->gp) +#define UPT_SP(r) REGS_SP((r)->gp) +#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp) +#define UPT_EAX(r) REGS_EAX((r)->gp) +#define UPT_EBX(r) REGS_EBX((r)->gp) +#define UPT_ECX(r) REGS_ECX((r)->gp) +#define UPT_EDX(r) REGS_EDX((r)->gp) +#define UPT_ESI(r) REGS_ESI((r)->gp) +#define UPT_EDI(r) REGS_EDI((r)->gp) +#define UPT_EBP(r) REGS_EBP((r)->gp) +#define UPT_ORIG_EAX(r) ((r)->syscall) +#define UPT_CS(r) REGS_CS((r)->gp) +#define UPT_SS(r) REGS_SS((r)->gp) +#define UPT_DS(r) REGS_DS((r)->gp) +#define UPT_ES(r) REGS_ES((r)->gp) +#define UPT_FS(r) REGS_FS((r)->gp) +#define UPT_GS(r) REGS_GS((r)->gp) + +#define UPT_SYSCALL_ARG1(r) UPT_EBX(r) +#define UPT_SYSCALL_ARG2(r) UPT_ECX(r) +#define UPT_SYSCALL_ARG3(r) UPT_EDX(r) +#define UPT_SYSCALL_ARG4(r) UPT_ESI(r) +#define UPT_SYSCALL_ARG5(r) UPT_EDI(r) +#define UPT_SYSCALL_ARG6(r) UPT_EBP(r) + +extern int user_context(unsigned long sp); + +#define UPT_IS_USER(r) ((r)->is_user) + +struct syscall_args { + unsigned long args[6]; +}; + +#define SYSCALL_ARGS(r) ((struct syscall_args) \ + { .args = { UPT_SYSCALL_ARG1(r), \ + UPT_SYSCALL_ARG2(r), \ + UPT_SYSCALL_ARG3(r), \ + UPT_SYSCALL_ARG4(r), \ + UPT_SYSCALL_ARG5(r), \ + UPT_SYSCALL_ARG6(r) } } ) + +#define UPT_SET_SYSCALL_RETURN(r, res) \ + REGS_SET_SYSCALL_RETURN((r)->regs, (res)) + +#define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp) + +#define UPT_ORIG_SYSCALL(r) UPT_EAX(r) +#define UPT_SYSCALL_NR(r) UPT_ORIG_EAX(r) +#define UPT_SYSCALL_RET(r) UPT_EAX(r) + +#define UPT_FAULTINFO(r) (&(r)->faultinfo) + +extern void arch_init_registers(int pid); + +#endif diff --git a/arch/x86/um/shared/sysdep/ptrace_64.h b/arch/x86/um/shared/sysdep/ptrace_64.h new file mode 100644 index 0000000..866fe7e --- /dev/null +++ b/arch/x86/um/shared/sysdep/ptrace_64.h @@ -0,0 +1,160 @@ +/* + * Copyright 2003 PathScale, Inc. + * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * + * Licensed under the GPL + */ + +#ifndef __SYSDEP_X86_64_PTRACE_H +#define __SYSDEP_X86_64_PTRACE_H + +#include +#include "sysdep/faultinfo.h" + +#define MAX_REG_OFFSET (UM_FRAME_SIZE) +#define MAX_REG_NR ((MAX_REG_OFFSET) / sizeof(unsigned long)) + +#define REGS_IP(r) ((r)[HOST_IP]) +#define REGS_SP(r) ((r)[HOST_SP]) + +#define REGS_RBX(r) ((r)[HOST_RBX]) +#define REGS_RCX(r) ((r)[HOST_RCX]) +#define REGS_RDX(r) ((r)[HOST_RDX]) +#define REGS_RSI(r) ((r)[HOST_RSI]) +#define REGS_RDI(r) ((r)[HOST_RDI]) +#define REGS_RBP(r) ((r)[HOST_RBP]) +#define REGS_RAX(r) ((r)[HOST_RAX]) +#define REGS_R8(r) ((r)[HOST_R8]) +#define REGS_R9(r) ((r)[HOST_R9]) +#define REGS_R10(r) ((r)[HOST_R10]) +#define REGS_R11(r) ((r)[HOST_R11]) +#define REGS_R12(r) ((r)[HOST_R12]) +#define REGS_R13(r) ((r)[HOST_R13]) +#define REGS_R14(r) ((r)[HOST_R14]) +#define REGS_R15(r) ((r)[HOST_R15]) +#define REGS_CS(r) ((r)[HOST_CS]) +#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) +#define REGS_SS(r) ((r)[HOST_SS]) + +#define HOST_FS_BASE 21 +#define HOST_GS_BASE 22 +#define HOST_DS 23 +#define HOST_ES 24 +#define HOST_FS 25 +#define HOST_GS 26 + +/* Also defined in asm/ptrace-x86_64.h, but not in libc headers. So, these + * are already defined for kernel code, but not for userspace code. + */ +#ifndef FS_BASE +/* These aren't defined in ptrace.h, but exist in struct user_regs_struct, + * which is what x86_64 ptrace actually uses. + */ +#define FS_BASE (HOST_FS_BASE * sizeof(long)) +#define GS_BASE (HOST_GS_BASE * sizeof(long)) +#define DS (HOST_DS * sizeof(long)) +#define ES (HOST_ES * sizeof(long)) +#define FS (HOST_FS * sizeof(long)) +#define GS (HOST_GS * sizeof(long)) +#endif + +#define REGS_FS_BASE(r) ((r)[HOST_FS_BASE]) +#define REGS_GS_BASE(r) ((r)[HOST_GS_BASE]) +#define REGS_DS(r) ((r)[HOST_DS]) +#define REGS_ES(r) ((r)[HOST_ES]) +#define REGS_FS(r) ((r)[HOST_FS]) +#define REGS_GS(r) ((r)[HOST_GS]) + +#define REGS_ORIG_RAX(r) ((r)[HOST_ORIG_RAX]) + +#define REGS_SET_SYSCALL_RETURN(r, res) REGS_RAX(r) = (res) + +#define IP_RESTART_SYSCALL(ip) ((ip) -= 2) +#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r)) + +#define REGS_FAULT_ADDR(r) ((r)->fault_addr) + +#define REGS_FAULT_WRITE(r) FAULT_WRITE((r)->fault_type) + +#define REGS_TRAP(r) ((r)->trap_type) + +#define REGS_ERR(r) ((r)->fault_type) + +struct uml_pt_regs { + unsigned long gp[MAX_REG_NR]; + unsigned long fp[HOST_FP_SIZE]; + struct faultinfo faultinfo; + long syscall; + int is_user; +}; + +#define EMPTY_UML_PT_REGS { } + +#define UPT_RBX(r) REGS_RBX((r)->gp) +#define UPT_RCX(r) REGS_RCX((r)->gp) +#define UPT_RDX(r) REGS_RDX((r)->gp) +#define UPT_RSI(r) REGS_RSI((r)->gp) +#define UPT_RDI(r) REGS_RDI((r)->gp) +#define UPT_RBP(r) REGS_RBP((r)->gp) +#define UPT_RAX(r) REGS_RAX((r)->gp) +#define UPT_R8(r) REGS_R8((r)->gp) +#define UPT_R9(r) REGS_R9((r)->gp) +#define UPT_R10(r) REGS_R10((r)->gp) +#define UPT_R11(r) REGS_R11((r)->gp) +#define UPT_R12(r) REGS_R12((r)->gp) +#define UPT_R13(r) REGS_R13((r)->gp) +#define UPT_R14(r) REGS_R14((r)->gp) +#define UPT_R15(r) REGS_R15((r)->gp) +#define UPT_CS(r) REGS_CS((r)->gp) +#define UPT_FS_BASE(r) REGS_FS_BASE((r)->gp) +#define UPT_FS(r) REGS_FS((r)->gp) +#define UPT_GS_BASE(r) REGS_GS_BASE((r)->gp) +#define UPT_GS(r) REGS_GS((r)->gp) +#define UPT_DS(r) REGS_DS((r)->gp) +#define UPT_ES(r) REGS_ES((r)->gp) +#define UPT_CS(r) REGS_CS((r)->gp) +#define UPT_SS(r) REGS_SS((r)->gp) +#define UPT_ORIG_RAX(r) REGS_ORIG_RAX((r)->gp) + +#define UPT_IP(r) REGS_IP((r)->gp) +#define UPT_SP(r) REGS_SP((r)->gp) + +#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp) +#define UPT_SYSCALL_NR(r) ((r)->syscall) +#define UPT_SYSCALL_RET(r) UPT_RAX(r) + +extern int user_context(unsigned long sp); + +#define UPT_IS_USER(r) ((r)->is_user) + +#define UPT_SYSCALL_ARG1(r) UPT_RDI(r) +#define UPT_SYSCALL_ARG2(r) UPT_RSI(r) +#define UPT_SYSCALL_ARG3(r) UPT_RDX(r) +#define UPT_SYSCALL_ARG4(r) UPT_R10(r) +#define UPT_SYSCALL_ARG5(r) UPT_R8(r) +#define UPT_SYSCALL_ARG6(r) UPT_R9(r) + +struct syscall_args { + unsigned long args[6]; +}; + +#define SYSCALL_ARGS(r) ((struct syscall_args) \ + { .args = { UPT_SYSCALL_ARG1(r), \ + UPT_SYSCALL_ARG2(r), \ + UPT_SYSCALL_ARG3(r), \ + UPT_SYSCALL_ARG4(r), \ + UPT_SYSCALL_ARG5(r), \ + UPT_SYSCALL_ARG6(r) } } ) + +#define UPT_SET_SYSCALL_RETURN(r, res) \ + REGS_SET_SYSCALL_RETURN((r)->regs, (res)) + +#define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp) + +#define UPT_FAULTINFO(r) (&(r)->faultinfo) + +static inline void arch_init_registers(int pid) +{ +} + +#endif diff --git a/arch/x86/um/shared/sysdep/ptrace_user.h b/arch/x86/um/shared/sysdep/ptrace_user.h new file mode 100644 index 0000000..a92f883 --- /dev/null +++ b/arch/x86/um/shared/sysdep/ptrace_user.h @@ -0,0 +1,5 @@ +#ifdef __i386__ +#include "ptrace_user_32.h" +#else +#include "ptrace_user_64.h" +#endif diff --git a/arch/x86/um/shared/sysdep/ptrace_user_32.h b/arch/x86/um/shared/sysdep/ptrace_user_32.h new file mode 100644 index 0000000..9d88a79 --- /dev/null +++ b/arch/x86/um/shared/sysdep/ptrace_user_32.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#ifndef __SYSDEP_I386_PTRACE_USER_H__ +#define __SYSDEP_I386_PTRACE_USER_H__ + +#include +#include +#include +#include + +#define PT_OFFSET(r) ((r) * sizeof(long)) + +#define PT_SYSCALL_NR(regs) ((regs)[ORIG_EAX]) +#define PT_SYSCALL_NR_OFFSET PT_OFFSET(ORIG_EAX) + +#define PT_SYSCALL_RET_OFFSET PT_OFFSET(EAX) + +#define REGS_IP_INDEX EIP +#define REGS_SP_INDEX UESP + +#define FP_SIZE ((HOST_FPX_SIZE > HOST_FP_SIZE) ? HOST_FPX_SIZE : HOST_FP_SIZE) + +#endif diff --git a/arch/x86/um/shared/sysdep/ptrace_user_64.h b/arch/x86/um/shared/sysdep/ptrace_user_64.h new file mode 100644 index 0000000..2f1b6e3 --- /dev/null +++ b/arch/x86/um/shared/sysdep/ptrace_user_64.h @@ -0,0 +1,38 @@ +/* + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#ifndef __SYSDEP_X86_64_PTRACE_USER_H__ +#define __SYSDEP_X86_64_PTRACE_USER_H__ + +#define __FRAME_OFFSETS +#include +#include +#include +#undef __FRAME_OFFSETS +#include + +#define PT_INDEX(off) ((off) / sizeof(unsigned long)) + +#define PT_SYSCALL_NR(regs) ((regs)[PT_INDEX(ORIG_RAX)]) +#define PT_SYSCALL_NR_OFFSET (ORIG_RAX) + +#define PT_SYSCALL_RET_OFFSET (RAX) + +/* + * x86_64 FC3 doesn't define this in /usr/include/linux/ptrace.h even though + * it's defined in the kernel's include/linux/ptrace.h. Additionally, use the + * 2.4 name and value for 2.4 host compatibility. + */ +#ifndef PTRACE_OLDSETOPTIONS +#define PTRACE_OLDSETOPTIONS 21 +#endif + +#define REGS_IP_INDEX PT_INDEX(RIP) +#define REGS_SP_INDEX PT_INDEX(RSP) + +#define FP_SIZE (HOST_FP_SIZE) + +#endif diff --git a/arch/x86/um/shared/sysdep/skas_ptrace.h b/arch/x86/um/shared/sysdep/skas_ptrace.h new file mode 100644 index 0000000..453febe --- /dev/null +++ b/arch/x86/um/shared/sysdep/skas_ptrace.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#ifndef __SYSDEP_X86_SKAS_PTRACE_H +#define __SYSDEP_X86_SKAS_PTRACE_H + +struct ptrace_faultinfo { + int is_write; + unsigned long addr; +}; + +struct ptrace_ldt { + int func; + void *ptr; + unsigned long bytecount; +}; + +#define PTRACE_LDT 54 + +#endif diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h new file mode 100644 index 0000000..bd161e3 --- /dev/null +++ b/arch/x86/um/shared/sysdep/stub.h @@ -0,0 +1,14 @@ +#include +#include +#include +#include "as-layout.h" +#include "stub-data.h" + +#ifdef __i386__ +#include "stub_32.h" +#else +#include "stub_64.h" +#endif + +extern void stub_segv_handler(int, siginfo_t *, void *); +extern void stub_clone_handler(void); diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h new file mode 100644 index 0000000..51fd256 --- /dev/null +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#ifndef __SYSDEP_STUB_H +#define __SYSDEP_STUB_H + +#include + +#define STUB_SYSCALL_RET EAX +#define STUB_MMAP_NR __NR_mmap2 +#define MMAP_OFFSET(o) ((o) >> UM_KERN_PAGE_SHIFT) + +static inline long stub_syscall0(long syscall) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall)); + + return ret; +} + +static inline long stub_syscall1(long syscall, long arg1) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1)); + + return ret; +} + +static inline long stub_syscall2(long syscall, long arg1, long arg2) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1), + "c" (arg2)); + + return ret; +} + +static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1), + "c" (arg2), "d" (arg3)); + + return ret; +} + +static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3, + long arg4) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1), + "c" (arg2), "d" (arg3), "S" (arg4)); + + return ret; +} + +static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3, + long arg4, long arg5) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1), + "c" (arg2), "d" (arg3), "S" (arg4), "D" (arg5)); + + return ret; +} + +static inline void trap_myself(void) +{ + __asm("int3"); +} + +static inline void remap_stack(int fd, unsigned long offset) +{ + __asm__ volatile ("movl %%eax,%%ebp ; movl %0,%%eax ; int $0x80 ;" + "movl %7, %%ebx ; movl %%eax, (%%ebx)" + : : "g" (STUB_MMAP_NR), "b" (STUB_DATA), + "c" (UM_KERN_PAGE_SIZE), + "d" (PROT_READ | PROT_WRITE), + "S" (MAP_FIXED | MAP_SHARED), "D" (fd), + "a" (offset), + "i" (&((struct stub_data *) STUB_DATA)->err) + : "memory"); +} + +#endif diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h new file mode 100644 index 0000000..994df93 --- /dev/null +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#ifndef __SYSDEP_STUB_H +#define __SYSDEP_STUB_H + +#include + +#define STUB_SYSCALL_RET PT_INDEX(RAX) +#define STUB_MMAP_NR __NR_mmap +#define MMAP_OFFSET(o) (o) + +#define __syscall_clobber "r11","rcx","memory" +#define __syscall "syscall" + +static inline long stub_syscall0(long syscall) +{ + long ret; + + __asm__ volatile (__syscall + : "=a" (ret) + : "0" (syscall) : __syscall_clobber ); + + return ret; +} + +static inline long stub_syscall2(long syscall, long arg1, long arg2) +{ + long ret; + + __asm__ volatile (__syscall + : "=a" (ret) + : "0" (syscall), "D" (arg1), "S" (arg2) : __syscall_clobber ); + + return ret; +} + +static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3) +{ + long ret; + + __asm__ volatile (__syscall + : "=a" (ret) + : "0" (syscall), "D" (arg1), "S" (arg2), "d" (arg3) + : __syscall_clobber ); + + return ret; +} + +static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3, + long arg4) +{ + long ret; + + __asm__ volatile ("movq %5,%%r10 ; " __syscall + : "=a" (ret) + : "0" (syscall), "D" (arg1), "S" (arg2), "d" (arg3), + "g" (arg4) + : __syscall_clobber, "r10" ); + + return ret; +} + +static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3, + long arg4, long arg5) +{ + long ret; + + __asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; " __syscall + : "=a" (ret) + : "0" (syscall), "D" (arg1), "S" (arg2), "d" (arg3), + "g" (arg4), "g" (arg5) + : __syscall_clobber, "r10", "r8" ); + + return ret; +} + +static inline void trap_myself(void) +{ + __asm("int3"); +} + +static inline void remap_stack(long fd, unsigned long offset) +{ + __asm__ volatile ("movq %4,%%r10 ; movq %5,%%r8 ; " + "movq %6, %%r9; " __syscall "; movq %7, %%rbx ; " + "movq %%rax, (%%rbx)": + : "a" (STUB_MMAP_NR), "D" (STUB_DATA), + "S" (UM_KERN_PAGE_SIZE), + "d" (PROT_READ | PROT_WRITE), + "g" (MAP_FIXED | MAP_SHARED), "g" (fd), + "g" (offset), + "i" (&((struct stub_data *) STUB_DATA)->err) + : __syscall_clobber, "r10", "r8", "r9" ); +} + +#endif diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h new file mode 100644 index 0000000..bd9a89b --- /dev/null +++ b/arch/x86/um/shared/sysdep/syscalls.h @@ -0,0 +1,5 @@ +#ifdef __i386__ +#include "syscalls_32.h" +#else +#include "syscalls_64.h" +#endif diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h new file mode 100644 index 0000000..05cb796 --- /dev/null +++ b/arch/x86/um/shared/sysdep/syscalls_32.h @@ -0,0 +1,20 @@ +/* + * Copyright (C) 2000 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include "asm/unistd.h" +#include "sysdep/ptrace.h" + +typedef long syscall_handler_t(struct pt_regs); + +/* Not declared on x86, incompatible declarations on x86_64, so these have + * to go here rather than in sys_call_table.c + */ +extern syscall_handler_t sys_rt_sigaction; + +extern syscall_handler_t *sys_call_table[]; + +#define EXECUTE_SYSCALL(syscall, regs) \ + ((long (*)(struct syscall_args)) \ + (*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h new file mode 100644 index 0000000..8a7d5e1 --- /dev/null +++ b/arch/x86/um/shared/sysdep/syscalls_64.h @@ -0,0 +1,32 @@ +/* + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#ifndef __SYSDEP_X86_64_SYSCALLS_H__ +#define __SYSDEP_X86_64_SYSCALLS_H__ + +#include +#include + +typedef long syscall_handler_t(void); + +extern syscall_handler_t *sys_call_table[]; + +#define EXECUTE_SYSCALL(syscall, regs) \ + (((long (*)(long, long, long, long, long, long)) \ + (*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(®s->regs), \ + UPT_SYSCALL_ARG2(®s->regs), \ + UPT_SYSCALL_ARG3(®s->regs), \ + UPT_SYSCALL_ARG4(®s->regs), \ + UPT_SYSCALL_ARG5(®s->regs), \ + UPT_SYSCALL_ARG6(®s->regs))) + +extern long old_mmap(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff); +extern syscall_handler_t sys_modify_ldt; +extern syscall_handler_t sys_arch_prctl; + +#endif diff --git a/arch/x86/um/shared/sysdep/tls.h b/arch/x86/um/shared/sysdep/tls.h new file mode 100644 index 0000000..4d8f752 --- /dev/null +++ b/arch/x86/um/shared/sysdep/tls.h @@ -0,0 +1,5 @@ +#ifdef __i386__ +#include "tls_32.h" +#else +#include "tls_64.h" +#endif diff --git a/arch/x86/um/shared/sysdep/tls_32.h b/arch/x86/um/shared/sysdep/tls_32.h new file mode 100644 index 0000000..3455075 --- /dev/null +++ b/arch/x86/um/shared/sysdep/tls_32.h @@ -0,0 +1,32 @@ +#ifndef _SYSDEP_TLS_H +#define _SYSDEP_TLS_H + +# ifndef __KERNEL__ + +/* Change name to avoid conflicts with the original one from , which + * may be named user_desc (but in 2.4 and in header matching its API was named + * modify_ldt_ldt_s). */ + +typedef struct um_dup_user_desc { + unsigned int entry_number; + unsigned int base_addr; + unsigned int limit; + unsigned int seg_32bit:1; + unsigned int contents:2; + unsigned int read_exec_only:1; + unsigned int limit_in_pages:1; + unsigned int seg_not_present:1; + unsigned int useable:1; +} user_desc_t; + +# else /* __KERNEL__ */ + +# include +typedef struct user_desc user_desc_t; + +# endif /* __KERNEL__ */ + +#define GDT_ENTRY_TLS_MIN_I386 6 +#define GDT_ENTRY_TLS_MIN_X86_64 12 + +#endif /* _SYSDEP_TLS_H */ diff --git a/arch/x86/um/shared/sysdep/tls_64.h b/arch/x86/um/shared/sysdep/tls_64.h new file mode 100644 index 0000000..18c000d --- /dev/null +++ b/arch/x86/um/shared/sysdep/tls_64.h @@ -0,0 +1,29 @@ +#ifndef _SYSDEP_TLS_H +#define _SYSDEP_TLS_H + +# ifndef __KERNEL__ + +/* Change name to avoid conflicts with the original one from , which + * may be named user_desc (but in 2.4 and in header matching its API was named + * modify_ldt_ldt_s). */ + +typedef struct um_dup_user_desc { + unsigned int entry_number; + unsigned int base_addr; + unsigned int limit; + unsigned int seg_32bit:1; + unsigned int contents:2; + unsigned int read_exec_only:1; + unsigned int limit_in_pages:1; + unsigned int seg_not_present:1; + unsigned int useable:1; + unsigned int lm:1; +} user_desc_t; + +# else /* __KERNEL__ */ + +# include +typedef struct user_desc user_desc_t; + +# endif /* __KERNEL__ */ +#endif /* _SYSDEP_TLS_H */ diff --git a/arch/x86/um/signal_32.c b/arch/x86/um/signal_32.c new file mode 100644 index 0000000..bcbfb0d --- /dev/null +++ b/arch/x86/um/signal_32.c @@ -0,0 +1,498 @@ +/* + * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include "frame_kern.h" +#include "skas.h" + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) +{ + struct _fpxreg *st = NULL; + unsigned long twd = (unsigned long) fxsave->twd; + unsigned long tag; + unsigned long ret = 0xffff0000; + int i; + +#define FPREG_ADDR(f, n) ((char *)&(f)->st_space + (n) * 16) + + for (i = 0; i < 8; i++) { + if (twd & 0x1) { + st = (struct _fpxreg *) FPREG_ADDR(fxsave, i); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = 2; /* Special */ + break; + case 0x0000: + if ( !st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3] ) { + tag = 1; /* Zero */ + } else { + tag = 2; /* Special */ + } + break; + default: + if (st->significand[3] & 0x8000) { + tag = 0; /* Valid */ + } else { + tag = 2; /* Special */ + } + break; + } + } else { + tag = 3; /* Empty */ + } + ret |= (tag << (2 * i)); + twd = twd >> 1; + } + return ret; +} + +static int convert_fxsr_to_user(struct _fpstate __user *buf, + struct user_fxsr_struct *fxsave) +{ + unsigned long env[7]; + struct _fpreg __user *to; + struct _fpxreg *from; + int i; + + env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul; + env[1] = (unsigned long)fxsave->swd | 0xffff0000ul; + env[2] = twd_fxsr_to_i387(fxsave); + env[3] = fxsave->fip; + env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); + env[5] = fxsave->foo; + env[6] = fxsave->fos; + + if (__copy_to_user(buf, env, 7 * sizeof(unsigned long))) + return 1; + + to = &buf->_st[0]; + from = (struct _fpxreg *) &fxsave->st_space[0]; + for (i = 0; i < 8; i++, to++, from++) { + unsigned long __user *t = (unsigned long __user *)to; + unsigned long *f = (unsigned long *)from; + + if (__put_user(*f, t) || + __put_user(*(f + 1), t + 1) || + __put_user(from->exponent, &to->exponent)) + return 1; + } + return 0; +} + +static int convert_fxsr_from_user(struct user_fxsr_struct *fxsave, + struct _fpstate __user *buf) +{ + unsigned long env[7]; + struct _fpxreg *to; + struct _fpreg __user *from; + int i; + + if (copy_from_user( env, buf, 7 * sizeof(long))) + return 1; + + fxsave->cwd = (unsigned short)(env[0] & 0xffff); + fxsave->swd = (unsigned short)(env[1] & 0xffff); + fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); + fxsave->fip = env[3]; + fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16); + fxsave->fcs = (env[4] & 0xffff); + fxsave->foo = env[5]; + fxsave->fos = env[6]; + + to = (struct _fpxreg *) &fxsave->st_space[0]; + from = &buf->_st[0]; + for (i = 0; i < 8; i++, to++, from++) { + unsigned long *t = (unsigned long *)to; + unsigned long __user *f = (unsigned long __user *)from; + + if (__get_user(*t, f) || + __get_user(*(t + 1), f + 1) || + __get_user(to->exponent, &from->exponent)) + return 1; + } + return 0; +} + +extern int have_fpx_regs; + +static int copy_sc_from_user(struct pt_regs *regs, + struct sigcontext __user *from) +{ + struct sigcontext sc; + int err, pid; + + err = copy_from_user(&sc, from, sizeof(sc)); + if (err) + return err; + + pid = userspace_pid[current_thread_info()->cpu]; + +#define GETREG(regno, regname) regs->regs.gp[HOST_##regno] = sc.regname + + GETREG(GS, gs); + GETREG(FS, fs); + GETREG(ES, es); + GETREG(DS, ds); + GETREG(EDI, di); + GETREG(ESI, si); + GETREG(EBP, bp); + GETREG(SP, sp); + GETREG(EBX, bx); + GETREG(EDX, dx); + GETREG(ECX, cx); + GETREG(EAX, ax); + GETREG(IP, ip); + GETREG(CS, cs); + GETREG(EFLAGS, flags); + GETREG(SS, ss); + +#undef GETREG + if (have_fpx_regs) { + struct user_fxsr_struct fpx; + + err = copy_from_user(&fpx, + &((struct _fpstate __user *)sc.fpstate)->_fxsr_env[0], + sizeof(struct user_fxsr_struct)); + if (err) + return 1; + + err = convert_fxsr_from_user(&fpx, sc.fpstate); + if (err) + return 1; + + err = restore_fpx_registers(pid, (unsigned long *) &fpx); + if (err < 0) { + printk(KERN_ERR "copy_sc_from_user - " + "restore_fpx_registers failed, errno = %d\n", + -err); + return 1; + } + } else { + struct user_i387_struct fp; + + err = copy_from_user(&fp, sc.fpstate, + sizeof(struct user_i387_struct)); + if (err) + return 1; + + err = restore_fp_registers(pid, (unsigned long *) &fp); + if (err < 0) { + printk(KERN_ERR "copy_sc_from_user - " + "restore_fp_registers failed, errno = %d\n", + -err); + return 1; + } + } + + return 0; +} + +static int copy_sc_to_user(struct sigcontext __user *to, + struct _fpstate __user *to_fp, struct pt_regs *regs, + unsigned long sp) +{ + struct sigcontext sc; + struct faultinfo * fi = ¤t->thread.arch.faultinfo; + int err, pid; + memset(&sc, 0, sizeof(struct sigcontext)); + + sc.gs = REGS_GS(regs->regs.gp); + sc.fs = REGS_FS(regs->regs.gp); + sc.es = REGS_ES(regs->regs.gp); + sc.ds = REGS_DS(regs->regs.gp); + sc.di = REGS_EDI(regs->regs.gp); + sc.si = REGS_ESI(regs->regs.gp); + sc.bp = REGS_EBP(regs->regs.gp); + sc.sp = sp; + sc.bx = REGS_EBX(regs->regs.gp); + sc.dx = REGS_EDX(regs->regs.gp); + sc.cx = REGS_ECX(regs->regs.gp); + sc.ax = REGS_EAX(regs->regs.gp); + sc.ip = REGS_IP(regs->regs.gp); + sc.cs = REGS_CS(regs->regs.gp); + sc.flags = REGS_EFLAGS(regs->regs.gp); + sc.sp_at_signal = regs->regs.gp[UESP]; + sc.ss = regs->regs.gp[SS]; + sc.cr2 = fi->cr2; + sc.err = fi->error_code; + sc.trapno = fi->trap_no; + + to_fp = (to_fp ? to_fp : (struct _fpstate __user *) (to + 1)); + sc.fpstate = to_fp; + + pid = userspace_pid[current_thread_info()->cpu]; + if (have_fpx_regs) { + struct user_fxsr_struct fpx; + + err = save_fpx_registers(pid, (unsigned long *) &fpx); + if (err < 0){ + printk(KERN_ERR "copy_sc_to_user - save_fpx_registers " + "failed, errno = %d\n", err); + return 1; + } + + err = convert_fxsr_to_user(to_fp, &fpx); + if (err) + return 1; + + err |= __put_user(fpx.swd, &to_fp->status); + err |= __put_user(X86_FXSR_MAGIC, &to_fp->magic); + if (err) + return 1; + + if (copy_to_user(&to_fp->_fxsr_env[0], &fpx, + sizeof(struct user_fxsr_struct))) + return 1; + } + else { + struct user_i387_struct fp; + + err = save_fp_registers(pid, (unsigned long *) &fp); + if (copy_to_user(to_fp, &fp, sizeof(struct user_i387_struct))) + return 1; + } + + return copy_to_user(to, &sc, sizeof(sc)); +} + +static int copy_ucontext_to_user(struct ucontext __user *uc, + struct _fpstate __user *fp, sigset_t *set, + unsigned long sp) +{ + int err = 0; + + err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp); + err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags); + err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size); + err |= copy_sc_to_user(&uc->uc_mcontext, fp, ¤t->thread.regs, sp); + err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set)); + return err; +} + +struct sigframe +{ + char __user *pretcode; + int sig; + struct sigcontext sc; + struct _fpstate fpstate; + unsigned long extramask[_NSIG_WORDS-1]; + char retcode[8]; +}; + +struct rt_sigframe +{ + char __user *pretcode; + int sig; + struct siginfo __user *pinfo; + void __user *puc; + struct siginfo info; + struct ucontext uc; + struct _fpstate fpstate; + char retcode[8]; +}; + +int setup_signal_stack_sc(unsigned long stack_top, int sig, + struct k_sigaction *ka, struct pt_regs *regs, + sigset_t *mask) +{ + struct sigframe __user *frame; + void __user *restorer; + unsigned long save_sp = PT_REGS_SP(regs); + int err = 0; + + /* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */ + stack_top = ((stack_top + 4) & -16UL) - 4; + frame = (struct sigframe __user *) stack_top - 1; + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return 1; + + restorer = frame->retcode; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + + /* Update SP now because the page fault handler refuses to extend + * the stack if the faulting address is too far below the current + * SP, which frame now certainly is. If there's an error, the original + * value is restored on the way out. + * When writing the sigcontext to the stack, we have to write the + * original value, so that's passed to copy_sc_to_user, which does + * the right thing with it. + */ + PT_REGS_SP(regs) = (unsigned long) frame; + + err |= __put_user(restorer, &frame->pretcode); + err |= __put_user(sig, &frame->sig); + err |= copy_sc_to_user(&frame->sc, NULL, regs, save_sp); + err |= __put_user(mask->sig[0], &frame->sc.oldmask); + if (_NSIG_WORDS > 1) + err |= __copy_to_user(&frame->extramask, &mask->sig[1], + sizeof(frame->extramask)); + + /* + * This is popl %eax ; movl $,%eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); + err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); + err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); + + if (err) + goto err; + + PT_REGS_SP(regs) = (unsigned long) frame; + PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_EAX(regs) = (unsigned long) sig; + PT_REGS_EDX(regs) = (unsigned long) 0; + PT_REGS_ECX(regs) = (unsigned long) 0; + + if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) + ptrace_notify(SIGTRAP); + return 0; + +err: + PT_REGS_SP(regs) = save_sp; + return err; +} + +int setup_signal_stack_si(unsigned long stack_top, int sig, + struct k_sigaction *ka, struct pt_regs *regs, + siginfo_t *info, sigset_t *mask) +{ + struct rt_sigframe __user *frame; + void __user *restorer; + unsigned long save_sp = PT_REGS_SP(regs); + int err = 0; + + stack_top &= -8UL; + frame = (struct rt_sigframe __user *) stack_top - 1; + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return 1; + + restorer = frame->retcode; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + + /* See comment above about why this is here */ + PT_REGS_SP(regs) = (unsigned long) frame; + + err |= __put_user(restorer, &frame->pretcode); + err |= __put_user(sig, &frame->sig); + err |= __put_user(&frame->info, &frame->pinfo); + err |= __put_user(&frame->uc, &frame->puc); + err |= copy_siginfo_to_user(&frame->info, info); + err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask, + save_sp); + + /* + * This is movl $,%eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); + err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); + err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); + + if (err) + goto err; + + PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_EAX(regs) = (unsigned long) sig; + PT_REGS_EDX(regs) = (unsigned long) &frame->info; + PT_REGS_ECX(regs) = (unsigned long) &frame->uc; + + if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) + ptrace_notify(SIGTRAP); + return 0; + +err: + PT_REGS_SP(regs) = save_sp; + return err; +} + +long sys_sigreturn(struct pt_regs regs) +{ + unsigned long sp = PT_REGS_SP(¤t->thread.regs); + struct sigframe __user *frame = (struct sigframe __user *)(sp - 8); + sigset_t set; + struct sigcontext __user *sc = &frame->sc; + unsigned long __user *oldmask = &sc->oldmask; + unsigned long __user *extramask = frame->extramask; + int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); + + if (copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) || + copy_from_user(&set.sig[1], extramask, sig_size)) + goto segfault; + + sigdelsetmask(&set, ~_BLOCKABLE); + set_current_blocked(&set); + + if (copy_sc_from_user(¤t->thread.regs, sc)) + goto segfault; + + /* Avoid ERESTART handling */ + PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; + return PT_REGS_SYSCALL_RET(¤t->thread.regs); + + segfault: + force_sig(SIGSEGV, current); + return 0; +} + +long sys_rt_sigreturn(struct pt_regs regs) +{ + unsigned long sp = PT_REGS_SP(¤t->thread.regs); + struct rt_sigframe __user *frame = + (struct rt_sigframe __user *) (sp - 4); + sigset_t set; + struct ucontext __user *uc = &frame->uc; + int sig_size = _NSIG_WORDS * sizeof(unsigned long); + + if (copy_from_user(&set, &uc->uc_sigmask, sig_size)) + goto segfault; + + sigdelsetmask(&set, ~_BLOCKABLE); + set_current_blocked(&set); + + if (copy_sc_from_user(¤t->thread.regs, &uc->uc_mcontext)) + goto segfault; + + /* Avoid ERESTART handling */ + PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; + return PT_REGS_SYSCALL_RET(¤t->thread.regs); + + segfault: + force_sig(SIGSEGV, current); + return 0; +} diff --git a/arch/x86/um/signal_64.c b/arch/x86/um/signal_64.c new file mode 100644 index 0000000..255b2ca --- /dev/null +++ b/arch/x86/um/signal_64.c @@ -0,0 +1,255 @@ +/* + * Copyright (C) 2003 PathScale, Inc. + * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include "frame_kern.h" +#include "skas.h" + +static int copy_sc_from_user(struct pt_regs *regs, + struct sigcontext __user *from) +{ + struct sigcontext sc; + struct user_i387_struct fp; + void __user *buf; + int err; + + err = copy_from_user(&sc, from, sizeof(sc)); + if (err) + return err; + +#define GETREG(regno, regname) regs->regs.gp[HOST_##regno] = sc.regname + + GETREG(R8, r8); + GETREG(R9, r9); + GETREG(R10, r10); + GETREG(R11, r11); + GETREG(R12, r12); + GETREG(R13, r13); + GETREG(R14, r14); + GETREG(R15, r15); + GETREG(RDI, di); + GETREG(RSI, si); + GETREG(RBP, bp); + GETREG(RBX, bx); + GETREG(RDX, dx); + GETREG(RAX, ax); + GETREG(RCX, cx); + GETREG(SP, sp); + GETREG(IP, ip); + GETREG(EFLAGS, flags); + GETREG(CS, cs); +#undef GETREG + + buf = sc.fpstate; + + err = copy_from_user(&fp, buf, sizeof(struct user_i387_struct)); + if (err) + return 1; + + err = restore_fp_registers(userspace_pid[current_thread_info()->cpu], + (unsigned long *) &fp); + if (err < 0) { + printk(KERN_ERR "copy_sc_from_user - " + "restore_fp_registers failed, errno = %d\n", + -err); + return 1; + } + + return 0; +} + +static int copy_sc_to_user(struct sigcontext __user *to, + struct _fpstate __user *to_fp, struct pt_regs *regs, + unsigned long mask, unsigned long sp) +{ + struct faultinfo * fi = ¤t->thread.arch.faultinfo; + struct sigcontext sc; + struct user_i387_struct fp; + int err = 0; + memset(&sc, 0, sizeof(struct sigcontext)); + +#define PUTREG(regno, regname) sc.regname = regs->regs.gp[HOST_##regno] + + PUTREG(RDI, di); + PUTREG(RSI, si); + PUTREG(RBP, bp); + /* + * Must use original RSP, which is passed in, rather than what's in + * signal frame. + */ + sc.sp = sp; + PUTREG(RBX, bx); + PUTREG(RDX, dx); + PUTREG(RCX, cx); + PUTREG(RAX, ax); + PUTREG(R8, r8); + PUTREG(R9, r9); + PUTREG(R10, r10); + PUTREG(R11, r11); + PUTREG(R12, r12); + PUTREG(R13, r13); + PUTREG(R14, r14); + PUTREG(R15, r15); + PUTREG(CS, cs); /* XXX x86_64 doesn't do this */ + + sc.cr2 = fi->cr2; + sc.err = fi->error_code; + sc.trapno = fi->trap_no; + + PUTREG(IP, ip); + PUTREG(EFLAGS, flags); +#undef PUTREG + + sc.oldmask = mask; + + err = copy_to_user(to, &sc, sizeof(struct sigcontext)); + if (err) + return 1; + + err = save_fp_registers(userspace_pid[current_thread_info()->cpu], + (unsigned long *) &fp); + if (err < 0) { + printk(KERN_ERR "copy_sc_from_user - restore_fp_registers " + "failed, errno = %d\n", -err); + return 1; + } + + if (copy_to_user(to_fp, &fp, sizeof(struct user_i387_struct))) + return 1; + + return err; +} + +struct rt_sigframe +{ + char __user *pretcode; + struct ucontext uc; + struct siginfo info; + struct _fpstate fpstate; +}; + +int setup_signal_stack_si(unsigned long stack_top, int sig, + struct k_sigaction *ka, struct pt_regs * regs, + siginfo_t *info, sigset_t *set) +{ + struct rt_sigframe __user *frame; + unsigned long save_sp = PT_REGS_RSP(regs); + int err = 0; + struct task_struct *me = current; + + frame = (struct rt_sigframe __user *) + round_down(stack_top - sizeof(struct rt_sigframe), 16); + /* Subtract 128 for a red zone and 8 for proper alignment */ + frame = (struct rt_sigframe __user *) ((unsigned long) frame - 128 - 8); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + goto out; + + if (ka->sa.sa_flags & SA_SIGINFO) { + err |= copy_siginfo_to_user(&frame->info, info); + if (err) + goto out; + } + + /* + * Update SP now because the page fault handler refuses to extend + * the stack if the faulting address is too far below the current + * SP, which frame now certainly is. If there's an error, the original + * value is restored on the way out. + * When writing the sigcontext to the stack, we have to write the + * original value, so that's passed to copy_sc_to_user, which does + * the right thing with it. + */ + PT_REGS_RSP(regs) = (unsigned long) frame; + + /* Create the ucontext. */ + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(save_sp), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, + set->sig[0], save_sp); + err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); + if (sizeof(*set) == 16) { + __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); + __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); + } + else + err |= __copy_to_user(&frame->uc.uc_sigmask, set, + sizeof(*set)); + + /* + * Set up to return from userspace. If provided, use a stub + * already in userspace. + */ + /* x86-64 should always use SA_RESTORER. */ + if (ka->sa.sa_flags & SA_RESTORER) + err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); + else + /* could use a vstub here */ + goto restore_sp; + + if (err) + goto restore_sp; + + /* Set up registers for signal handler */ + { + struct exec_domain *ed = current_thread_info()->exec_domain; + if (unlikely(ed && ed->signal_invmap && sig < 32)) + sig = ed->signal_invmap[sig]; + } + + PT_REGS_RDI(regs) = sig; + /* In case the signal handler was declared without prototypes */ + PT_REGS_RAX(regs) = 0; + + /* + * This also works for non SA_SIGINFO handlers because they expect the + * next argument after the signal number on the stack. + */ + PT_REGS_RSI(regs) = (unsigned long) &frame->info; + PT_REGS_RDX(regs) = (unsigned long) &frame->uc; + PT_REGS_RIP(regs) = (unsigned long) ka->sa.sa_handler; + out: + return err; + +restore_sp: + PT_REGS_RSP(regs) = save_sp; + return err; +} + +long sys_rt_sigreturn(struct pt_regs *regs) +{ + unsigned long sp = PT_REGS_SP(¤t->thread.regs); + struct rt_sigframe __user *frame = + (struct rt_sigframe __user *)(sp - 8); + struct ucontext __user *uc = &frame->uc; + sigset_t set; + + if (copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) + goto segfault; + + sigdelsetmask(&set, ~_BLOCKABLE); + set_current_blocked(&set); + + if (copy_sc_from_user(¤t->thread.regs, &uc->uc_mcontext)) + goto segfault; + + /* Avoid ERESTART handling */ + PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; + return PT_REGS_SYSCALL_RET(¤t->thread.regs); + + segfault: + force_sig(SIGSEGV, current); + return 0; +} diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S new file mode 100644 index 0000000..54a36ec --- /dev/null +++ b/arch/x86/um/stub_32.S @@ -0,0 +1,51 @@ +#include "as-layout.h" + + .globl syscall_stub +.section .__syscall_stub, "ax" + + .globl batch_syscall_stub +batch_syscall_stub: + /* load pointer to first operation */ + mov $(STUB_DATA+8), %esp + +again: + /* load length of additional data */ + mov 0x0(%esp), %eax + + /* if(length == 0) : end of list */ + /* write possible 0 to header */ + mov %eax, STUB_DATA+4 + cmpl $0, %eax + jz done + + /* save current pointer */ + mov %esp, STUB_DATA+4 + + /* skip additional data */ + add %eax, %esp + + /* load syscall-# */ + pop %eax + + /* load syscall params */ + pop %ebx + pop %ecx + pop %edx + pop %esi + pop %edi + pop %ebp + + /* execute syscall */ + int $0x80 + + /* check return value */ + pop %ebx + cmp %ebx, %eax + je again + +done: + /* save return value */ + mov %eax, STUB_DATA + + /* stop */ + int3 diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S new file mode 100644 index 0000000..20e4a96 --- /dev/null +++ b/arch/x86/um/stub_64.S @@ -0,0 +1,66 @@ +#include "as-layout.h" + + .globl syscall_stub +.section .__syscall_stub, "ax" +syscall_stub: + syscall + /* We don't have 64-bit constants, so this constructs the address + * we need. + */ + movq $(STUB_DATA >> 32), %rbx + salq $32, %rbx + movq $(STUB_DATA & 0xffffffff), %rcx + or %rcx, %rbx + movq %rax, (%rbx) + int3 + + .globl batch_syscall_stub +batch_syscall_stub: + mov $(STUB_DATA >> 32), %rbx + sal $32, %rbx + mov $(STUB_DATA & 0xffffffff), %rax + or %rax, %rbx + /* load pointer to first operation */ + mov %rbx, %rsp + add $0x10, %rsp +again: + /* load length of additional data */ + mov 0x0(%rsp), %rax + + /* if(length == 0) : end of list */ + /* write possible 0 to header */ + mov %rax, 8(%rbx) + cmp $0, %rax + jz done + + /* save current pointer */ + mov %rsp, 8(%rbx) + + /* skip additional data */ + add %rax, %rsp + + /* load syscall-# */ + pop %rax + + /* load syscall params */ + pop %rdi + pop %rsi + pop %rdx + pop %r10 + pop %r8 + pop %r9 + + /* execute syscall */ + syscall + + /* check return value */ + pop %rcx + cmp %rcx, %rax + je again + +done: + /* save return value */ + mov %rax, (%rbx) + + /* stop */ + int3 diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c new file mode 100644 index 0000000..b7450bd --- /dev/null +++ b/arch/x86/um/stub_segv.c @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include "sysdep/stub.h" +#include "sysdep/faultinfo.h" +#include "sysdep/mcontext.h" + +void __attribute__ ((__section__ (".__syscall_stub"))) +stub_segv_handler(int sig, siginfo_t *info, void *p) +{ + struct ucontext *uc = p; + + GET_FAULTINFO_FROM_MC(*((struct faultinfo *) STUB_DATA), + &uc->uc_mcontext); + trap_myself(); +} + diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S new file mode 100644 index 0000000..de274071 --- /dev/null +++ b/arch/x86/um/sys_call_table_32.S @@ -0,0 +1,28 @@ +#include +/* Steal i386 syscall table for our purposes, but with some slight changes.*/ + +#define sys_iopl sys_ni_syscall +#define sys_ioperm sys_ni_syscall + +#define sys_vm86old sys_ni_syscall +#define sys_vm86 sys_ni_syscall + +#define old_mmap sys_old_mmap + +#define ptregs_fork sys_fork +#define ptregs_execve sys_execve +#define ptregs_iopl sys_iopl +#define ptregs_vm86old sys_vm86old +#define ptregs_sigreturn sys_sigreturn +#define ptregs_clone sys_clone +#define ptregs_vm86 sys_vm86 +#define ptregs_rt_sigreturn sys_rt_sigreturn +#define ptregs_sigaltstack sys_sigaltstack +#define ptregs_vfork sys_vfork + +.section .rodata,"a" + +#include "../../x86/kernel/syscall_table_32.S" + +ENTRY(syscall_table_size) +.long .-sys_call_table diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c new file mode 100644 index 0000000..f46de82 --- /dev/null +++ b/arch/x86/um/sys_call_table_64.c @@ -0,0 +1,64 @@ +/* + * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c + * with some changes for UML. + */ + +#include +#include +#include + +#define __NO_STUBS + +/* + * Below you can see, in terms of #define's, the differences between the x86-64 + * and the UML syscall table. + */ + +/* Not going to be implemented by UML, since we have no hardware. */ +#define stub_iopl sys_ni_syscall +#define sys_ioperm sys_ni_syscall + +/* + * The UML TLS problem. Note that x86_64 does not implement this, so the below + * is needed only for the ia32 compatibility. + */ + +/* On UML we call it this way ("old" means it's not mmap2) */ +#define sys_mmap old_mmap + +#define stub_clone sys_clone +#define stub_fork sys_fork +#define stub_vfork sys_vfork +#define stub_execve sys_execve +#define stub_rt_sigsuspend sys_rt_sigsuspend +#define stub_sigaltstack sys_sigaltstack +#define stub_rt_sigreturn sys_rt_sigreturn + +#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; +#undef _ASM_X86_UNISTD_64_H +#include "../../x86/include/asm/unistd_64.h" + +#undef __SYSCALL +#define __SYSCALL(nr, sym) [ nr ] = sym, +#undef _ASM_X86_UNISTD_64_H + +typedef void (*sys_call_ptr_t)(void); + +extern void sys_ni_syscall(void); + +/* + * We used to have a trick here which made sure that holes in the + * x86_64 table were filled in with sys_ni_syscall, but a comment in + * unistd_64.h says that holes aren't allowed, so the trick was + * removed. + * The trick looked like this + * [0 ... UM_NR_syscall_max] = &sys_ni_syscall + * before including unistd_64.h - the later initializations overwrote + * the sys_ni_syscall filler. + */ + +sys_call_ptr_t sys_call_table[] __cacheline_aligned = { +#include "../../x86/include/asm/unistd_64.h" +}; + +int syscall_table_size = sizeof(sys_call_table); diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c new file mode 100644 index 0000000..70ca357 --- /dev/null +++ b/arch/x86/um/syscalls_32.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#include "linux/sched.h" +#include "linux/shm.h" +#include "linux/ipc.h" +#include "linux/syscalls.h" +#include "asm/mman.h" +#include "asm/uaccess.h" +#include "asm/unistd.h" + +/* + * The prototype on i386 is: + * + * int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls, int * child_tidptr) + * + * and the "newtls" arg. on i386 is read by copy_thread directly from the + * register saved on the stack. + */ +long sys_clone(unsigned long clone_flags, unsigned long newsp, + int __user *parent_tid, void *newtls, int __user *child_tid) +{ + long ret; + + if (!newsp) + newsp = UPT_SP(¤t->thread.regs.regs); + + current->thread.forking = 1; + ret = do_fork(clone_flags, newsp, ¤t->thread.regs, 0, parent_tid, + child_tid); + current->thread.forking = 0; + return ret; +} + +long sys_sigaction(int sig, const struct old_sigaction __user *act, + struct old_sigaction __user *oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + + if (act) { + old_sigset_t mask; + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(new_ka.sa.sa_handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) + return -EFAULT; + __get_user(new_ka.sa.sa_flags, &act->sa_flags); + __get_user(mask, &act->sa_mask); + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) + return -EFAULT; + __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); + } + + return ret; +} diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c new file mode 100644 index 0000000..f3d82bb --- /dev/null +++ b/arch/x86/um/syscalls_64.c @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#include "linux/linkage.h" +#include "linux/personality.h" +#include "linux/utsname.h" +#include "asm/prctl.h" /* XXX This should get the constants from libc */ +#include "asm/uaccess.h" +#include "os.h" + +long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) +{ + unsigned long *ptr = addr, tmp; + long ret; + int pid = task->mm->context.id.u.pid; + + /* + * With ARCH_SET_FS (and ARCH_SET_GS is treated similarly to + * be safe), we need to call arch_prctl on the host because + * setting %fs may result in something else happening (like a + * GDT or thread.fs being set instead). So, we let the host + * fiddle the registers and thread struct and restore the + * registers afterwards. + * + * So, the saved registers are stored to the process (this + * needed because a stub may have been the last thing to run), + * arch_prctl is run on the host, then the registers are read + * back. + */ + switch (code) { + case ARCH_SET_FS: + case ARCH_SET_GS: + ret = restore_registers(pid, ¤t->thread.regs.regs); + if (ret) + return ret; + break; + case ARCH_GET_FS: + case ARCH_GET_GS: + /* + * With these two, we read to a local pointer and + * put_user it to the userspace pointer that we were + * given. If addr isn't valid (because it hasn't been + * faulted in or is just bogus), we want put_user to + * fault it in (or return -EFAULT) instead of having + * the host return -EFAULT. + */ + ptr = &tmp; + } + + ret = os_arch_prctl(pid, code, ptr); + if (ret) + return ret; + + switch (code) { + case ARCH_SET_FS: + current->thread.arch.fs = (unsigned long) ptr; + ret = save_registers(pid, ¤t->thread.regs.regs); + break; + case ARCH_SET_GS: + ret = save_registers(pid, ¤t->thread.regs.regs); + break; + case ARCH_GET_FS: + ret = put_user(tmp, addr); + break; + case ARCH_GET_GS: + ret = put_user(tmp, addr); + break; + } + + return ret; +} + +long sys_arch_prctl(int code, unsigned long addr) +{ + return arch_prctl(current, code, (unsigned long __user *) addr); +} + +long sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid) +{ + long ret; + + if (!newsp) + newsp = UPT_SP(¤t->thread.regs.regs); + current->thread.forking = 1; + ret = do_fork(clone_flags, newsp, ¤t->thread.regs, 0, parent_tid, + child_tid); + current->thread.forking = 0; + return ret; +} + +void arch_switch_to(struct task_struct *to) +{ + if ((to->thread.arch.fs == 0) || (to->mm == NULL)) + return; + + arch_prctl(to, ARCH_SET_FS, (void __user *) to->thread.arch.fs); +} diff --git a/arch/x86/um/sysrq_32.c b/arch/x86/um/sysrq_32.c new file mode 100644 index 0000000..171b3e9 --- /dev/null +++ b/arch/x86/um/sysrq_32.c @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2001 - 2003 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#include "linux/kernel.h" +#include "linux/smp.h" +#include "linux/sched.h" +#include "linux/kallsyms.h" +#include "asm/ptrace.h" +#include "sysrq.h" + +/* This is declared by */ +void show_regs(struct pt_regs *regs) +{ + printk("\n"); + printk("EIP: %04lx:[<%08lx>] CPU: %d %s", + 0xffff & PT_REGS_CS(regs), PT_REGS_IP(regs), + smp_processor_id(), print_tainted()); + if (PT_REGS_CS(regs) & 3) + printk(" ESP: %04lx:%08lx", 0xffff & PT_REGS_SS(regs), + PT_REGS_SP(regs)); + printk(" EFLAGS: %08lx\n %s\n", PT_REGS_EFLAGS(regs), + print_tainted()); + printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + PT_REGS_EAX(regs), PT_REGS_EBX(regs), + PT_REGS_ECX(regs), + PT_REGS_EDX(regs)); + printk("ESI: %08lx EDI: %08lx EBP: %08lx", + PT_REGS_ESI(regs), PT_REGS_EDI(regs), + PT_REGS_EBP(regs)); + printk(" DS: %04lx ES: %04lx\n", + 0xffff & PT_REGS_DS(regs), + 0xffff & PT_REGS_ES(regs)); + + show_trace(NULL, (unsigned long *) ®s); +} + +/* Copied from i386. */ +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +{ + return p > (void *)tinfo && + p < (void *)tinfo + THREAD_SIZE - 3; +} + +/* Adapted from i386 (we also print the address we read from). */ +static inline unsigned long print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long ebp) +{ + unsigned long addr; + +#ifdef CONFIG_FRAME_POINTER + while (valid_stack_ptr(tinfo, (void *)ebp)) { + addr = *(unsigned long *)(ebp + 4); + printk("%08lx: [<%08lx>]", ebp + 4, addr); + print_symbol(" %s", addr); + printk("\n"); + ebp = *(unsigned long *)ebp; + } +#else + while (valid_stack_ptr(tinfo, stack)) { + addr = *stack; + if (__kernel_text_address(addr)) { + printk("%08lx: [<%08lx>]", (unsigned long) stack, addr); + print_symbol(" %s", addr); + printk("\n"); + } + stack++; + } +#endif + return ebp; +} + +void show_trace(struct task_struct* task, unsigned long * stack) +{ + unsigned long ebp; + struct thread_info *context; + + /* Turn this into BUG_ON if possible. */ + if (!stack) { + stack = (unsigned long*) &stack; + printk("show_trace: got NULL stack, implicit assumption task == current"); + WARN_ON(1); + } + + if (!task) + task = current; + + if (task != current) { + ebp = (unsigned long) KSTK_EBP(task); + } else { + asm ("movl %%ebp, %0" : "=r" (ebp) : ); + } + + context = (struct thread_info *) + ((unsigned long)stack & (~(THREAD_SIZE - 1))); + print_context_stack(context, stack, ebp); + + printk("\n"); +} + diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c new file mode 100644 index 0000000..f4f82be --- /dev/null +++ b/arch/x86/um/sysrq_64.c @@ -0,0 +1,41 @@ +/* + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include "sysrq.h" + +void __show_regs(struct pt_regs *regs) +{ + printk("\n"); + print_modules(); + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current), + current->comm, print_tainted(), init_utsname()->release); + printk(KERN_INFO "RIP: %04lx:[<%016lx>]\n", PT_REGS_CS(regs) & 0xffff, + PT_REGS_RIP(regs)); + printk(KERN_INFO "RSP: %016lx EFLAGS: %08lx\n", PT_REGS_RSP(regs), + PT_REGS_EFLAGS(regs)); + printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", + PT_REGS_RAX(regs), PT_REGS_RBX(regs), PT_REGS_RCX(regs)); + printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", + PT_REGS_RDX(regs), PT_REGS_RSI(regs), PT_REGS_RDI(regs)); + printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", + PT_REGS_RBP(regs), PT_REGS_R8(regs), PT_REGS_R9(regs)); + printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + PT_REGS_R10(regs), PT_REGS_R11(regs), PT_REGS_R12(regs)); + printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + PT_REGS_R13(regs), PT_REGS_R14(regs), PT_REGS_R15(regs)); +} + +void show_regs(struct pt_regs *regs) +{ + __show_regs(regs); + show_trace(current, (unsigned long *) ®s); +} diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c new file mode 100644 index 0000000..c6c7131 --- /dev/null +++ b/arch/x86/um/tls_32.c @@ -0,0 +1,396 @@ +/* + * Copyright (C) 2005 Paolo 'Blaisorblade' Giarrusso + * Licensed under the GPL + */ + +#include "linux/percpu.h" +#include "linux/sched.h" +#include "asm/uaccess.h" +#include "os.h" +#include "skas.h" +#include "sysdep/tls.h" + +/* + * If needed we can detect when it's uninitialized. + * + * These are initialized in an initcall and unchanged thereafter. + */ +static int host_supports_tls = -1; +int host_gdt_entry_tls_min; + +int do_set_thread_area(struct user_desc *info) +{ + int ret; + u32 cpu; + + cpu = get_cpu(); + ret = os_set_thread_area(info, userspace_pid[cpu]); + put_cpu(); + + if (ret) + printk(KERN_ERR "PTRACE_SET_THREAD_AREA failed, err = %d, " + "index = %d\n", ret, info->entry_number); + + return ret; +} + +int do_get_thread_area(struct user_desc *info) +{ + int ret; + u32 cpu; + + cpu = get_cpu(); + ret = os_get_thread_area(info, userspace_pid[cpu]); + put_cpu(); + + if (ret) + printk(KERN_ERR "PTRACE_GET_THREAD_AREA failed, err = %d, " + "index = %d\n", ret, info->entry_number); + + return ret; +} + +/* + * sys_get_thread_area: get a yet unused TLS descriptor index. + * XXX: Consider leaving one free slot for glibc usage at first place. This must + * be done here (and by changing GDT_ENTRY_TLS_* macros) and nowhere else. + * + * Also, this must be tested when compiling in SKAS mode with dynamic linking + * and running against NPTL. + */ +static int get_free_idx(struct task_struct* task) +{ + struct thread_struct *t = &task->thread; + int idx; + + if (!t->arch.tls_array) + return GDT_ENTRY_TLS_MIN; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (!t->arch.tls_array[idx].present) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +static inline void clear_user_desc(struct user_desc* info) +{ + /* Postcondition: LDT_empty(info) returns true. */ + memset(info, 0, sizeof(*info)); + + /* + * Check the LDT_empty or the i386 sys_get_thread_area code - we obtain + * indeed an empty user_desc. + */ + info->read_exec_only = 1; + info->seg_not_present = 1; +} + +#define O_FORCE 1 + +static int load_TLS(int flags, struct task_struct *to) +{ + int ret = 0; + int idx; + + for (idx = GDT_ENTRY_TLS_MIN; idx < GDT_ENTRY_TLS_MAX; idx++) { + struct uml_tls_struct* curr = + &to->thread.arch.tls_array[idx - GDT_ENTRY_TLS_MIN]; + + /* + * Actually, now if it wasn't flushed it gets cleared and + * flushed to the host, which will clear it. + */ + if (!curr->present) { + if (!curr->flushed) { + clear_user_desc(&curr->tls); + curr->tls.entry_number = idx; + } else { + WARN_ON(!LDT_empty(&curr->tls)); + continue; + } + } + + if (!(flags & O_FORCE) && curr->flushed) + continue; + + ret = do_set_thread_area(&curr->tls); + if (ret) + goto out; + + curr->flushed = 1; + } +out: + return ret; +} + +/* + * Verify if we need to do a flush for the new process, i.e. if there are any + * present desc's, only if they haven't been flushed. + */ +static inline int needs_TLS_update(struct task_struct *task) +{ + int i; + int ret = 0; + + for (i = GDT_ENTRY_TLS_MIN; i < GDT_ENTRY_TLS_MAX; i++) { + struct uml_tls_struct* curr = + &task->thread.arch.tls_array[i - GDT_ENTRY_TLS_MIN]; + + /* + * Can't test curr->present, we may need to clear a descriptor + * which had a value. + */ + if (curr->flushed) + continue; + ret = 1; + break; + } + return ret; +} + +/* + * On a newly forked process, the TLS descriptors haven't yet been flushed. So + * we mark them as such and the first switch_to will do the job. + */ +void clear_flushed_tls(struct task_struct *task) +{ + int i; + + for (i = GDT_ENTRY_TLS_MIN; i < GDT_ENTRY_TLS_MAX; i++) { + struct uml_tls_struct* curr = + &task->thread.arch.tls_array[i - GDT_ENTRY_TLS_MIN]; + + /* + * Still correct to do this, if it wasn't present on the host it + * will remain as flushed as it was. + */ + if (!curr->present) + continue; + + curr->flushed = 0; + } +} + +/* + * In SKAS0 mode, currently, multiple guest threads sharing the same ->mm have a + * common host process. So this is needed in SKAS0 too. + * + * However, if each thread had a different host process (and this was discussed + * for SMP support) this won't be needed. + * + * And this will not need be used when (and if) we'll add support to the host + * SKAS patch. + */ + +int arch_switch_tls(struct task_struct *to) +{ + if (!host_supports_tls) + return 0; + + /* + * We have no need whatsoever to switch TLS for kernel threads; beyond + * that, that would also result in us calling os_set_thread_area with + * userspace_pid[cpu] == 0, which gives an error. + */ + if (likely(to->mm)) + return load_TLS(O_FORCE, to); + + return 0; +} + +static int set_tls_entry(struct task_struct* task, struct user_desc *info, + int idx, int flushed) +{ + struct thread_struct *t = &task->thread; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].tls = *info; + t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].present = 1; + t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].flushed = flushed; + + return 0; +} + +int arch_copy_tls(struct task_struct *new) +{ + struct user_desc info; + int idx, ret = -EFAULT; + + if (copy_from_user(&info, + (void __user *) UPT_ESI(&new->thread.regs.regs), + sizeof(info))) + goto out; + + ret = -EINVAL; + if (LDT_empty(&info)) + goto out; + + idx = info.entry_number; + + ret = set_tls_entry(new, &info, idx, 0); +out: + return ret; +} + +/* XXX: use do_get_thread_area to read the host value? I'm not at all sure! */ +static int get_tls_entry(struct task_struct *task, struct user_desc *info, + int idx) +{ + struct thread_struct *t = &task->thread; + + if (!t->arch.tls_array) + goto clear; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + if (!t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].present) + goto clear; + + *info = t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].tls; + +out: + /* + * Temporary debugging check, to make sure that things have been + * flushed. This could be triggered if load_TLS() failed. + */ + if (unlikely(task == current && + !t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].flushed)) { + printk(KERN_ERR "get_tls_entry: task with pid %d got here " + "without flushed TLS.", current->pid); + } + + return 0; +clear: + /* + * When the TLS entry has not been set, the values read to user in the + * tls_array are 0 (because it's cleared at boot, see + * arch/i386/kernel/head.S:cpu_gdt_table). Emulate that. + */ + clear_user_desc(info); + info->entry_number = idx; + goto out; +} + +int sys_set_thread_area(struct user_desc __user *user_desc) +{ + struct user_desc info; + int idx, ret; + + if (!host_supports_tls) + return -ENOSYS; + + if (copy_from_user(&info, user_desc, sizeof(info))) + return -EFAULT; + + idx = info.entry_number; + + if (idx == -1) { + idx = get_free_idx(current); + if (idx < 0) + return idx; + info.entry_number = idx; + /* Tell the user which slot we chose for him.*/ + if (put_user(idx, &user_desc->entry_number)) + return -EFAULT; + } + + ret = do_set_thread_area(&info); + if (ret) + return ret; + return set_tls_entry(current, &info, idx, 1); +} + +/* + * Perform set_thread_area on behalf of the traced child. + * Note: error handling is not done on the deferred load, and this differ from + * i386. However the only possible error are caused by bugs. + */ +int ptrace_set_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc) +{ + struct user_desc info; + + if (!host_supports_tls) + return -EIO; + + if (copy_from_user(&info, user_desc, sizeof(info))) + return -EFAULT; + + return set_tls_entry(child, &info, idx, 0); +} + +int sys_get_thread_area(struct user_desc __user *user_desc) +{ + struct user_desc info; + int idx, ret; + + if (!host_supports_tls) + return -ENOSYS; + + if (get_user(idx, &user_desc->entry_number)) + return -EFAULT; + + ret = get_tls_entry(current, &info, idx); + if (ret < 0) + goto out; + + if (copy_to_user(user_desc, &info, sizeof(info))) + ret = -EFAULT; + +out: + return ret; +} + +/* + * Perform get_thread_area on behalf of the traced child. + */ +int ptrace_get_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc) +{ + struct user_desc info; + int ret; + + if (!host_supports_tls) + return -EIO; + + ret = get_tls_entry(child, &info, idx); + if (ret < 0) + goto out; + + if (copy_to_user(user_desc, &info, sizeof(info))) + ret = -EFAULT; +out: + return ret; +} + +/* + * This code is really i386-only, but it detects and logs x86_64 GDT indexes + * if a 32-bit UML is running on a 64-bit host. + */ +static int __init __setup_host_supports_tls(void) +{ + check_host_supports_tls(&host_supports_tls, &host_gdt_entry_tls_min); + if (host_supports_tls) { + printk(KERN_INFO "Host TLS support detected\n"); + printk(KERN_INFO "Detected host type: "); + switch (host_gdt_entry_tls_min) { + case GDT_ENTRY_TLS_MIN_I386: + printk(KERN_CONT "i386"); + break; + case GDT_ENTRY_TLS_MIN_X86_64: + printk(KERN_CONT "x86_64"); + break; + } + printk(KERN_CONT " (GDT indexes %d to %d)\n", + host_gdt_entry_tls_min, + host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES); + } else + printk(KERN_ERR " Host TLS support NOT detected! " + "TLS support inside UML will not work\n"); + return 0; +} + +__initcall(__setup_host_supports_tls); diff --git a/arch/x86/um/tls_64.c b/arch/x86/um/tls_64.c new file mode 100644 index 0000000..f7ba462 --- /dev/null +++ b/arch/x86/um/tls_64.c @@ -0,0 +1,17 @@ +#include "linux/sched.h" + +void clear_flushed_tls(struct task_struct *task) +{ +} + +int arch_copy_tls(struct task_struct *t) +{ + /* + * If CLONE_SETTLS is set, we need to save the thread id + * (which is argument 5, child_tid, of clone) so it can be set + * during context switches. + */ + t->thread.arch.fs = t->thread.regs.regs.gp[R8 / sizeof(long)]; + + return 0; +} diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c new file mode 100644 index 0000000..3c19c48 --- /dev/null +++ b/arch/x86/um/user-offsets.c @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include +#define __FRAME_OFFSETS +#include +#include + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define DEFINE_LONGS(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val/sizeof(unsigned long))) + +void foo(void) +{ +#ifdef __i386__ + DEFINE_LONGS(HOST_FP_SIZE, sizeof(struct user_fpregs_struct)); + DEFINE_LONGS(HOST_FPX_SIZE, sizeof(struct user_fpxregs_struct)); + + DEFINE(HOST_IP, EIP); + DEFINE(HOST_SP, UESP); + DEFINE(HOST_EFLAGS, EFL); + DEFINE(HOST_EAX, EAX); + DEFINE(HOST_EBX, EBX); + DEFINE(HOST_ECX, ECX); + DEFINE(HOST_EDX, EDX); + DEFINE(HOST_ESI, ESI); + DEFINE(HOST_EDI, EDI); + DEFINE(HOST_EBP, EBP); + DEFINE(HOST_CS, CS); + DEFINE(HOST_SS, SS); + DEFINE(HOST_DS, DS); + DEFINE(HOST_FS, FS); + DEFINE(HOST_ES, ES); + DEFINE(HOST_GS, GS); +#else + DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long)); + DEFINE_LONGS(HOST_RBX, RBX); + DEFINE_LONGS(HOST_RCX, RCX); + DEFINE_LONGS(HOST_RDI, RDI); + DEFINE_LONGS(HOST_RSI, RSI); + DEFINE_LONGS(HOST_RDX, RDX); + DEFINE_LONGS(HOST_RBP, RBP); + DEFINE_LONGS(HOST_RAX, RAX); + DEFINE_LONGS(HOST_R8, R8); + DEFINE_LONGS(HOST_R9, R9); + DEFINE_LONGS(HOST_R10, R10); + DEFINE_LONGS(HOST_R11, R11); + DEFINE_LONGS(HOST_R12, R12); + DEFINE_LONGS(HOST_R13, R13); + DEFINE_LONGS(HOST_R14, R14); + DEFINE_LONGS(HOST_R15, R15); + DEFINE_LONGS(HOST_ORIG_RAX, ORIG_RAX); + DEFINE_LONGS(HOST_CS, CS); + DEFINE_LONGS(HOST_SS, SS); + DEFINE_LONGS(HOST_EFLAGS, EFLAGS); +#if 0 + DEFINE_LONGS(HOST_FS, FS); + DEFINE_LONGS(HOST_GS, GS); + DEFINE_LONGS(HOST_DS, DS); + DEFINE_LONGS(HOST_ES, ES); +#endif + + DEFINE_LONGS(HOST_IP, RIP); + DEFINE_LONGS(HOST_SP, RSP); +#endif + + DEFINE(UM_FRAME_SIZE, sizeof(struct user_regs_struct)); + DEFINE(UM_POLLIN, POLLIN); + DEFINE(UM_POLLPRI, POLLPRI); + DEFINE(UM_POLLOUT, POLLOUT); + + DEFINE(UM_PROT_READ, PROT_READ); + DEFINE(UM_PROT_WRITE, PROT_WRITE); + DEFINE(UM_PROT_EXEC, PROT_EXEC); +} diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile new file mode 100644 index 0000000..5dffe6d --- /dev/null +++ b/arch/x86/um/vdso/Makefile @@ -0,0 +1,90 @@ +# +# Building vDSO images for x86. +# + +VDSO64-y := y + +vdso-install-$(VDSO64-y) += vdso.so + + +# files to link into the vdso +vobjs-y := vdso-note.o um_vdso.o + +# files to link into kernel +obj-$(VDSO64-y) += vdso.o vma.o + +vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) + +$(obj)/vdso.o: $(obj)/vdso.so + +targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) + +export CPPFLAGS_vdso.lds += -P -C + +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ + -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 + +$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so + +$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,vdso) + +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +# +# Don't omit frame pointers for ease of userspace debugging, but do +# optimize sibling calls. +# +CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ + $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ + -fno-omit-frame-pointer -foptimize-sibling-calls + +$(vobjs): KBUILD_CFLAGS += $(CFL) + +# +# vDSO code runs in userspace and -pg doesn't help with profiling anyway. +# +CFLAGS_REMOVE_vdso-note.o = -pg +CFLAGS_REMOVE_um_vdso.o = -pg + +targets += vdso-syms.lds +obj-$(VDSO64-y) += vdso-syms.lds + +# +# Match symbols in the DSO that look like VDSO*; produce a file of constants. +# +sed-vdsosym := -e 's/^00*/0/' \ + -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' +quiet_cmd_vdsosym = VDSOSYM $@ +define cmd_vdsosym + $(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ +endef + +$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE + $(call if_changed,vdsosym) + +# +# The DSO images are built using a special linker script. +# +quiet_cmd_vdso = VDSO $@ + cmd_vdso = $(CC) -nostdlib -o $@ \ + $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ + sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' + +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +GCOV_PROFILE := n + +# +# Install the unstripped copy of vdso*.so listed in $(vdso-install-y). +# +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ +$(vdso-install-y): %.so: $(obj)/%.so.dbg FORCE + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +PHONY += vdso_install $(vdso-install-y) +vdso_install: $(vdso-install-y) diff --git a/arch/x86/um/vdso/checkundef.sh b/arch/x86/um/vdso/checkundef.sh new file mode 100644 index 0000000..7ee90a9 --- /dev/null +++ b/arch/x86/um/vdso/checkundef.sh @@ -0,0 +1,10 @@ +#!/bin/sh +nm="$1" +file="$2" +$nm "$file" | grep '^ *U' > /dev/null 2>&1 +if [ $? -eq 1 ]; then + exit 0 +else + echo "$file: undefined symbols found" >&2 + exit 1 +fi diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c new file mode 100644 index 0000000..7c441b5 --- /dev/null +++ b/arch/x86/um/vdso/um_vdso.c @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2011 Richard Weinberger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This vDSO turns all calls into a syscall so that UML can trap them. + */ + + +/* Disable profiling for userspace code */ +#define DISABLE_BRANCH_PROFILING + +#include +#include +#include + +int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + + return ret; +} +int clock_gettime(clockid_t, struct timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); + +int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + + return ret; +} +int gettimeofday(struct timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); + +time_t __vdso_time(time_t *t) +{ + long secs; + + asm volatile("syscall" + : "=a" (secs) + : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory"); + + return secs; +} +int time(time_t *t) __attribute__((weak, alias("__vdso_time"))); + +long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +{ + /* + * UML does not support SMP, we can cheat here. :) + */ + + if (cpu) + *cpu = 0; + if (node) + *node = 0; + + return 0; +} + +long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) + __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86/um/vdso/vdso-layout.lds.S b/arch/x86/um/vdso/vdso-layout.lds.S new file mode 100644 index 0000000..634a2cf --- /dev/null +++ b/arch/x86/um/vdso/vdso-layout.lds.S @@ -0,0 +1,64 @@ +/* + * Linker script for vDSO. This is an ELF shared object prelinked to + * its virtual address, and with only one read-only segment. + * This script controls its layout. + */ + +SECTIONS +{ + . = VDSO_PRELINK + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { *(.rodata*) } :text + .data : { + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + } + + .altinstructions : { *(.altinstructions) } + .altinstr_replacement : { *(.altinstr_replacement) } + + /* + * Align the actual code well away from the non-instruction data. + * This is the best thing for the I-cache. + */ + . = ALIGN(0x100); + + .text : { *(.text*) } :text =0x90909090 +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} diff --git a/arch/x86/um/vdso/vdso-note.S b/arch/x86/um/vdso/vdso-note.S new file mode 100644 index 0000000..79a071e --- /dev/null +++ b/arch/x86/um/vdso/vdso-note.S @@ -0,0 +1,12 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include +#include +#include + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/x86/um/vdso/vdso.S b/arch/x86/um/vdso/vdso.S new file mode 100644 index 0000000..1cb468a --- /dev/null +++ b/arch/x86/um/vdso/vdso.S @@ -0,0 +1,10 @@ +#include + +__INITDATA + + .globl vdso_start, vdso_end +vdso_start: + .incbin "arch/x86/um/vdso/vdso.so" +vdso_end: + +__FINIT diff --git a/arch/x86/um/vdso/vdso.lds.S b/arch/x86/um/vdso/vdso.lds.S new file mode 100644 index 0000000..b96b267 --- /dev/null +++ b/arch/x86/um/vdso/vdso.lds.S @@ -0,0 +1,32 @@ +/* + * Linker script for 64-bit vDSO. + * We #include the file to define the layout details. + * Here we only choose the prelinked virtual address. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. We can define local symbols here called VDSO* to make their + * values visible using the asm-x86/vdso.h macros from the kernel proper. + */ + +#define VDSO_PRELINK 0xffffffffff700000 +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + time; + __vdso_time; + local: *; + }; +} + +VDSO64_PRELINK = VDSO_PRELINK; diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c new file mode 100644 index 0000000..9495c8d --- /dev/null +++ b/arch/x86/um/vdso/vma.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2011 Richard Weinberger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +unsigned int __read_mostly vdso_enabled = 1; +unsigned long um_vdso_addr; + +extern unsigned long task_size; +extern char vdso_start[], vdso_end[]; + +static struct page **vdsop; + +static int __init init_vdso(void) +{ + struct page *um_vdso; + + BUG_ON(vdso_end - vdso_start > PAGE_SIZE); + + um_vdso_addr = task_size - PAGE_SIZE; + + vdsop = kmalloc(GFP_KERNEL, sizeof(struct page *)); + if (!vdsop) + goto oom; + + um_vdso = alloc_page(GFP_KERNEL); + if (!um_vdso) { + kfree(vdsop); + + goto oom; + } + + copy_page(page_address(um_vdso), vdso_start); + *vdsop = um_vdso; + + return 0; + +oom: + printk(KERN_ERR "Cannot allocate vdso\n"); + vdso_enabled = 0; + + return -ENOMEM; +} +subsys_initcall(init_vdso); + +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + int err; + struct mm_struct *mm = current->mm; + + if (!vdso_enabled) + return 0; + + down_write(&mm->mmap_sem); + + err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_ALWAYSDUMP, + vdsop); + + up_write(&mm->mmap_sem); + + return err; +} -- cgit v1.1 From 0acdbbeb6dd435f5f3f1648fc3a2ab5fd07b5545 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:06:49 +0100 Subject: um: bury unused macros around ptrace.h Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/shared/sysdep/ptrace_32.h | 3 --- arch/x86/um/shared/sysdep/ptrace_64.h | 3 --- 2 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/shared/sysdep/ptrace_32.h b/arch/x86/um/shared/sysdep/ptrace_32.h index ce77fa1..bad747a 100644 --- a/arch/x86/um/shared/sysdep/ptrace_32.h +++ b/arch/x86/um/shared/sysdep/ptrace_32.h @@ -101,9 +101,6 @@ struct syscall_args { UPT_SYSCALL_ARG5(r), \ UPT_SYSCALL_ARG6(r) } } ) -#define UPT_SET_SYSCALL_RETURN(r, res) \ - REGS_SET_SYSCALL_RETURN((r)->regs, (res)) - #define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp) #define UPT_ORIG_SYSCALL(r) UPT_EAX(r) diff --git a/arch/x86/um/shared/sysdep/ptrace_64.h b/arch/x86/um/shared/sysdep/ptrace_64.h index 866fe7e..a360fe2 100644 --- a/arch/x86/um/shared/sysdep/ptrace_64.h +++ b/arch/x86/um/shared/sysdep/ptrace_64.h @@ -146,9 +146,6 @@ struct syscall_args { UPT_SYSCALL_ARG5(r), \ UPT_SYSCALL_ARG6(r) } } ) -#define UPT_SET_SYSCALL_RETURN(r, res) \ - REGS_SET_SYSCALL_RETURN((r)->regs, (res)) - #define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp) #define UPT_FAULTINFO(r) (&(r)->faultinfo) -- cgit v1.1 From 5ade8878e03a9a298a71efbf2895aa482e45448a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:07:39 +0100 Subject: um: kill shared/task.h and HOST_TASK_REGS Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/bugs_32.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/bugs_32.c b/arch/x86/um/bugs_32.c index 7058e1f..a1fba5f 100644 --- a/arch/x86/um/bugs_32.c +++ b/arch/x86/um/bugs_32.c @@ -6,13 +6,15 @@ #include #include "kern_util.h" #include "longjmp.h" -#include "task.h" #include "sysdep/ptrace.h" +#include /* Set during early boot */ static int host_has_cmov = 1; static jmp_buf cmov_test_return; +#define TASK_PID(task) *((int *) &(((char *) (task))[HOST_TASK_PID])) + static void cmov_sigill_test_handler(int sig) { host_has_cmov = 0; -- cgit v1.1 From 09e129a6038ead049b3ee509475420963f052a80 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:09:19 +0100 Subject: um: merge tls_{32,64}.h Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/shared/sysdep/tls.h | 38 +++++++++++++++++++++++++++++++++++--- arch/x86/um/shared/sysdep/tls_32.h | 32 -------------------------------- arch/x86/um/shared/sysdep/tls_64.h | 29 ----------------------------- 3 files changed, 35 insertions(+), 64 deletions(-) delete mode 100644 arch/x86/um/shared/sysdep/tls_32.h delete mode 100644 arch/x86/um/shared/sysdep/tls_64.h (limited to 'arch/x86') diff --git a/arch/x86/um/shared/sysdep/tls.h b/arch/x86/um/shared/sysdep/tls.h index 4d8f752..db2ae9c 100644 --- a/arch/x86/um/shared/sysdep/tls.h +++ b/arch/x86/um/shared/sysdep/tls.h @@ -1,5 +1,37 @@ +#ifndef _SYSDEP_TLS_H +#define _SYSDEP_TLS_H + +# ifndef __KERNEL__ + +/* Change name to avoid conflicts with the original one from , which + * may be named user_desc (but in 2.4 and in header matching its API was named + * modify_ldt_ldt_s). */ + +typedef struct um_dup_user_desc { + unsigned int entry_number; + unsigned int base_addr; + unsigned int limit; + unsigned int seg_32bit:1; + unsigned int contents:2; + unsigned int read_exec_only:1; + unsigned int limit_in_pages:1; + unsigned int seg_not_present:1; + unsigned int useable:1; +#ifdef __x86_64__ + unsigned int lm:1; +#endif +} user_desc_t; + +# else /* __KERNEL__ */ + +# include +typedef struct user_desc user_desc_t; + +# endif /* __KERNEL__ */ + #ifdef __i386__ -#include "tls_32.h" -#else -#include "tls_64.h" +#define GDT_ENTRY_TLS_MIN_I386 6 +#define GDT_ENTRY_TLS_MIN_X86_64 12 #endif + +#endif /* _SYSDEP_TLS_H */ diff --git a/arch/x86/um/shared/sysdep/tls_32.h b/arch/x86/um/shared/sysdep/tls_32.h deleted file mode 100644 index 3455075..0000000 --- a/arch/x86/um/shared/sysdep/tls_32.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _SYSDEP_TLS_H -#define _SYSDEP_TLS_H - -# ifndef __KERNEL__ - -/* Change name to avoid conflicts with the original one from , which - * may be named user_desc (but in 2.4 and in header matching its API was named - * modify_ldt_ldt_s). */ - -typedef struct um_dup_user_desc { - unsigned int entry_number; - unsigned int base_addr; - unsigned int limit; - unsigned int seg_32bit:1; - unsigned int contents:2; - unsigned int read_exec_only:1; - unsigned int limit_in_pages:1; - unsigned int seg_not_present:1; - unsigned int useable:1; -} user_desc_t; - -# else /* __KERNEL__ */ - -# include -typedef struct user_desc user_desc_t; - -# endif /* __KERNEL__ */ - -#define GDT_ENTRY_TLS_MIN_I386 6 -#define GDT_ENTRY_TLS_MIN_X86_64 12 - -#endif /* _SYSDEP_TLS_H */ diff --git a/arch/x86/um/shared/sysdep/tls_64.h b/arch/x86/um/shared/sysdep/tls_64.h deleted file mode 100644 index 18c000d..0000000 --- a/arch/x86/um/shared/sysdep/tls_64.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _SYSDEP_TLS_H -#define _SYSDEP_TLS_H - -# ifndef __KERNEL__ - -/* Change name to avoid conflicts with the original one from , which - * may be named user_desc (but in 2.4 and in header matching its API was named - * modify_ldt_ldt_s). */ - -typedef struct um_dup_user_desc { - unsigned int entry_number; - unsigned int base_addr; - unsigned int limit; - unsigned int seg_32bit:1; - unsigned int contents:2; - unsigned int read_exec_only:1; - unsigned int limit_in_pages:1; - unsigned int seg_not_present:1; - unsigned int useable:1; - unsigned int lm:1; -} user_desc_t; - -# else /* __KERNEL__ */ - -# include -typedef struct user_desc user_desc_t; - -# endif /* __KERNEL__ */ -#endif /* _SYSDEP_TLS_H */ -- cgit v1.1 From 2014d01878a8c38111eba3333f0d70ceb91f0bb7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:09:29 +0100 Subject: um: merge host_ldt_{32,64}.h Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/shared/sysdep/host_ldt.h | 38 ++++++++++++++++++++++++++++++--- arch/x86/um/shared/sysdep/host_ldt_32.h | 34 ----------------------------- arch/x86/um/shared/sysdep/host_ldt_64.h | 38 --------------------------------- 3 files changed, 35 insertions(+), 75 deletions(-) delete mode 100644 arch/x86/um/shared/sysdep/host_ldt_32.h delete mode 100644 arch/x86/um/shared/sysdep/host_ldt_64.h (limited to 'arch/x86') diff --git a/arch/x86/um/shared/sysdep/host_ldt.h b/arch/x86/um/shared/sysdep/host_ldt.h index 94518b3..246ff7a 100644 --- a/arch/x86/um/shared/sysdep/host_ldt.h +++ b/arch/x86/um/shared/sysdep/host_ldt.h @@ -1,5 +1,37 @@ -#ifdef __i386__ -#include "host_ldt_32.h" +#ifndef __ASM_HOST_LDT_H +#define __ASM_HOST_LDT_H + +#include + +#define LDT_entry_a(info) \ + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) + +#define LDT_entry_b(info) \ + (((info)->base_addr & 0xff000000) | \ + (((info)->base_addr & 0x00ff0000) >> 16) | \ + ((info)->limit & 0xf0000) | \ + (((info)->read_exec_only ^ 1) << 9) | \ + ((info)->contents << 10) | \ + (((info)->seg_not_present ^ 1) << 15) | \ + ((info)->seg_32bit << 22) | \ + ((info)->limit_in_pages << 23) | \ + ((info)->useable << 20) | \ + 0x7000) + +#define _LDT_empty(info) (\ + (info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0 ) + +#ifdef CONFIG_X86_64 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) #else -#include "host_ldt_64.h" +#define LDT_empty(info) (_LDT_empty(info)) +#endif + #endif diff --git a/arch/x86/um/shared/sysdep/host_ldt_32.h b/arch/x86/um/shared/sysdep/host_ldt_32.h deleted file mode 100644 index 0953cc4..0000000 --- a/arch/x86/um/shared/sysdep/host_ldt_32.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef __ASM_HOST_LDT_I386_H -#define __ASM_HOST_LDT_I386_H - -#include - -/* - * macros stolen from include/asm-i386/desc.h - */ -#define LDT_entry_a(info) \ - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) - -#define LDT_entry_b(info) \ - (((info)->base_addr & 0xff000000) | \ - (((info)->base_addr & 0x00ff0000) >> 16) | \ - ((info)->limit & 0xf0000) | \ - (((info)->read_exec_only ^ 1) << 9) | \ - ((info)->contents << 10) | \ - (((info)->seg_not_present ^ 1) << 15) | \ - ((info)->seg_32bit << 22) | \ - ((info)->limit_in_pages << 23) | \ - ((info)->useable << 20) | \ - 0x7000) - -#define LDT_empty(info) (\ - (info)->base_addr == 0 && \ - (info)->limit == 0 && \ - (info)->contents == 0 && \ - (info)->read_exec_only == 1 && \ - (info)->seg_32bit == 0 && \ - (info)->limit_in_pages == 0 && \ - (info)->seg_not_present == 1 && \ - (info)->useable == 0 ) - -#endif diff --git a/arch/x86/um/shared/sysdep/host_ldt_64.h b/arch/x86/um/shared/sysdep/host_ldt_64.h deleted file mode 100644 index e8b1be1..0000000 --- a/arch/x86/um/shared/sysdep/host_ldt_64.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef __ASM_HOST_LDT_X86_64_H -#define __ASM_HOST_LDT_X86_64_H - -#include - -/* - * macros stolen from include/asm-x86_64/desc.h - */ -#define LDT_entry_a(info) \ - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) - -/* Don't allow setting of the lm bit. It is useless anyways because - * 64bit system calls require __USER_CS. */ -#define LDT_entry_b(info) \ - (((info)->base_addr & 0xff000000) | \ - (((info)->base_addr & 0x00ff0000) >> 16) | \ - ((info)->limit & 0xf0000) | \ - (((info)->read_exec_only ^ 1) << 9) | \ - ((info)->contents << 10) | \ - (((info)->seg_not_present ^ 1) << 15) | \ - ((info)->seg_32bit << 22) | \ - ((info)->limit_in_pages << 23) | \ - ((info)->useable << 20) | \ - /* ((info)->lm << 21) | */ \ - 0x7000) - -#define LDT_empty(info) (\ - (info)->base_addr == 0 && \ - (info)->limit == 0 && \ - (info)->contents == 0 && \ - (info)->read_exec_only == 1 && \ - (info)->seg_32bit == 0 && \ - (info)->limit_in_pages == 0 && \ - (info)->seg_not_present == 1 && \ - (info)->useable == 0 && \ - (info)->lm == 0) - -#endif -- cgit v1.1 From c5cc32fe14ccbc19484202d20cf7d6bad45e3567 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:09:39 +0100 Subject: um: move asm/desc.h into arch/x86/um/asm its only purpose is to shadow the x86 asm/desc.h Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/asm/desc.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 arch/x86/um/asm/desc.h (limited to 'arch/x86') diff --git a/arch/x86/um/asm/desc.h b/arch/x86/um/asm/desc.h new file mode 100644 index 0000000..4ec34a5 --- /dev/null +++ b/arch/x86/um/asm/desc.h @@ -0,0 +1,16 @@ +#ifndef __UM_DESC_H +#define __UM_DESC_H + +/* Taken from asm-i386/desc.h, it's the only thing we need. The rest wouldn't + * compile, and has never been used. */ +#define LDT_empty(info) (\ + (info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0 ) + +#endif -- cgit v1.1 From 1bbd5f21f426d99660ea8120b79595a282e5ff8a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:09:49 +0100 Subject: um: merge os-Linux/tls.c into arch/x86/um/os-Linux/tls.c it's i386-specific; moreover, analogs on other targets have incompatible interface - PTRACE_GET_THREAD_AREA does exist elsewhere, but struct user_desc does *not* Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/os-Linux/tls.c | 34 +++++++++++++++++++++++++++++++++- arch/x86/um/shared/sysdep/tls.h | 3 +++ 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/os-Linux/tls.c b/arch/x86/um/os-Linux/tls.c index 281e83e..82276b6 100644 --- a/arch/x86/um/os-Linux/tls.c +++ b/arch/x86/um/os-Linux/tls.c @@ -1,15 +1,25 @@ #include #include +#include #include #include #include "sysdep/tls.h" +#ifndef PTRACE_GET_THREAD_AREA +#define PTRACE_GET_THREAD_AREA 25 +#endif + +#ifndef PTRACE_SET_THREAD_AREA +#define PTRACE_SET_THREAD_AREA 26 +#endif + /* Checks whether host supports TLS, and sets *tls_min according to the value * valid on the host. * i386 host have it == 6; x86_64 host have it == 12, for i386 emulation. */ -void check_host_supports_tls(int *supports_tls, int *tls_min) { +void check_host_supports_tls(int *supports_tls, int *tls_min) +{ /* Values for x86 and x86_64.*/ int val[] = {GDT_ENTRY_TLS_MIN_I386, GDT_ENTRY_TLS_MIN_X86_64}; int i; @@ -33,3 +43,25 @@ void check_host_supports_tls(int *supports_tls, int *tls_min) { *supports_tls = 0; } + +int os_set_thread_area(user_desc_t *info, int pid) +{ + int ret; + + ret = ptrace(PTRACE_SET_THREAD_AREA, pid, info->entry_number, + (unsigned long) info); + if (ret < 0) + ret = -errno; + return ret; +} + +int os_get_thread_area(user_desc_t *info, int pid) +{ + int ret; + + ret = ptrace(PTRACE_GET_THREAD_AREA, pid, info->entry_number, + (unsigned long) info); + if (ret < 0) + ret = -errno; + return ret; +} diff --git a/arch/x86/um/shared/sysdep/tls.h b/arch/x86/um/shared/sysdep/tls.h index db2ae9c..f2f30bd 100644 --- a/arch/x86/um/shared/sysdep/tls.h +++ b/arch/x86/um/shared/sysdep/tls.h @@ -29,6 +29,9 @@ typedef struct user_desc user_desc_t; # endif /* __KERNEL__ */ +extern int os_set_thread_area(user_desc_t *info, int pid); +extern int os_get_thread_area(user_desc_t *info, int pid); + #ifdef __i386__ #define GDT_ENTRY_TLS_MIN_I386 6 #define GDT_ENTRY_TLS_MIN_X86_64 12 -- cgit v1.1 From 8edc4147bec5b7b92b48a6013ec99c41b16c5f93 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:09:59 +0100 Subject: um: sanitize paths in sys_call_table* includes Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/sys_call_table_32.S | 2 +- arch/x86/um/sys_call_table_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S index de274071..c3431cf 100644 --- a/arch/x86/um/sys_call_table_32.S +++ b/arch/x86/um/sys_call_table_32.S @@ -22,7 +22,7 @@ .section .rodata,"a" -#include "../../x86/kernel/syscall_table_32.S" +#include "../kernel/syscall_table_32.S" ENTRY(syscall_table_size) .long .-sys_call_table diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index f46de82..99522f7 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -58,7 +58,7 @@ extern void sys_ni_syscall(void); */ sys_call_ptr_t sys_call_table[] __cacheline_aligned = { -#include "../../x86/include/asm/unistd_64.h" +#include }; int syscall_table_size = sizeof(sys_call_table); -- cgit v1.1 From 3579a389730dd74d9f280152c52aa851dd1da860 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:10:09 +0100 Subject: um: merge HOST_... of registers common on i386 and amd64 Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/ptrace_32.c | 14 +++++++------- arch/x86/um/ptrace_64.c | 14 +++++++------- arch/x86/um/shared/sysdep/ptrace_32.h | 14 +++++++------- arch/x86/um/shared/sysdep/ptrace_64.h | 14 +++++++------- arch/x86/um/signal_32.c | 14 +++++++------- arch/x86/um/signal_64.c | 28 ++++++++++++++-------------- arch/x86/um/user-offsets.c | 28 ++++++++++++++-------------- 7 files changed, 63 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index a174fde..3b949daa 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -51,13 +51,13 @@ int is_syscall(unsigned long addr) #define FLAG_MASK 0x00044dd5 static const int reg_offsets[] = { - [EBX] = HOST_EBX, - [ECX] = HOST_ECX, - [EDX] = HOST_EDX, - [ESI] = HOST_ESI, - [EDI] = HOST_EDI, - [EBP] = HOST_EBP, - [EAX] = HOST_EAX, + [EBX] = HOST_BX, + [ECX] = HOST_CX, + [EDX] = HOST_DX, + [ESI] = HOST_SI, + [EDI] = HOST_DI, + [EBP] = HOST_BP, + [EAX] = HOST_AX, [DS] = HOST_DS, [ES] = HOST_ES, [FS] = HOST_FS, diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 44e68e0..6e6343e 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -30,13 +30,13 @@ static const int reg_offsets[] = [R15 >> 3] = HOST_R15, [RIP >> 3] = HOST_IP, [RSP >> 3] = HOST_SP, - [RAX >> 3] = HOST_RAX, - [RBX >> 3] = HOST_RBX, - [RCX >> 3] = HOST_RCX, - [RDX >> 3] = HOST_RDX, - [RSI >> 3] = HOST_RSI, - [RDI >> 3] = HOST_RDI, - [RBP >> 3] = HOST_RBP, + [RAX >> 3] = HOST_AX, + [RBX >> 3] = HOST_BX, + [RCX >> 3] = HOST_CX, + [RDX >> 3] = HOST_DX, + [RSI >> 3] = HOST_SI, + [RDI >> 3] = HOST_DI, + [RBP >> 3] = HOST_BP, [CS >> 3] = HOST_CS, [SS >> 3] = HOST_SS, [FS_BASE >> 3] = HOST_FS_BASE, diff --git a/arch/x86/um/shared/sysdep/ptrace_32.h b/arch/x86/um/shared/sysdep/ptrace_32.h index bad747a..befd1df 100644 --- a/arch/x86/um/shared/sysdep/ptrace_32.h +++ b/arch/x86/um/shared/sysdep/ptrace_32.h @@ -27,13 +27,13 @@ extern int sysemu_supported; #define REGS_IP(r) ((r)[HOST_IP]) #define REGS_SP(r) ((r)[HOST_SP]) #define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) -#define REGS_EAX(r) ((r)[HOST_EAX]) -#define REGS_EBX(r) ((r)[HOST_EBX]) -#define REGS_ECX(r) ((r)[HOST_ECX]) -#define REGS_EDX(r) ((r)[HOST_EDX]) -#define REGS_ESI(r) ((r)[HOST_ESI]) -#define REGS_EDI(r) ((r)[HOST_EDI]) -#define REGS_EBP(r) ((r)[HOST_EBP]) +#define REGS_EAX(r) ((r)[HOST_AX]) +#define REGS_EBX(r) ((r)[HOST_BX]) +#define REGS_ECX(r) ((r)[HOST_CX]) +#define REGS_EDX(r) ((r)[HOST_DX]) +#define REGS_ESI(r) ((r)[HOST_SI]) +#define REGS_EDI(r) ((r)[HOST_DI]) +#define REGS_EBP(r) ((r)[HOST_BP]) #define REGS_CS(r) ((r)[HOST_CS]) #define REGS_SS(r) ((r)[HOST_SS]) #define REGS_DS(r) ((r)[HOST_DS]) diff --git a/arch/x86/um/shared/sysdep/ptrace_64.h b/arch/x86/um/shared/sysdep/ptrace_64.h index a360fe2..430dedc 100644 --- a/arch/x86/um/shared/sysdep/ptrace_64.h +++ b/arch/x86/um/shared/sysdep/ptrace_64.h @@ -17,13 +17,13 @@ #define REGS_IP(r) ((r)[HOST_IP]) #define REGS_SP(r) ((r)[HOST_SP]) -#define REGS_RBX(r) ((r)[HOST_RBX]) -#define REGS_RCX(r) ((r)[HOST_RCX]) -#define REGS_RDX(r) ((r)[HOST_RDX]) -#define REGS_RSI(r) ((r)[HOST_RSI]) -#define REGS_RDI(r) ((r)[HOST_RDI]) -#define REGS_RBP(r) ((r)[HOST_RBP]) -#define REGS_RAX(r) ((r)[HOST_RAX]) +#define REGS_RBX(r) ((r)[HOST_BX]) +#define REGS_RCX(r) ((r)[HOST_CX]) +#define REGS_RDX(r) ((r)[HOST_DX]) +#define REGS_RSI(r) ((r)[HOST_SI]) +#define REGS_RDI(r) ((r)[HOST_DI]) +#define REGS_RBP(r) ((r)[HOST_BP]) +#define REGS_RAX(r) ((r)[HOST_AX]) #define REGS_R8(r) ((r)[HOST_R8]) #define REGS_R9(r) ((r)[HOST_R9]) #define REGS_R10(r) ((r)[HOST_R10]) diff --git a/arch/x86/um/signal_32.c b/arch/x86/um/signal_32.c index bcbfb0d..2eebdc0 100644 --- a/arch/x86/um/signal_32.c +++ b/arch/x86/um/signal_32.c @@ -160,14 +160,14 @@ static int copy_sc_from_user(struct pt_regs *regs, GETREG(FS, fs); GETREG(ES, es); GETREG(DS, ds); - GETREG(EDI, di); - GETREG(ESI, si); - GETREG(EBP, bp); + GETREG(DI, di); + GETREG(SI, si); + GETREG(BP, bp); GETREG(SP, sp); - GETREG(EBX, bx); - GETREG(EDX, dx); - GETREG(ECX, cx); - GETREG(EAX, ax); + GETREG(BX, bx); + GETREG(DX, dx); + GETREG(CX, cx); + GETREG(AX, ax); GETREG(IP, ip); GETREG(CS, cs); GETREG(EFLAGS, flags); diff --git a/arch/x86/um/signal_64.c b/arch/x86/um/signal_64.c index 255b2ca..4e5b9b0 100644 --- a/arch/x86/um/signal_64.c +++ b/arch/x86/um/signal_64.c @@ -35,13 +35,13 @@ static int copy_sc_from_user(struct pt_regs *regs, GETREG(R13, r13); GETREG(R14, r14); GETREG(R15, r15); - GETREG(RDI, di); - GETREG(RSI, si); - GETREG(RBP, bp); - GETREG(RBX, bx); - GETREG(RDX, dx); - GETREG(RAX, ax); - GETREG(RCX, cx); + GETREG(DI, di); + GETREG(SI, si); + GETREG(BP, bp); + GETREG(BX, bx); + GETREG(DX, dx); + GETREG(AX, ax); + GETREG(CX, cx); GETREG(SP, sp); GETREG(IP, ip); GETREG(EFLAGS, flags); @@ -78,18 +78,18 @@ static int copy_sc_to_user(struct sigcontext __user *to, #define PUTREG(regno, regname) sc.regname = regs->regs.gp[HOST_##regno] - PUTREG(RDI, di); - PUTREG(RSI, si); - PUTREG(RBP, bp); + PUTREG(DI, di); + PUTREG(SI, si); + PUTREG(BP, bp); /* * Must use original RSP, which is passed in, rather than what's in * signal frame. */ sc.sp = sp; - PUTREG(RBX, bx); - PUTREG(RDX, dx); - PUTREG(RCX, cx); - PUTREG(RAX, ax); + PUTREG(BX, bx); + PUTREG(DX, dx); + PUTREG(CX, cx); + PUTREG(AX, ax); PUTREG(R8, r8); PUTREG(R9, r9); PUTREG(R10, r10); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 3c19c48..88d1255 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -23,13 +23,13 @@ void foo(void) DEFINE(HOST_IP, EIP); DEFINE(HOST_SP, UESP); DEFINE(HOST_EFLAGS, EFL); - DEFINE(HOST_EAX, EAX); - DEFINE(HOST_EBX, EBX); - DEFINE(HOST_ECX, ECX); - DEFINE(HOST_EDX, EDX); - DEFINE(HOST_ESI, ESI); - DEFINE(HOST_EDI, EDI); - DEFINE(HOST_EBP, EBP); + DEFINE(HOST_AX, EAX); + DEFINE(HOST_BX, EBX); + DEFINE(HOST_CX, ECX); + DEFINE(HOST_DX, EDX); + DEFINE(HOST_SI, ESI); + DEFINE(HOST_DI, EDI); + DEFINE(HOST_BP, EBP); DEFINE(HOST_CS, CS); DEFINE(HOST_SS, SS); DEFINE(HOST_DS, DS); @@ -38,13 +38,13 @@ void foo(void) DEFINE(HOST_GS, GS); #else DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long)); - DEFINE_LONGS(HOST_RBX, RBX); - DEFINE_LONGS(HOST_RCX, RCX); - DEFINE_LONGS(HOST_RDI, RDI); - DEFINE_LONGS(HOST_RSI, RSI); - DEFINE_LONGS(HOST_RDX, RDX); - DEFINE_LONGS(HOST_RBP, RBP); - DEFINE_LONGS(HOST_RAX, RAX); + DEFINE_LONGS(HOST_BX, RBX); + DEFINE_LONGS(HOST_CX, RCX); + DEFINE_LONGS(HOST_DI, RDI); + DEFINE_LONGS(HOST_SI, RSI); + DEFINE_LONGS(HOST_DX, RDX); + DEFINE_LONGS(HOST_BP, RBP); + DEFINE_LONGS(HOST_AX, RAX); DEFINE_LONGS(HOST_R8, R8); DEFINE_LONGS(HOST_R9, R9); DEFINE_LONGS(HOST_R10, R10); -- cgit v1.1 From c7ea591c91162a203a5961d69f5c86a6ef9d50c1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:10:19 +0100 Subject: um: increase stack growth cushion in pagefault analog of [PATCH] i386: let usermode execute the "enter" instruction from circa 2006. Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/asm/processor.h | 3 +++ arch/x86/um/asm/processor_32.h | 3 --- arch/x86/um/asm/processor_64.h | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index d3ac1ce..f34c4b2 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -10,6 +10,9 @@ # include "processor_64.h" #endif +#define ARCH_IS_STACKGROW(address) \ + (address + 65536 + 32 * sizeof(unsigned long) >= UPT_SP(¤t->thread.regs.regs)) + #include #endif diff --git a/arch/x86/um/asm/processor_32.h b/arch/x86/um/asm/processor_32.h index ae0d189..e5b72fa 100644 --- a/arch/x86/um/asm/processor_32.h +++ b/arch/x86/um/asm/processor_32.h @@ -63,9 +63,6 @@ static inline void rep_nop(void) #define current_text_addr() \ ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; }) -#define ARCH_IS_STACKGROW(address) \ - (address + 32 >= UPT_SP(¤t->thread.regs.regs)) - #define KSTK_EIP(tsk) KSTK_REG(tsk, EIP) #define KSTK_ESP(tsk) KSTK_REG(tsk, UESP) #define KSTK_EBP(tsk) KSTK_REG(tsk, EBP) diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h index 6db812b..0186c61 100644 --- a/arch/x86/um/asm/processor_64.h +++ b/arch/x86/um/asm/processor_64.h @@ -42,9 +42,6 @@ static inline void arch_copy_thread(struct arch_thread *from, #define current_text_addr() \ ({ void *pc; __asm__("movq $1f,%0\n1:":"=g" (pc)); pc; }) -#define ARCH_IS_STACKGROW(address) \ - (address + 128 >= UPT_SP(¤t->thread.regs.regs)) - #define KSTK_EIP(tsk) KSTK_REG(tsk, RIP) #define KSTK_ESP(tsk) KSTK_REG(tsk, RSP) -- cgit v1.1 From fbe9868693d9025753427d7d28b9eed4d01cf674 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:10:29 +0100 Subject: um: no need to play with save_sp in signal frame setup anymore Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/signal_32.c | 39 ++++++++------------------------------- arch/x86/um/signal_64.c | 33 +++++++-------------------------- 2 files changed, 15 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/signal_32.c b/arch/x86/um/signal_32.c index 2eebdc0..7a206d8 100644 --- a/arch/x86/um/signal_32.c +++ b/arch/x86/um/signal_32.c @@ -215,8 +215,7 @@ static int copy_sc_from_user(struct pt_regs *regs, } static int copy_sc_to_user(struct sigcontext __user *to, - struct _fpstate __user *to_fp, struct pt_regs *regs, - unsigned long sp) + struct _fpstate __user *to_fp, struct pt_regs *regs) { struct sigcontext sc; struct faultinfo * fi = ¤t->thread.arch.faultinfo; @@ -230,7 +229,7 @@ static int copy_sc_to_user(struct sigcontext __user *to, sc.di = REGS_EDI(regs->regs.gp); sc.si = REGS_ESI(regs->regs.gp); sc.bp = REGS_EBP(regs->regs.gp); - sc.sp = sp; + sc.sp = REGS_SP(regs->regs.gp); sc.bx = REGS_EBX(regs->regs.gp); sc.dx = REGS_EDX(regs->regs.gp); sc.cx = REGS_ECX(regs->regs.gp); @@ -291,7 +290,7 @@ static int copy_ucontext_to_user(struct ucontext __user *uc, err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp); err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags); err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size); - err |= copy_sc_to_user(&uc->uc_mcontext, fp, ¤t->thread.regs, sp); + err |= copy_sc_to_user(&uc->uc_mcontext, fp, ¤t->thread.regs); err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set)); return err; } @@ -324,7 +323,6 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, { struct sigframe __user *frame; void __user *restorer; - unsigned long save_sp = PT_REGS_SP(regs); int err = 0; /* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */ @@ -337,19 +335,9 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; - /* Update SP now because the page fault handler refuses to extend - * the stack if the faulting address is too far below the current - * SP, which frame now certainly is. If there's an error, the original - * value is restored on the way out. - * When writing the sigcontext to the stack, we have to write the - * original value, so that's passed to copy_sc_to_user, which does - * the right thing with it. - */ - PT_REGS_SP(regs) = (unsigned long) frame; - err |= __put_user(restorer, &frame->pretcode); err |= __put_user(sig, &frame->sig); - err |= copy_sc_to_user(&frame->sc, NULL, regs, save_sp); + err |= copy_sc_to_user(&frame->sc, NULL, regs); err |= __put_user(mask->sig[0], &frame->sc.oldmask); if (_NSIG_WORDS > 1) err |= __copy_to_user(&frame->extramask, &mask->sig[1], @@ -367,7 +355,7 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); if (err) - goto err; + return err; PT_REGS_SP(regs) = (unsigned long) frame; PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; @@ -378,10 +366,6 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ptrace_notify(SIGTRAP); return 0; - -err: - PT_REGS_SP(regs) = save_sp; - return err; } int setup_signal_stack_si(unsigned long stack_top, int sig, @@ -390,7 +374,6 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, { struct rt_sigframe __user *frame; void __user *restorer; - unsigned long save_sp = PT_REGS_SP(regs); int err = 0; stack_top &= -8UL; @@ -402,16 +385,13 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; - /* See comment above about why this is here */ - PT_REGS_SP(regs) = (unsigned long) frame; - err |= __put_user(restorer, &frame->pretcode); err |= __put_user(sig, &frame->sig); err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); err |= copy_siginfo_to_user(&frame->info, info); err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask, - save_sp); + PT_REGS_SP(regs)); /* * This is movl $,%eax ; int $0x80 @@ -425,8 +405,9 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); if (err) - goto err; + return err; + PT_REGS_SP(regs) = (unsigned long) frame; PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; PT_REGS_EAX(regs) = (unsigned long) sig; PT_REGS_EDX(regs) = (unsigned long) &frame->info; @@ -435,10 +416,6 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ptrace_notify(SIGTRAP); return 0; - -err: - PT_REGS_SP(regs) = save_sp; - return err; } long sys_sigreturn(struct pt_regs regs) diff --git a/arch/x86/um/signal_64.c b/arch/x86/um/signal_64.c index 4e5b9b0..74c2598 100644 --- a/arch/x86/um/signal_64.c +++ b/arch/x86/um/signal_64.c @@ -68,7 +68,7 @@ static int copy_sc_from_user(struct pt_regs *regs, static int copy_sc_to_user(struct sigcontext __user *to, struct _fpstate __user *to_fp, struct pt_regs *regs, - unsigned long mask, unsigned long sp) + unsigned long mask) { struct faultinfo * fi = ¤t->thread.arch.faultinfo; struct sigcontext sc; @@ -81,11 +81,7 @@ static int copy_sc_to_user(struct sigcontext __user *to, PUTREG(DI, di); PUTREG(SI, si); PUTREG(BP, bp); - /* - * Must use original RSP, which is passed in, rather than what's in - * signal frame. - */ - sc.sp = sp; + PUTREG(SP, sp); PUTREG(BX, bx); PUTREG(DX, dx); PUTREG(CX, cx); @@ -141,7 +137,6 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, siginfo_t *info, sigset_t *set) { struct rt_sigframe __user *frame; - unsigned long save_sp = PT_REGS_RSP(regs); int err = 0; struct task_struct *me = current; @@ -159,26 +154,15 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, goto out; } - /* - * Update SP now because the page fault handler refuses to extend - * the stack if the faulting address is too far below the current - * SP, which frame now certainly is. If there's an error, the original - * value is restored on the way out. - * When writing the sigcontext to the stack, we have to write the - * original value, so that's passed to copy_sc_to_user, which does - * the right thing with it. - */ - PT_REGS_RSP(regs) = (unsigned long) frame; - /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(save_sp), + err |= __put_user(sas_ss_flags(PT_REGS_RSP(regs)), &frame->uc.uc_stack.ss_flags); err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, - set->sig[0], save_sp); + set->sig[0]); err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); if (sizeof(*set) == 16) { __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); @@ -197,10 +181,10 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); else /* could use a vstub here */ - goto restore_sp; + return err; if (err) - goto restore_sp; + return err; /* Set up registers for signal handler */ { @@ -209,6 +193,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, sig = ed->signal_invmap[sig]; } + PT_REGS_RSP(regs) = (unsigned long) frame; PT_REGS_RDI(regs) = sig; /* In case the signal handler was declared without prototypes */ PT_REGS_RAX(regs) = 0; @@ -222,10 +207,6 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, PT_REGS_RIP(regs) = (unsigned long) ka->sa.sa_handler; out: return err; - -restore_sp: - PT_REGS_RSP(regs) = save_sp; - return err; } long sys_rt_sigreturn(struct pt_regs *regs) -- cgit v1.1 From f67aa2ffb7ce2f6d88e2e7a8069309dc2627932e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:10:39 +0100 Subject: um: merge signal_{32,64}.c Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/Makefile | 2 +- arch/x86/um/asm/ptrace_64.h | 2 +- arch/x86/um/signal.c | 624 ++++++++++++++++++++++++++++++++++++++++ arch/x86/um/signal_32.c | 475 ------------------------------ arch/x86/um/signal_64.c | 236 --------------- arch/x86/um/sys_call_table_32.S | 2 - arch/x86/um/sysrq_64.c | 2 +- 7 files changed, 627 insertions(+), 716 deletions(-) create mode 100644 arch/x86/um/signal.c delete mode 100644 arch/x86/um/signal_32.c delete mode 100644 arch/x86/um/signal_64.c (limited to 'arch/x86') diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index df41989..ffe1008 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -9,7 +9,7 @@ else endif obj-y = bug.o bugs_$(BITS).o delay_$(BITS).o fault.o ksyms.o ldt.o \ - ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal_$(BITS).o \ + ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \ stub_$(BITS).o stub_segv.o syscalls_$(BITS).o \ sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ mem_$(BITS).o subarch.o os-$(OS)/ diff --git a/arch/x86/um/asm/ptrace_64.h b/arch/x86/um/asm/ptrace_64.h index 83d8c47..706a0d8 100644 --- a/arch/x86/um/asm/ptrace_64.h +++ b/arch/x86/um/asm/ptrace_64.h @@ -40,7 +40,7 @@ #define PT_REGS_ORIG_RAX(r) UPT_ORIG_RAX(&(r)->regs) #define PT_REGS_RIP(r) UPT_IP(&(r)->regs) -#define PT_REGS_RSP(r) UPT_SP(&(r)->regs) +#define PT_REGS_SP(r) UPT_SP(&(r)->regs) #define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c new file mode 100644 index 0000000..4883b95 --- /dev/null +++ b/arch/x86/um/signal.c @@ -0,0 +1,624 @@ +/* + * Copyright (C) 2003 PathScale, Inc. + * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + + +#include +#include +#include +#include +#include +#include +#include "frame_kern.h" +#include "skas.h" + +#ifdef CONFIG_X86_32 + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) +{ + struct _fpxreg *st = NULL; + unsigned long twd = (unsigned long) fxsave->twd; + unsigned long tag; + unsigned long ret = 0xffff0000; + int i; + +#define FPREG_ADDR(f, n) ((char *)&(f)->st_space + (n) * 16) + + for (i = 0; i < 8; i++) { + if (twd & 0x1) { + st = (struct _fpxreg *) FPREG_ADDR(fxsave, i); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = 2; /* Special */ + break; + case 0x0000: + if ( !st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3] ) { + tag = 1; /* Zero */ + } else { + tag = 2; /* Special */ + } + break; + default: + if (st->significand[3] & 0x8000) { + tag = 0; /* Valid */ + } else { + tag = 2; /* Special */ + } + break; + } + } else { + tag = 3; /* Empty */ + } + ret |= (tag << (2 * i)); + twd = twd >> 1; + } + return ret; +} + +static int convert_fxsr_to_user(struct _fpstate __user *buf, + struct user_fxsr_struct *fxsave) +{ + unsigned long env[7]; + struct _fpreg __user *to; + struct _fpxreg *from; + int i; + + env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul; + env[1] = (unsigned long)fxsave->swd | 0xffff0000ul; + env[2] = twd_fxsr_to_i387(fxsave); + env[3] = fxsave->fip; + env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); + env[5] = fxsave->foo; + env[6] = fxsave->fos; + + if (__copy_to_user(buf, env, 7 * sizeof(unsigned long))) + return 1; + + to = &buf->_st[0]; + from = (struct _fpxreg *) &fxsave->st_space[0]; + for (i = 0; i < 8; i++, to++, from++) { + unsigned long __user *t = (unsigned long __user *)to; + unsigned long *f = (unsigned long *)from; + + if (__put_user(*f, t) || + __put_user(*(f + 1), t + 1) || + __put_user(from->exponent, &to->exponent)) + return 1; + } + return 0; +} + +static int convert_fxsr_from_user(struct user_fxsr_struct *fxsave, + struct _fpstate __user *buf) +{ + unsigned long env[7]; + struct _fpxreg *to; + struct _fpreg __user *from; + int i; + + if (copy_from_user( env, buf, 7 * sizeof(long))) + return 1; + + fxsave->cwd = (unsigned short)(env[0] & 0xffff); + fxsave->swd = (unsigned short)(env[1] & 0xffff); + fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); + fxsave->fip = env[3]; + fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16); + fxsave->fcs = (env[4] & 0xffff); + fxsave->foo = env[5]; + fxsave->fos = env[6]; + + to = (struct _fpxreg *) &fxsave->st_space[0]; + from = &buf->_st[0]; + for (i = 0; i < 8; i++, to++, from++) { + unsigned long *t = (unsigned long *)to; + unsigned long __user *f = (unsigned long __user *)from; + + if (__get_user(*t, f) || + __get_user(*(t + 1), f + 1) || + __get_user(to->exponent, &from->exponent)) + return 1; + } + return 0; +} + +extern int have_fpx_regs; + +#endif + +static int copy_sc_from_user(struct pt_regs *regs, + struct sigcontext __user *from) +{ + struct sigcontext sc; + int err, pid; + + err = copy_from_user(&sc, from, sizeof(sc)); + if (err) + return err; + +#define GETREG(regno, regname) regs->regs.gp[HOST_##regno] = sc.regname + +#ifdef CONFIG_X86_32 + GETREG(GS, gs); + GETREG(FS, fs); + GETREG(ES, es); + GETREG(DS, ds); +#endif + GETREG(DI, di); + GETREG(SI, si); + GETREG(BP, bp); + GETREG(SP, sp); + GETREG(BX, bx); + GETREG(DX, dx); + GETREG(CX, cx); + GETREG(AX, ax); + GETREG(IP, ip); + +#ifdef CONFIG_X86_64 + GETREG(R8, r8); + GETREG(R9, r9); + GETREG(R10, r10); + GETREG(R11, r11); + GETREG(R12, r12); + GETREG(R13, r13); + GETREG(R14, r14); + GETREG(R15, r15); +#endif + + GETREG(CS, cs); + GETREG(EFLAGS, flags); +#ifdef CONFIG_X86_32 + GETREG(SS, ss); +#endif + +#undef GETREG + + pid = userspace_pid[current_thread_info()->cpu]; +#ifdef CONFIG_X86_32 + if (have_fpx_regs) { + struct user_fxsr_struct fpx; + + err = copy_from_user(&fpx, + &((struct _fpstate __user *)sc.fpstate)->_fxsr_env[0], + sizeof(struct user_fxsr_struct)); + if (err) + return 1; + + err = convert_fxsr_from_user(&fpx, sc.fpstate); + if (err) + return 1; + + err = restore_fpx_registers(pid, (unsigned long *) &fpx); + if (err < 0) { + printk(KERN_ERR "copy_sc_from_user - " + "restore_fpx_registers failed, errno = %d\n", + -err); + return 1; + } + } else +#endif + { + struct user_i387_struct fp; + + err = copy_from_user(&fp, sc.fpstate, + sizeof(struct user_i387_struct)); + if (err) + return 1; + + err = restore_fp_registers(pid, (unsigned long *) &fp); + if (err < 0) { + printk(KERN_ERR "copy_sc_from_user - " + "restore_fp_registers failed, errno = %d\n", + -err); + return 1; + } + } + return 0; +} + +static int copy_sc_to_user(struct sigcontext __user *to, + struct _fpstate __user *to_fp, struct pt_regs *regs, + unsigned long mask) +{ + struct sigcontext sc; + struct faultinfo * fi = ¤t->thread.arch.faultinfo; + int err, pid; + memset(&sc, 0, sizeof(struct sigcontext)); + +#define PUTREG(regno, regname) sc.regname = regs->regs.gp[HOST_##regno] + +#ifdef CONFIG_X86_32 + PUTREG(GS, gs); + PUTREG(FS, fs); + PUTREG(ES, es); + PUTREG(DS, ds); +#endif + PUTREG(DI, di); + PUTREG(SI, si); + PUTREG(BP, bp); + PUTREG(SP, sp); + PUTREG(BX, bx); + PUTREG(DX, dx); + PUTREG(CX, cx); + PUTREG(AX, ax); +#ifdef CONFIG_X86_64 + PUTREG(R8, r8); + PUTREG(R9, r9); + PUTREG(R10, r10); + PUTREG(R11, r11); + PUTREG(R12, r12); + PUTREG(R13, r13); + PUTREG(R14, r14); + PUTREG(R15, r15); +#endif + + sc.cr2 = fi->cr2; + sc.err = fi->error_code; + sc.trapno = fi->trap_no; + PUTREG(IP, ip); + PUTREG(CS, cs); + PUTREG(EFLAGS, flags); +#ifdef CONFIG_X86_32 + PUTREG(SP, sp_at_signal); + PUTREG(SS, ss); +#endif +#undef PUTREG + sc.oldmask = mask; + sc.fpstate = to_fp; + + err = copy_to_user(to, &sc, sizeof(struct sigcontext)); + if (err) + return 1; + + pid = userspace_pid[current_thread_info()->cpu]; + +#ifdef CONFIG_X86_32 + if (have_fpx_regs) { + struct user_fxsr_struct fpx; + + err = save_fpx_registers(pid, (unsigned long *) &fpx); + if (err < 0){ + printk(KERN_ERR "copy_sc_to_user - save_fpx_registers " + "failed, errno = %d\n", err); + return 1; + } + + err = convert_fxsr_to_user(to_fp, &fpx); + if (err) + return 1; + + err |= __put_user(fpx.swd, &to_fp->status); + err |= __put_user(X86_FXSR_MAGIC, &to_fp->magic); + if (err) + return 1; + + if (copy_to_user(&to_fp->_fxsr_env[0], &fpx, + sizeof(struct user_fxsr_struct))) + return 1; + } else +#endif + { + struct user_i387_struct fp; + + err = save_fp_registers(pid, (unsigned long *) &fp); + if (copy_to_user(to_fp, &fp, sizeof(struct user_i387_struct))) + return 1; + } + + return 0; +} + +#ifdef CONFIG_X86_32 +static int copy_ucontext_to_user(struct ucontext __user *uc, + struct _fpstate __user *fp, sigset_t *set, + unsigned long sp) +{ + int err = 0; + + err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp); + err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags); + err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size); + err |= copy_sc_to_user(&uc->uc_mcontext, fp, ¤t->thread.regs, 0); + err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set)); + return err; +} + +struct sigframe +{ + char __user *pretcode; + int sig; + struct sigcontext sc; + struct _fpstate fpstate; + unsigned long extramask[_NSIG_WORDS-1]; + char retcode[8]; +}; + +struct rt_sigframe +{ + char __user *pretcode; + int sig; + struct siginfo __user *pinfo; + void __user *puc; + struct siginfo info; + struct ucontext uc; + struct _fpstate fpstate; + char retcode[8]; +}; + +int setup_signal_stack_sc(unsigned long stack_top, int sig, + struct k_sigaction *ka, struct pt_regs *regs, + sigset_t *mask) +{ + struct sigframe __user *frame; + void __user *restorer; + int err = 0; + + /* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */ + stack_top = ((stack_top + 4) & -16UL) - 4; + frame = (struct sigframe __user *) stack_top - 1; + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return 1; + + restorer = frame->retcode; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + + err |= __put_user(restorer, &frame->pretcode); + err |= __put_user(sig, &frame->sig); + err |= copy_sc_to_user(&frame->sc, &frame->fpstate, regs, mask->sig[0]); + if (_NSIG_WORDS > 1) + err |= __copy_to_user(&frame->extramask, &mask->sig[1], + sizeof(frame->extramask)); + + /* + * This is popl %eax ; movl $,%eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); + err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); + err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); + + if (err) + return err; + + PT_REGS_SP(regs) = (unsigned long) frame; + PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_EAX(regs) = (unsigned long) sig; + PT_REGS_EDX(regs) = (unsigned long) 0; + PT_REGS_ECX(regs) = (unsigned long) 0; + + if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) + ptrace_notify(SIGTRAP); + return 0; +} + +int setup_signal_stack_si(unsigned long stack_top, int sig, + struct k_sigaction *ka, struct pt_regs *regs, + siginfo_t *info, sigset_t *mask) +{ + struct rt_sigframe __user *frame; + void __user *restorer; + int err = 0; + + stack_top &= -8UL; + frame = (struct rt_sigframe __user *) stack_top - 1; + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return 1; + + restorer = frame->retcode; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + + err |= __put_user(restorer, &frame->pretcode); + err |= __put_user(sig, &frame->sig); + err |= __put_user(&frame->info, &frame->pinfo); + err |= __put_user(&frame->uc, &frame->puc); + err |= copy_siginfo_to_user(&frame->info, info); + err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask, + PT_REGS_SP(regs)); + + /* + * This is movl $,%eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); + err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); + err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); + + if (err) + return err; + + PT_REGS_SP(regs) = (unsigned long) frame; + PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_EAX(regs) = (unsigned long) sig; + PT_REGS_EDX(regs) = (unsigned long) &frame->info; + PT_REGS_ECX(regs) = (unsigned long) &frame->uc; + + if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) + ptrace_notify(SIGTRAP); + return 0; +} + +long sys_sigreturn(struct pt_regs *regs) +{ + unsigned long sp = PT_REGS_SP(¤t->thread.regs); + struct sigframe __user *frame = (struct sigframe __user *)(sp - 8); + sigset_t set; + struct sigcontext __user *sc = &frame->sc; + unsigned long __user *oldmask = &sc->oldmask; + unsigned long __user *extramask = frame->extramask; + int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); + + if (copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) || + copy_from_user(&set.sig[1], extramask, sig_size)) + goto segfault; + + sigdelsetmask(&set, ~_BLOCKABLE); + set_current_blocked(&set); + + if (copy_sc_from_user(¤t->thread.regs, sc)) + goto segfault; + + /* Avoid ERESTART handling */ + PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; + return PT_REGS_SYSCALL_RET(¤t->thread.regs); + + segfault: + force_sig(SIGSEGV, current); + return 0; +} + +#else + +struct rt_sigframe +{ + char __user *pretcode; + struct ucontext uc; + struct siginfo info; + struct _fpstate fpstate; +}; + +int setup_signal_stack_si(unsigned long stack_top, int sig, + struct k_sigaction *ka, struct pt_regs * regs, + siginfo_t *info, sigset_t *set) +{ + struct rt_sigframe __user *frame; + int err = 0; + struct task_struct *me = current; + + frame = (struct rt_sigframe __user *) + round_down(stack_top - sizeof(struct rt_sigframe), 16); + /* Subtract 128 for a red zone and 8 for proper alignment */ + frame = (struct rt_sigframe __user *) ((unsigned long) frame - 128 - 8); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + goto out; + + if (ka->sa.sa_flags & SA_SIGINFO) { + err |= copy_siginfo_to_user(&frame->info, info); + if (err) + goto out; + } + + /* Create the ucontext. */ + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(PT_REGS_SP(regs)), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, + set->sig[0]); + err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); + if (sizeof(*set) == 16) { + __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); + __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); + } + else + err |= __copy_to_user(&frame->uc.uc_sigmask, set, + sizeof(*set)); + + /* + * Set up to return from userspace. If provided, use a stub + * already in userspace. + */ + /* x86-64 should always use SA_RESTORER. */ + if (ka->sa.sa_flags & SA_RESTORER) + err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); + else + /* could use a vstub here */ + return err; + + if (err) + return err; + + /* Set up registers for signal handler */ + { + struct exec_domain *ed = current_thread_info()->exec_domain; + if (unlikely(ed && ed->signal_invmap && sig < 32)) + sig = ed->signal_invmap[sig]; + } + + PT_REGS_SP(regs) = (unsigned long) frame; + PT_REGS_RDI(regs) = sig; + /* In case the signal handler was declared without prototypes */ + PT_REGS_RAX(regs) = 0; + + /* + * This also works for non SA_SIGINFO handlers because they expect the + * next argument after the signal number on the stack. + */ + PT_REGS_RSI(regs) = (unsigned long) &frame->info; + PT_REGS_RDX(regs) = (unsigned long) &frame->uc; + PT_REGS_RIP(regs) = (unsigned long) ka->sa.sa_handler; + out: + return err; +} +#endif + +long sys_rt_sigreturn(struct pt_regs *regs) +{ + unsigned long sp = PT_REGS_SP(¤t->thread.regs); + struct rt_sigframe __user *frame = + (struct rt_sigframe __user *)(sp - sizeof(long)); + struct ucontext __user *uc = &frame->uc; + sigset_t set; + + if (copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) + goto segfault; + + sigdelsetmask(&set, ~_BLOCKABLE); + set_current_blocked(&set); + + if (copy_sc_from_user(¤t->thread.regs, &uc->uc_mcontext)) + goto segfault; + + /* Avoid ERESTART handling */ + PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; + return PT_REGS_SYSCALL_RET(¤t->thread.regs); + + segfault: + force_sig(SIGSEGV, current); + return 0; +} + +#ifdef CONFIG_X86_32 +long ptregs_sigreturn(void) +{ + return sys_sigreturn(NULL); +} +long ptregs_rt_sigreturn(void) +{ + return sys_rt_sigreturn(NULL); +} +#endif diff --git a/arch/x86/um/signal_32.c b/arch/x86/um/signal_32.c deleted file mode 100644 index 7a206d8..0000000 --- a/arch/x86/um/signal_32.c +++ /dev/null @@ -1,475 +0,0 @@ -/* - * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include -#include -#include -#include -#include "frame_kern.h" -#include "skas.h" - -/* - * FPU tag word conversions. - */ - -static inline unsigned short twd_i387_to_fxsr(unsigned short twd) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. */ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - return tmp; -} - -static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) -{ - struct _fpxreg *st = NULL; - unsigned long twd = (unsigned long) fxsave->twd; - unsigned long tag; - unsigned long ret = 0xffff0000; - int i; - -#define FPREG_ADDR(f, n) ((char *)&(f)->st_space + (n) * 16) - - for (i = 0; i < 8; i++) { - if (twd & 0x1) { - st = (struct _fpxreg *) FPREG_ADDR(fxsave, i); - - switch (st->exponent & 0x7fff) { - case 0x7fff: - tag = 2; /* Special */ - break; - case 0x0000: - if ( !st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3] ) { - tag = 1; /* Zero */ - } else { - tag = 2; /* Special */ - } - break; - default: - if (st->significand[3] & 0x8000) { - tag = 0; /* Valid */ - } else { - tag = 2; /* Special */ - } - break; - } - } else { - tag = 3; /* Empty */ - } - ret |= (tag << (2 * i)); - twd = twd >> 1; - } - return ret; -} - -static int convert_fxsr_to_user(struct _fpstate __user *buf, - struct user_fxsr_struct *fxsave) -{ - unsigned long env[7]; - struct _fpreg __user *to; - struct _fpxreg *from; - int i; - - env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul; - env[1] = (unsigned long)fxsave->swd | 0xffff0000ul; - env[2] = twd_fxsr_to_i387(fxsave); - env[3] = fxsave->fip; - env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); - env[5] = fxsave->foo; - env[6] = fxsave->fos; - - if (__copy_to_user(buf, env, 7 * sizeof(unsigned long))) - return 1; - - to = &buf->_st[0]; - from = (struct _fpxreg *) &fxsave->st_space[0]; - for (i = 0; i < 8; i++, to++, from++) { - unsigned long __user *t = (unsigned long __user *)to; - unsigned long *f = (unsigned long *)from; - - if (__put_user(*f, t) || - __put_user(*(f + 1), t + 1) || - __put_user(from->exponent, &to->exponent)) - return 1; - } - return 0; -} - -static int convert_fxsr_from_user(struct user_fxsr_struct *fxsave, - struct _fpstate __user *buf) -{ - unsigned long env[7]; - struct _fpxreg *to; - struct _fpreg __user *from; - int i; - - if (copy_from_user( env, buf, 7 * sizeof(long))) - return 1; - - fxsave->cwd = (unsigned short)(env[0] & 0xffff); - fxsave->swd = (unsigned short)(env[1] & 0xffff); - fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); - fxsave->fip = env[3]; - fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16); - fxsave->fcs = (env[4] & 0xffff); - fxsave->foo = env[5]; - fxsave->fos = env[6]; - - to = (struct _fpxreg *) &fxsave->st_space[0]; - from = &buf->_st[0]; - for (i = 0; i < 8; i++, to++, from++) { - unsigned long *t = (unsigned long *)to; - unsigned long __user *f = (unsigned long __user *)from; - - if (__get_user(*t, f) || - __get_user(*(t + 1), f + 1) || - __get_user(to->exponent, &from->exponent)) - return 1; - } - return 0; -} - -extern int have_fpx_regs; - -static int copy_sc_from_user(struct pt_regs *regs, - struct sigcontext __user *from) -{ - struct sigcontext sc; - int err, pid; - - err = copy_from_user(&sc, from, sizeof(sc)); - if (err) - return err; - - pid = userspace_pid[current_thread_info()->cpu]; - -#define GETREG(regno, regname) regs->regs.gp[HOST_##regno] = sc.regname - - GETREG(GS, gs); - GETREG(FS, fs); - GETREG(ES, es); - GETREG(DS, ds); - GETREG(DI, di); - GETREG(SI, si); - GETREG(BP, bp); - GETREG(SP, sp); - GETREG(BX, bx); - GETREG(DX, dx); - GETREG(CX, cx); - GETREG(AX, ax); - GETREG(IP, ip); - GETREG(CS, cs); - GETREG(EFLAGS, flags); - GETREG(SS, ss); - -#undef GETREG - if (have_fpx_regs) { - struct user_fxsr_struct fpx; - - err = copy_from_user(&fpx, - &((struct _fpstate __user *)sc.fpstate)->_fxsr_env[0], - sizeof(struct user_fxsr_struct)); - if (err) - return 1; - - err = convert_fxsr_from_user(&fpx, sc.fpstate); - if (err) - return 1; - - err = restore_fpx_registers(pid, (unsigned long *) &fpx); - if (err < 0) { - printk(KERN_ERR "copy_sc_from_user - " - "restore_fpx_registers failed, errno = %d\n", - -err); - return 1; - } - } else { - struct user_i387_struct fp; - - err = copy_from_user(&fp, sc.fpstate, - sizeof(struct user_i387_struct)); - if (err) - return 1; - - err = restore_fp_registers(pid, (unsigned long *) &fp); - if (err < 0) { - printk(KERN_ERR "copy_sc_from_user - " - "restore_fp_registers failed, errno = %d\n", - -err); - return 1; - } - } - - return 0; -} - -static int copy_sc_to_user(struct sigcontext __user *to, - struct _fpstate __user *to_fp, struct pt_regs *regs) -{ - struct sigcontext sc; - struct faultinfo * fi = ¤t->thread.arch.faultinfo; - int err, pid; - memset(&sc, 0, sizeof(struct sigcontext)); - - sc.gs = REGS_GS(regs->regs.gp); - sc.fs = REGS_FS(regs->regs.gp); - sc.es = REGS_ES(regs->regs.gp); - sc.ds = REGS_DS(regs->regs.gp); - sc.di = REGS_EDI(regs->regs.gp); - sc.si = REGS_ESI(regs->regs.gp); - sc.bp = REGS_EBP(regs->regs.gp); - sc.sp = REGS_SP(regs->regs.gp); - sc.bx = REGS_EBX(regs->regs.gp); - sc.dx = REGS_EDX(regs->regs.gp); - sc.cx = REGS_ECX(regs->regs.gp); - sc.ax = REGS_EAX(regs->regs.gp); - sc.ip = REGS_IP(regs->regs.gp); - sc.cs = REGS_CS(regs->regs.gp); - sc.flags = REGS_EFLAGS(regs->regs.gp); - sc.sp_at_signal = regs->regs.gp[UESP]; - sc.ss = regs->regs.gp[SS]; - sc.cr2 = fi->cr2; - sc.err = fi->error_code; - sc.trapno = fi->trap_no; - - to_fp = (to_fp ? to_fp : (struct _fpstate __user *) (to + 1)); - sc.fpstate = to_fp; - - pid = userspace_pid[current_thread_info()->cpu]; - if (have_fpx_regs) { - struct user_fxsr_struct fpx; - - err = save_fpx_registers(pid, (unsigned long *) &fpx); - if (err < 0){ - printk(KERN_ERR "copy_sc_to_user - save_fpx_registers " - "failed, errno = %d\n", err); - return 1; - } - - err = convert_fxsr_to_user(to_fp, &fpx); - if (err) - return 1; - - err |= __put_user(fpx.swd, &to_fp->status); - err |= __put_user(X86_FXSR_MAGIC, &to_fp->magic); - if (err) - return 1; - - if (copy_to_user(&to_fp->_fxsr_env[0], &fpx, - sizeof(struct user_fxsr_struct))) - return 1; - } - else { - struct user_i387_struct fp; - - err = save_fp_registers(pid, (unsigned long *) &fp); - if (copy_to_user(to_fp, &fp, sizeof(struct user_i387_struct))) - return 1; - } - - return copy_to_user(to, &sc, sizeof(sc)); -} - -static int copy_ucontext_to_user(struct ucontext __user *uc, - struct _fpstate __user *fp, sigset_t *set, - unsigned long sp) -{ - int err = 0; - - err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp); - err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags); - err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size); - err |= copy_sc_to_user(&uc->uc_mcontext, fp, ¤t->thread.regs); - err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set)); - return err; -} - -struct sigframe -{ - char __user *pretcode; - int sig; - struct sigcontext sc; - struct _fpstate fpstate; - unsigned long extramask[_NSIG_WORDS-1]; - char retcode[8]; -}; - -struct rt_sigframe -{ - char __user *pretcode; - int sig; - struct siginfo __user *pinfo; - void __user *puc; - struct siginfo info; - struct ucontext uc; - struct _fpstate fpstate; - char retcode[8]; -}; - -int setup_signal_stack_sc(unsigned long stack_top, int sig, - struct k_sigaction *ka, struct pt_regs *regs, - sigset_t *mask) -{ - struct sigframe __user *frame; - void __user *restorer; - int err = 0; - - /* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */ - stack_top = ((stack_top + 4) & -16UL) - 4; - frame = (struct sigframe __user *) stack_top - 1; - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - return 1; - - restorer = frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - - err |= __put_user(restorer, &frame->pretcode); - err |= __put_user(sig, &frame->sig); - err |= copy_sc_to_user(&frame->sc, NULL, regs); - err |= __put_user(mask->sig[0], &frame->sc.oldmask); - if (_NSIG_WORDS > 1) - err |= __copy_to_user(&frame->extramask, &mask->sig[1], - sizeof(frame->extramask)); - - /* - * This is popl %eax ; movl $,%eax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); - err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); - err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); - - if (err) - return err; - - PT_REGS_SP(regs) = (unsigned long) frame; - PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; - PT_REGS_EAX(regs) = (unsigned long) sig; - PT_REGS_EDX(regs) = (unsigned long) 0; - PT_REGS_ECX(regs) = (unsigned long) 0; - - if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) - ptrace_notify(SIGTRAP); - return 0; -} - -int setup_signal_stack_si(unsigned long stack_top, int sig, - struct k_sigaction *ka, struct pt_regs *regs, - siginfo_t *info, sigset_t *mask) -{ - struct rt_sigframe __user *frame; - void __user *restorer; - int err = 0; - - stack_top &= -8UL; - frame = (struct rt_sigframe __user *) stack_top - 1; - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - return 1; - - restorer = frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - - err |= __put_user(restorer, &frame->pretcode); - err |= __put_user(sig, &frame->sig); - err |= __put_user(&frame->info, &frame->pinfo); - err |= __put_user(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, info); - err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask, - PT_REGS_SP(regs)); - - /* - * This is movl $,%eax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); - err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); - err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); - - if (err) - return err; - - PT_REGS_SP(regs) = (unsigned long) frame; - PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; - PT_REGS_EAX(regs) = (unsigned long) sig; - PT_REGS_EDX(regs) = (unsigned long) &frame->info; - PT_REGS_ECX(regs) = (unsigned long) &frame->uc; - - if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) - ptrace_notify(SIGTRAP); - return 0; -} - -long sys_sigreturn(struct pt_regs regs) -{ - unsigned long sp = PT_REGS_SP(¤t->thread.regs); - struct sigframe __user *frame = (struct sigframe __user *)(sp - 8); - sigset_t set; - struct sigcontext __user *sc = &frame->sc; - unsigned long __user *oldmask = &sc->oldmask; - unsigned long __user *extramask = frame->extramask; - int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); - - if (copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) || - copy_from_user(&set.sig[1], extramask, sig_size)) - goto segfault; - - sigdelsetmask(&set, ~_BLOCKABLE); - set_current_blocked(&set); - - if (copy_sc_from_user(¤t->thread.regs, sc)) - goto segfault; - - /* Avoid ERESTART handling */ - PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; - return PT_REGS_SYSCALL_RET(¤t->thread.regs); - - segfault: - force_sig(SIGSEGV, current); - return 0; -} - -long sys_rt_sigreturn(struct pt_regs regs) -{ - unsigned long sp = PT_REGS_SP(¤t->thread.regs); - struct rt_sigframe __user *frame = - (struct rt_sigframe __user *) (sp - 4); - sigset_t set; - struct ucontext __user *uc = &frame->uc; - int sig_size = _NSIG_WORDS * sizeof(unsigned long); - - if (copy_from_user(&set, &uc->uc_sigmask, sig_size)) - goto segfault; - - sigdelsetmask(&set, ~_BLOCKABLE); - set_current_blocked(&set); - - if (copy_sc_from_user(¤t->thread.regs, &uc->uc_mcontext)) - goto segfault; - - /* Avoid ERESTART handling */ - PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; - return PT_REGS_SYSCALL_RET(¤t->thread.regs); - - segfault: - force_sig(SIGSEGV, current); - return 0; -} diff --git a/arch/x86/um/signal_64.c b/arch/x86/um/signal_64.c deleted file mode 100644 index 74c2598..0000000 --- a/arch/x86/um/signal_64.c +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (C) 2003 PathScale, Inc. - * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include -#include -#include -#include -#include -#include -#include "frame_kern.h" -#include "skas.h" - -static int copy_sc_from_user(struct pt_regs *regs, - struct sigcontext __user *from) -{ - struct sigcontext sc; - struct user_i387_struct fp; - void __user *buf; - int err; - - err = copy_from_user(&sc, from, sizeof(sc)); - if (err) - return err; - -#define GETREG(regno, regname) regs->regs.gp[HOST_##regno] = sc.regname - - GETREG(R8, r8); - GETREG(R9, r9); - GETREG(R10, r10); - GETREG(R11, r11); - GETREG(R12, r12); - GETREG(R13, r13); - GETREG(R14, r14); - GETREG(R15, r15); - GETREG(DI, di); - GETREG(SI, si); - GETREG(BP, bp); - GETREG(BX, bx); - GETREG(DX, dx); - GETREG(AX, ax); - GETREG(CX, cx); - GETREG(SP, sp); - GETREG(IP, ip); - GETREG(EFLAGS, flags); - GETREG(CS, cs); -#undef GETREG - - buf = sc.fpstate; - - err = copy_from_user(&fp, buf, sizeof(struct user_i387_struct)); - if (err) - return 1; - - err = restore_fp_registers(userspace_pid[current_thread_info()->cpu], - (unsigned long *) &fp); - if (err < 0) { - printk(KERN_ERR "copy_sc_from_user - " - "restore_fp_registers failed, errno = %d\n", - -err); - return 1; - } - - return 0; -} - -static int copy_sc_to_user(struct sigcontext __user *to, - struct _fpstate __user *to_fp, struct pt_regs *regs, - unsigned long mask) -{ - struct faultinfo * fi = ¤t->thread.arch.faultinfo; - struct sigcontext sc; - struct user_i387_struct fp; - int err = 0; - memset(&sc, 0, sizeof(struct sigcontext)); - -#define PUTREG(regno, regname) sc.regname = regs->regs.gp[HOST_##regno] - - PUTREG(DI, di); - PUTREG(SI, si); - PUTREG(BP, bp); - PUTREG(SP, sp); - PUTREG(BX, bx); - PUTREG(DX, dx); - PUTREG(CX, cx); - PUTREG(AX, ax); - PUTREG(R8, r8); - PUTREG(R9, r9); - PUTREG(R10, r10); - PUTREG(R11, r11); - PUTREG(R12, r12); - PUTREG(R13, r13); - PUTREG(R14, r14); - PUTREG(R15, r15); - PUTREG(CS, cs); /* XXX x86_64 doesn't do this */ - - sc.cr2 = fi->cr2; - sc.err = fi->error_code; - sc.trapno = fi->trap_no; - - PUTREG(IP, ip); - PUTREG(EFLAGS, flags); -#undef PUTREG - - sc.oldmask = mask; - - err = copy_to_user(to, &sc, sizeof(struct sigcontext)); - if (err) - return 1; - - err = save_fp_registers(userspace_pid[current_thread_info()->cpu], - (unsigned long *) &fp); - if (err < 0) { - printk(KERN_ERR "copy_sc_from_user - restore_fp_registers " - "failed, errno = %d\n", -err); - return 1; - } - - if (copy_to_user(to_fp, &fp, sizeof(struct user_i387_struct))) - return 1; - - return err; -} - -struct rt_sigframe -{ - char __user *pretcode; - struct ucontext uc; - struct siginfo info; - struct _fpstate fpstate; -}; - -int setup_signal_stack_si(unsigned long stack_top, int sig, - struct k_sigaction *ka, struct pt_regs * regs, - siginfo_t *info, sigset_t *set) -{ - struct rt_sigframe __user *frame; - int err = 0; - struct task_struct *me = current; - - frame = (struct rt_sigframe __user *) - round_down(stack_top - sizeof(struct rt_sigframe), 16); - /* Subtract 128 for a red zone and 8 for proper alignment */ - frame = (struct rt_sigframe __user *) ((unsigned long) frame - 128 - 8); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto out; - - if (ka->sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, info); - if (err) - goto out; - } - - /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(PT_REGS_RSP(regs)), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, - set->sig[0]); - err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); - if (sizeof(*set) == 16) { - __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); - } - else - err |= __copy_to_user(&frame->uc.uc_sigmask, set, - sizeof(*set)); - - /* - * Set up to return from userspace. If provided, use a stub - * already in userspace. - */ - /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); - else - /* could use a vstub here */ - return err; - - if (err) - return err; - - /* Set up registers for signal handler */ - { - struct exec_domain *ed = current_thread_info()->exec_domain; - if (unlikely(ed && ed->signal_invmap && sig < 32)) - sig = ed->signal_invmap[sig]; - } - - PT_REGS_RSP(regs) = (unsigned long) frame; - PT_REGS_RDI(regs) = sig; - /* In case the signal handler was declared without prototypes */ - PT_REGS_RAX(regs) = 0; - - /* - * This also works for non SA_SIGINFO handlers because they expect the - * next argument after the signal number on the stack. - */ - PT_REGS_RSI(regs) = (unsigned long) &frame->info; - PT_REGS_RDX(regs) = (unsigned long) &frame->uc; - PT_REGS_RIP(regs) = (unsigned long) ka->sa.sa_handler; - out: - return err; -} - -long sys_rt_sigreturn(struct pt_regs *regs) -{ - unsigned long sp = PT_REGS_SP(¤t->thread.regs); - struct rt_sigframe __user *frame = - (struct rt_sigframe __user *)(sp - 8); - struct ucontext __user *uc = &frame->uc; - sigset_t set; - - if (copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) - goto segfault; - - sigdelsetmask(&set, ~_BLOCKABLE); - set_current_blocked(&set); - - if (copy_sc_from_user(¤t->thread.regs, &uc->uc_mcontext)) - goto segfault; - - /* Avoid ERESTART handling */ - PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; - return PT_REGS_SYSCALL_RET(¤t->thread.regs); - - segfault: - force_sig(SIGSEGV, current); - return 0; -} diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S index c3431cf..a7ca80d 100644 --- a/arch/x86/um/sys_call_table_32.S +++ b/arch/x86/um/sys_call_table_32.S @@ -13,10 +13,8 @@ #define ptregs_execve sys_execve #define ptregs_iopl sys_iopl #define ptregs_vm86old sys_vm86old -#define ptregs_sigreturn sys_sigreturn #define ptregs_clone sys_clone #define ptregs_vm86 sys_vm86 -#define ptregs_rt_sigreturn sys_rt_sigreturn #define ptregs_sigaltstack sys_sigaltstack #define ptregs_vfork sys_vfork diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c index f4f82be..e891343 100644 --- a/arch/x86/um/sysrq_64.c +++ b/arch/x86/um/sysrq_64.c @@ -20,7 +20,7 @@ void __show_regs(struct pt_regs *regs) current->comm, print_tainted(), init_utsname()->release); printk(KERN_INFO "RIP: %04lx:[<%016lx>]\n", PT_REGS_CS(regs) & 0xffff, PT_REGS_RIP(regs)); - printk(KERN_INFO "RSP: %016lx EFLAGS: %08lx\n", PT_REGS_RSP(regs), + printk(KERN_INFO "RSP: %016lx EFLAGS: %08lx\n", PT_REGS_SP(regs), PT_REGS_EFLAGS(regs)); printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", PT_REGS_RAX(regs), PT_REGS_RBX(regs), PT_REGS_RCX(regs)); -- cgit v1.1 From b3ee571e58120de30c3d15657022bf2c72477e02 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:10:49 +0100 Subject: um: take ldt.h to arch/x86/um/asm/mm_context.h it's x86-only and we have no business playing with it in asm/mmu.h; make the latter have struct uml_arch_mm_context arch; instead of struct uml_ldt ldt; and let arch//um/asm/mm_context.h decide what'll be in there. While we are at it, kill host_ldt.h - it's not needed in part of places that include it (we want asm/ldt.h in those) and it can be trivially expanded into the single remaining one. Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/asm/mm_context.h | 72 ++++++++++++++++++++++++++++++++++++ arch/x86/um/asm/processor_32.h | 2 +- arch/x86/um/ldt.c | 36 +++++++++--------- arch/x86/um/shared/sysdep/host_ldt.h | 37 ------------------ arch/x86/um/shared/sysdep/tls.h | 1 - 5 files changed, 91 insertions(+), 57 deletions(-) create mode 100644 arch/x86/um/asm/mm_context.h delete mode 100644 arch/x86/um/shared/sysdep/host_ldt.h (limited to 'arch/x86') diff --git a/arch/x86/um/asm/mm_context.h b/arch/x86/um/asm/mm_context.h new file mode 100644 index 0000000..4a73d63 --- /dev/null +++ b/arch/x86/um/asm/mm_context.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2004 Fujitsu Siemens Computers GmbH + * Licensed under the GPL + * + * Author: Bodo Stroesser + */ + +#ifndef __ASM_LDT_H +#define __ASM_LDT_H + +#include +#include + +extern void ldt_host_info(void); + +#define LDT_PAGES_MAX \ + ((LDT_ENTRIES * LDT_ENTRY_SIZE)/PAGE_SIZE) +#define LDT_ENTRIES_PER_PAGE \ + (PAGE_SIZE/LDT_ENTRY_SIZE) +#define LDT_DIRECT_ENTRIES \ + ((LDT_PAGES_MAX*sizeof(void *))/LDT_ENTRY_SIZE) + +struct ldt_entry { + __u32 a; + __u32 b; +}; + +typedef struct uml_ldt { + int entry_count; + struct mutex lock; + union { + struct ldt_entry * pages[LDT_PAGES_MAX]; + struct ldt_entry entries[LDT_DIRECT_ENTRIES]; + } u; +} uml_ldt_t; + +#define LDT_entry_a(info) \ + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) + +#define LDT_entry_b(info) \ + (((info)->base_addr & 0xff000000) | \ + (((info)->base_addr & 0x00ff0000) >> 16) | \ + ((info)->limit & 0xf0000) | \ + (((info)->read_exec_only ^ 1) << 9) | \ + ((info)->contents << 10) | \ + (((info)->seg_not_present ^ 1) << 15) | \ + ((info)->seg_32bit << 22) | \ + ((info)->limit_in_pages << 23) | \ + ((info)->useable << 20) | \ + 0x7000) + +#define _LDT_empty(info) (\ + (info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0 ) + +#ifdef CONFIG_X86_64 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) +#else +#define LDT_empty(info) (_LDT_empty(info)) +#endif + +struct uml_arch_mm_context { + uml_ldt_t ldt; +}; + +#endif diff --git a/arch/x86/um/asm/processor_32.h b/arch/x86/um/asm/processor_32.h index e5b72fa..c4078b5 100644 --- a/arch/x86/um/asm/processor_32.h +++ b/arch/x86/um/asm/processor_32.h @@ -7,8 +7,8 @@ #define __UM_PROCESSOR_I386_H #include -#include #include +#include extern int host_has_cmov; diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 3f2bf20..26b0e39 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -137,7 +137,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) { int i, err = 0; unsigned long size; - uml_ldt_t * ldt = ¤t->mm->context.ldt; + uml_ldt_t *ldt = ¤t->mm->context.arch.ldt; if (!ldt->entry_count) goto out; @@ -205,7 +205,7 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount) static int write_ldt(void __user * ptr, unsigned long bytecount, int func) { - uml_ldt_t * ldt = ¤t->mm->context.ldt; + uml_ldt_t *ldt = ¤t->mm->context.arch.ldt; struct mm_id * mm_idp = ¤t->mm->context.id; int i, err; struct user_desc ldt_info; @@ -397,7 +397,7 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm) if (!ptrace_ldt) - mutex_init(&new_mm->ldt.lock); + mutex_init(&new_mm->arch.ldt.lock); if (!from_mm) { memset(&desc, 0, sizeof(desc)); @@ -429,7 +429,7 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm) break; } } - new_mm->ldt.entry_count = 0; + new_mm->arch.ldt.entry_count = 0; goto out; } @@ -457,26 +457,26 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm) * i.e., we have to use the stub for modify_ldt, which * can't handle the big read buffer of up to 64kB. */ - mutex_lock(&from_mm->ldt.lock); - if (from_mm->ldt.entry_count <= LDT_DIRECT_ENTRIES) - memcpy(new_mm->ldt.u.entries, from_mm->ldt.u.entries, - sizeof(new_mm->ldt.u.entries)); + mutex_lock(&from_mm->arch.ldt.lock); + if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES) + memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries, + sizeof(new_mm->arch.ldt.u.entries)); else { - i = from_mm->ldt.entry_count / LDT_ENTRIES_PER_PAGE; + i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; while (i-->0) { page = __get_free_page(GFP_KERNEL|__GFP_ZERO); if (!page) { err = -ENOMEM; break; } - new_mm->ldt.u.pages[i] = + new_mm->arch.ldt.u.pages[i] = (struct ldt_entry *) page; - memcpy(new_mm->ldt.u.pages[i], - from_mm->ldt.u.pages[i], PAGE_SIZE); + memcpy(new_mm->arch.ldt.u.pages[i], + from_mm->arch.ldt.u.pages[i], PAGE_SIZE); } } - new_mm->ldt.entry_count = from_mm->ldt.entry_count; - mutex_unlock(&from_mm->ldt.lock); + new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count; + mutex_unlock(&from_mm->arch.ldt.lock); } out: @@ -488,12 +488,12 @@ void free_ldt(struct mm_context *mm) { int i; - if (!ptrace_ldt && mm->ldt.entry_count > LDT_DIRECT_ENTRIES) { - i = mm->ldt.entry_count / LDT_ENTRIES_PER_PAGE; + if (!ptrace_ldt && mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) { + i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; while (i-- > 0) - free_page((long) mm->ldt.u.pages[i]); + free_page((long) mm->arch.ldt.u.pages[i]); } - mm->ldt.entry_count = 0; + mm->arch.ldt.entry_count = 0; } int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) diff --git a/arch/x86/um/shared/sysdep/host_ldt.h b/arch/x86/um/shared/sysdep/host_ldt.h deleted file mode 100644 index 246ff7a..0000000 --- a/arch/x86/um/shared/sysdep/host_ldt.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef __ASM_HOST_LDT_H -#define __ASM_HOST_LDT_H - -#include - -#define LDT_entry_a(info) \ - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) - -#define LDT_entry_b(info) \ - (((info)->base_addr & 0xff000000) | \ - (((info)->base_addr & 0x00ff0000) >> 16) | \ - ((info)->limit & 0xf0000) | \ - (((info)->read_exec_only ^ 1) << 9) | \ - ((info)->contents << 10) | \ - (((info)->seg_not_present ^ 1) << 15) | \ - ((info)->seg_32bit << 22) | \ - ((info)->limit_in_pages << 23) | \ - ((info)->useable << 20) | \ - 0x7000) - -#define _LDT_empty(info) (\ - (info)->base_addr == 0 && \ - (info)->limit == 0 && \ - (info)->contents == 0 && \ - (info)->read_exec_only == 1 && \ - (info)->seg_32bit == 0 && \ - (info)->limit_in_pages == 0 && \ - (info)->seg_not_present == 1 && \ - (info)->useable == 0 ) - -#ifdef CONFIG_X86_64 -#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) -#else -#define LDT_empty(info) (_LDT_empty(info)) -#endif - -#endif diff --git a/arch/x86/um/shared/sysdep/tls.h b/arch/x86/um/shared/sysdep/tls.h index f2f30bd..27cce00 100644 --- a/arch/x86/um/shared/sysdep/tls.h +++ b/arch/x86/um/shared/sysdep/tls.h @@ -24,7 +24,6 @@ typedef struct um_dup_user_desc { # else /* __KERNEL__ */ -# include typedef struct user_desc user_desc_t; # endif /* __KERNEL__ */ -- cgit v1.1 From 8807c1d56121269a27ad973c6adda18cc4c6a099 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:11:09 +0100 Subject: um: asm/apic.h is there only to shadow the x86 one... ... so take it to arch/um/x86/asm. Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/asm/apic.h | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 arch/x86/um/asm/apic.h (limited to 'arch/x86') diff --git a/arch/x86/um/asm/apic.h b/arch/x86/um/asm/apic.h new file mode 100644 index 0000000..876dee8 --- /dev/null +++ b/arch/x86/um/asm/apic.h @@ -0,0 +1,4 @@ +#ifndef __UM_APIC_H +#define __UM_APIC_H + +#endif -- cgit v1.1 From ff9586e98feaf6c2df0c936075e3cbb31045b99e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:11:19 +0100 Subject: um: required-features.h is there only to shadow x86 one... Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/asm/required-features.h | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 arch/x86/um/asm/required-features.h (limited to 'arch/x86') diff --git a/arch/x86/um/asm/required-features.h b/arch/x86/um/asm/required-features.h new file mode 100644 index 0000000..dfb967b --- /dev/null +++ b/arch/x86/um/asm/required-features.h @@ -0,0 +1,9 @@ +#ifndef __UM_REQUIRED_FEATURES_H +#define __UM_REQUIRED_FEATURES_H + +/* + * Nothing to see, just need something for the i386 and x86_64 asm + * headers to include. + */ + +#endif -- cgit v1.1 From 3fb77d7256f442e8e15d51f111bb6cf096d4a9f8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:11:29 +0100 Subject: um: irq_vectors.h just shadows x86 one Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/asm/irq_vectors.h | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 arch/x86/um/asm/irq_vectors.h (limited to 'arch/x86') diff --git a/arch/x86/um/asm/irq_vectors.h b/arch/x86/um/asm/irq_vectors.h new file mode 100644 index 0000000..272a81e --- /dev/null +++ b/arch/x86/um/asm/irq_vectors.h @@ -0,0 +1,10 @@ +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#ifndef __UM_IRQ_VECTORS_H +#define __UM_IRQ_VECTORS_H + +#endif + -- cgit v1.1 From 4d211093e838ddd049b5cf4f0773aa5ac67f9976 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:11:49 +0100 Subject: um: fix gcov build breakage a) exports in gmon_syms.c duplicate kernel/gcov/* ones b) excluding -pg in vdso compile is not enough - -fprofile-arcs and -ftest-coverage also needs to be excluded Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/vdso/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 5dffe6d..6c803ca 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -46,8 +46,8 @@ $(vobjs): KBUILD_CFLAGS += $(CFL) # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. # -CFLAGS_REMOVE_vdso-note.o = -pg -CFLAGS_REMOVE_um_vdso.o = -pg +CFLAGS_REMOVE_vdso-note.o = -pg -fprofile-arcs -ftest-coverage +CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage targets += vdso-syms.lds obj-$(VDSO64-y) += vdso-syms.lds -- cgit v1.1 From a10c95d84c2d04a4bfb02104644bbf2811b99690 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:12:09 +0100 Subject: um: unify KSTK_... ... and switch get_thread_register() to HOST_... for register numbers Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/asm/processor.h | 4 ++++ arch/x86/um/asm/processor_32.h | 4 ---- arch/x86/um/asm/processor_64.h | 3 --- arch/x86/um/os-Linux/registers.c | 12 ++++++------ 4 files changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index f34c4b2..118c143 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -10,6 +10,10 @@ # include "processor_64.h" #endif +#define KSTK_EIP(tsk) KSTK_REG(tsk, HOST_IP) +#define KSTK_ESP(tsk) KSTK_REG(tsk, HOST_IP) +#define KSTK_EBP(tsk) KSTK_REG(tsk, HOST_BP) + #define ARCH_IS_STACKGROW(address) \ (address + 65536 + 32 * sizeof(unsigned long) >= UPT_SP(¤t->thread.regs.regs)) diff --git a/arch/x86/um/asm/processor_32.h b/arch/x86/um/asm/processor_32.h index c4078b5..018f732 100644 --- a/arch/x86/um/asm/processor_32.h +++ b/arch/x86/um/asm/processor_32.h @@ -63,8 +63,4 @@ static inline void rep_nop(void) #define current_text_addr() \ ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; }) -#define KSTK_EIP(tsk) KSTK_REG(tsk, EIP) -#define KSTK_ESP(tsk) KSTK_REG(tsk, UESP) -#define KSTK_EBP(tsk) KSTK_REG(tsk, EBP) - #endif diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h index 0186c61..61de92d 100644 --- a/arch/x86/um/asm/processor_64.h +++ b/arch/x86/um/asm/processor_64.h @@ -42,7 +42,4 @@ static inline void arch_copy_thread(struct arch_thread *from, #define current_text_addr() \ ({ void *pc; __asm__("movq $1f,%0\n1:":"=g" (pc)); pc; }) -#define KSTK_EIP(tsk) KSTK_REG(tsk, RIP) -#define KSTK_ESP(tsk) KSTK_REG(tsk, RSP) - #endif diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c index 3a9b624..fa2e5a6 100644 --- a/arch/x86/um/os-Linux/registers.c +++ b/arch/x86/um/os-Linux/registers.c @@ -89,18 +89,18 @@ unsigned long get_thread_reg(int reg, jmp_buf *buf) { switch (reg) { #ifdef __i386__ - case EIP: + case HOST_IP: return buf[0]->__eip; - case UESP: + case HOST_SP: return buf[0]->__esp; - case EBP: + case HOST_BP: return buf[0]->__ebp; #else - case RIP: + case HOST_IP: return buf[0]->__rip; - case RSP: + case HOST_SP: return buf[0]->__rsp; - case RBP: + case HOST_BP: return buf[0]->__rbp; #endif default: -- cgit v1.1 From 966e803ab12538faf2b236dbe83f7fb796a031d6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:12:19 +0100 Subject: um: unify ptrace_user.h Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/ptrace_64.c | 2 +- arch/x86/um/shared/sysdep/ptrace_64.h | 2 +- arch/x86/um/shared/sysdep/ptrace_user.h | 26 ++++++++++++++++++-- arch/x86/um/shared/sysdep/ptrace_user_32.h | 26 -------------------- arch/x86/um/shared/sysdep/ptrace_user_64.h | 38 ------------------------------ arch/x86/um/user-offsets.c | 3 ++- 6 files changed, 28 insertions(+), 69 deletions(-) delete mode 100644 arch/x86/um/shared/sysdep/ptrace_user_32.h delete mode 100644 arch/x86/um/shared/sysdep/ptrace_user_64.h (limited to 'arch/x86') diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 6e6343e..3b52bf0 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -46,7 +46,7 @@ static const int reg_offsets[] = [FS >> 3] = HOST_FS, [GS >> 3] = HOST_GS, [EFLAGS >> 3] = HOST_EFLAGS, - [ORIG_RAX >> 3] = HOST_ORIG_RAX, + [ORIG_RAX >> 3] = HOST_ORIG_AX, }; int putreg(struct task_struct *child, int regno, unsigned long value) diff --git a/arch/x86/um/shared/sysdep/ptrace_64.h b/arch/x86/um/shared/sysdep/ptrace_64.h index 430dedc..031edc5 100644 --- a/arch/x86/um/shared/sysdep/ptrace_64.h +++ b/arch/x86/um/shared/sysdep/ptrace_64.h @@ -65,7 +65,7 @@ #define REGS_FS(r) ((r)[HOST_FS]) #define REGS_GS(r) ((r)[HOST_GS]) -#define REGS_ORIG_RAX(r) ((r)[HOST_ORIG_RAX]) +#define REGS_ORIG_RAX(r) ((r)[HOST_ORIG_AX]) #define REGS_SET_SYSCALL_RETURN(r, res) REGS_RAX(r) = (res) diff --git a/arch/x86/um/shared/sysdep/ptrace_user.h b/arch/x86/um/shared/sysdep/ptrace_user.h index a92f883..16cd6b5 100644 --- a/arch/x86/um/shared/sysdep/ptrace_user.h +++ b/arch/x86/um/shared/sysdep/ptrace_user.h @@ -1,5 +1,27 @@ +#include + +#define PT_OFFSET(r) ((r) * sizeof(long)) + +#define PT_SYSCALL_NR(regs) ((regs)[HOST_ORIG_AX]) +#define PT_SYSCALL_NR_OFFSET PT_OFFSET(HOST_ORIG_AX) + +#define PT_SYSCALL_RET_OFFSET PT_OFFSET(HOST_AX) + +#define REGS_IP_INDEX HOST_IP +#define REGS_SP_INDEX HOST_SP + #ifdef __i386__ -#include "ptrace_user_32.h" +#define FP_SIZE ((HOST_FPX_SIZE > HOST_FP_SIZE) ? HOST_FPX_SIZE : HOST_FP_SIZE) #else -#include "ptrace_user_64.h" +#define FP_SIZE HOST_FP_SIZE + +/* + * x86_64 FC3 doesn't define this in /usr/include/linux/ptrace.h even though + * it's defined in the kernel's include/linux/ptrace.h. Additionally, use the + * 2.4 name and value for 2.4 host compatibility. + */ +#ifndef PTRACE_OLDSETOPTIONS +#define PTRACE_OLDSETOPTIONS 21 +#endif + #endif diff --git a/arch/x86/um/shared/sysdep/ptrace_user_32.h b/arch/x86/um/shared/sysdep/ptrace_user_32.h deleted file mode 100644 index 9d88a79..0000000 --- a/arch/x86/um/shared/sysdep/ptrace_user_32.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SYSDEP_I386_PTRACE_USER_H__ -#define __SYSDEP_I386_PTRACE_USER_H__ - -#include -#include -#include -#include - -#define PT_OFFSET(r) ((r) * sizeof(long)) - -#define PT_SYSCALL_NR(regs) ((regs)[ORIG_EAX]) -#define PT_SYSCALL_NR_OFFSET PT_OFFSET(ORIG_EAX) - -#define PT_SYSCALL_RET_OFFSET PT_OFFSET(EAX) - -#define REGS_IP_INDEX EIP -#define REGS_SP_INDEX UESP - -#define FP_SIZE ((HOST_FPX_SIZE > HOST_FP_SIZE) ? HOST_FPX_SIZE : HOST_FP_SIZE) - -#endif diff --git a/arch/x86/um/shared/sysdep/ptrace_user_64.h b/arch/x86/um/shared/sysdep/ptrace_user_64.h deleted file mode 100644 index 2f1b6e3..0000000 --- a/arch/x86/um/shared/sysdep/ptrace_user_64.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2003 PathScale, Inc. - * - * Licensed under the GPL - */ - -#ifndef __SYSDEP_X86_64_PTRACE_USER_H__ -#define __SYSDEP_X86_64_PTRACE_USER_H__ - -#define __FRAME_OFFSETS -#include -#include -#include -#undef __FRAME_OFFSETS -#include - -#define PT_INDEX(off) ((off) / sizeof(unsigned long)) - -#define PT_SYSCALL_NR(regs) ((regs)[PT_INDEX(ORIG_RAX)]) -#define PT_SYSCALL_NR_OFFSET (ORIG_RAX) - -#define PT_SYSCALL_RET_OFFSET (RAX) - -/* - * x86_64 FC3 doesn't define this in /usr/include/linux/ptrace.h even though - * it's defined in the kernel's include/linux/ptrace.h. Additionally, use the - * 2.4 name and value for 2.4 host compatibility. - */ -#ifndef PTRACE_OLDSETOPTIONS -#define PTRACE_OLDSETOPTIONS 21 -#endif - -#define REGS_IP_INDEX PT_INDEX(RIP) -#define REGS_SP_INDEX PT_INDEX(RSP) - -#define FP_SIZE (HOST_FP_SIZE) - -#endif diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 88d1255..ca49be8 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -36,6 +36,7 @@ void foo(void) DEFINE(HOST_FS, FS); DEFINE(HOST_ES, ES); DEFINE(HOST_GS, GS); + DEFINE(HOST_ORIG_AX, ORIG_EAX); #else DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long)); DEFINE_LONGS(HOST_BX, RBX); @@ -53,7 +54,7 @@ void foo(void) DEFINE_LONGS(HOST_R13, R13); DEFINE_LONGS(HOST_R14, R14); DEFINE_LONGS(HOST_R15, R15); - DEFINE_LONGS(HOST_ORIG_RAX, ORIG_RAX); + DEFINE_LONGS(HOST_ORIG_AX, ORIG_RAX); DEFINE_LONGS(HOST_CS, CS); DEFINE_LONGS(HOST_SS, SS); DEFINE_LONGS(HOST_EFLAGS, EFLAGS); -- cgit v1.1 From 46ecca8ae1540abe73000d0fb4878de6956a208f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:13:40 +0100 Subject: um: segment.h is x86-only and needed only there Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/asm/segment.h | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 arch/x86/um/asm/segment.h (limited to 'arch/x86') diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h new file mode 100644 index 0000000..45183fc --- /dev/null +++ b/arch/x86/um/asm/segment.h @@ -0,0 +1,10 @@ +#ifndef __UM_SEGMENT_H +#define __UM_SEGMENT_H + +extern int host_gdt_entry_tls_min; + +#define GDT_ENTRY_TLS_ENTRIES 3 +#define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) + +#endif -- cgit v1.1 From a34978cbd977ab62c744f63daacd9dc1474482be Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Aug 2011 20:14:00 +0100 Subject: um: kill system-um.h most of it belonged in irqflags.h, actually Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/asm/system.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/asm/system.h b/arch/x86/um/asm/system.h index a89113b..a459fd9 100644 --- a/arch/x86/um/asm/system.h +++ b/arch/x86/um/asm/system.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include @@ -130,4 +129,7 @@ static inline void rdtsc_barrier(void) alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); } +extern void *_switch_to(void *prev, void *next, void *last); +#define switch_to(prev, next, last) prev = _switch_to(prev, next, last) + #endif -- cgit v1.1 From d0af6cbfa2eb675643cc651114364752e972900b Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 18 Aug 2011 21:55:11 +0200 Subject: um: merge delay_{32,64}.c Signed-off-by: Richard Weinberger --- arch/x86/um/Makefile | 2 +- arch/x86/um/delay.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/um/delay_32.c | 60 -------------------------------------------------- arch/x86/um/delay_64.c | 60 -------------------------------------------------- 4 files changed, 61 insertions(+), 121 deletions(-) create mode 100644 arch/x86/um/delay.c delete mode 100644 arch/x86/um/delay_32.c delete mode 100644 arch/x86/um/delay_64.c (limited to 'arch/x86') diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index ffe1008..8fb5840 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -8,7 +8,7 @@ else BITS := 64 endif -obj-y = bug.o bugs_$(BITS).o delay_$(BITS).o fault.o ksyms.o ldt.o \ +obj-y = bug.o bugs_$(BITS).o delay.o fault.o ksyms.o ldt.o \ ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \ stub_$(BITS).o stub_segv.o syscalls_$(BITS).o \ sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ diff --git a/arch/x86/um/delay.c b/arch/x86/um/delay.c new file mode 100644 index 0000000..f3fe1a6 --- /dev/null +++ b/arch/x86/um/delay.c @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2011 Richard Weinberger + * Mostly copied from arch/x86/lib/delay.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +void __delay(unsigned long loops) +{ + asm volatile( + "test %0,%0\n" + "jz 3f\n" + "jmp 1f\n" + + ".align 16\n" + "1: jmp 2f\n" + + ".align 16\n" + "2: dec %0\n" + " jnz 2b\n" + "3: dec %0\n" + + : /* we don't need output */ + : "a" (loops) + ); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ + int d0; + + xloops *= 4; + asm("mull %%edx" + : "=d" (xloops), "=&a" (d0) + : "1" (xloops), "0" + (loops_per_jiffy * (HZ/4))); + + __delay(++xloops); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/um/delay_32.c b/arch/x86/um/delay_32.c deleted file mode 100644 index f3fe1a6..0000000 --- a/arch/x86/um/delay_32.c +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (C) 2011 Richard Weinberger - * Mostly copied from arch/x86/lib/delay.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include - -void __delay(unsigned long loops) -{ - asm volatile( - "test %0,%0\n" - "jz 3f\n" - "jmp 1f\n" - - ".align 16\n" - "1: jmp 2f\n" - - ".align 16\n" - "2: dec %0\n" - " jnz 2b\n" - "3: dec %0\n" - - : /* we don't need output */ - : "a" (loops) - ); -} -EXPORT_SYMBOL(__delay); - -inline void __const_udelay(unsigned long xloops) -{ - int d0; - - xloops *= 4; - asm("mull %%edx" - : "=d" (xloops), "=&a" (d0) - : "1" (xloops), "0" - (loops_per_jiffy * (HZ/4))); - - __delay(++xloops); -} -EXPORT_SYMBOL(__const_udelay); - -void __udelay(unsigned long usecs) -{ - __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ -} -EXPORT_SYMBOL(__udelay); - -void __ndelay(unsigned long nsecs) -{ - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ -} -EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/um/delay_64.c b/arch/x86/um/delay_64.c deleted file mode 100644 index f3fe1a6..0000000 --- a/arch/x86/um/delay_64.c +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (C) 2011 Richard Weinberger - * Mostly copied from arch/x86/lib/delay.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include - -void __delay(unsigned long loops) -{ - asm volatile( - "test %0,%0\n" - "jz 3f\n" - "jmp 1f\n" - - ".align 16\n" - "1: jmp 2f\n" - - ".align 16\n" - "2: dec %0\n" - " jnz 2b\n" - "3: dec %0\n" - - : /* we don't need output */ - : "a" (loops) - ); -} -EXPORT_SYMBOL(__delay); - -inline void __const_udelay(unsigned long xloops) -{ - int d0; - - xloops *= 4; - asm("mull %%edx" - : "=d" (xloops), "=&a" (d0) - : "1" (xloops), "0" - (loops_per_jiffy * (HZ/4))); - - __delay(++xloops); -} -EXPORT_SYMBOL(__const_udelay); - -void __udelay(unsigned long usecs) -{ - __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ -} -EXPORT_SYMBOL(__udelay); - -void __ndelay(unsigned long nsecs) -{ - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ -} -EXPORT_SYMBOL(__ndelay); -- cgit v1.1 From 38b64aed786d59854ecc850ee5ece85b61dd252b Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 18 Aug 2011 21:58:07 +0200 Subject: um: we need sys/user.h only on i386 Signed-off-by: Richard Weinberger --- arch/x86/um/os-Linux/registers.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c index fa2e5a6..0cdbb86 100644 --- a/arch/x86/um/os-Linux/registers.c +++ b/arch/x86/um/os-Linux/registers.c @@ -6,7 +6,9 @@ #include #include +#ifdef __i386__ #include +#endif #include "longjmp.h" #include "sysdep/ptrace_user.h" -- cgit v1.1 From 0d65ede0a605d6252acc5c8a9c536c4cd0211f3c Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 24 Oct 2011 18:15:32 -0400 Subject: um: Fix kmalloc argument order in um/vdso/vma.c kmalloc size is 1st arg, not second. Signed-off-by: Dave Jones Signed-off-by: Richard Weinberger Cc: # 3.0.x [richard@nod.at: on 3.0 the to be patched file is arch/um/sys-x86_64/vdso/vma.c] --- arch/x86/um/vdso/vma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 9495c8d..91f4ec9 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -28,7 +28,7 @@ static int __init init_vdso(void) um_vdso_addr = task_size - PAGE_SIZE; - vdsop = kmalloc(GFP_KERNEL, sizeof(struct page *)); + vdsop = kmalloc(sizeof(struct page *), GFP_KERNEL); if (!vdsop) goto oom; -- cgit v1.1 From 70b50f94f1644e2aa7cb374819cfd93f3c28d725 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:36:59 -0700 Subject: mm: thp: tail page refcounting fix Michel while working on the working set estimation code, noticed that calling get_page_unless_zero() on a random pfn_to_page(random_pfn) wasn't safe, if the pfn ended up being a tail page of a transparent hugepage under splitting by __split_huge_page_refcount(). He then found the problem could also theoretically materialize with page_cache_get_speculative() during the speculative radix tree lookups that uses get_page_unless_zero() in SMP if the radix tree page is freed and reallocated and get_user_pages is called on it before page_cache_get_speculative has a chance to call get_page_unless_zero(). So the best way to fix the problem is to keep page_tail->_count zero at all times. This will guarantee that get_page_unless_zero() can never succeed on any tail page. page_tail->_mapcount is guaranteed zero and is unused for all tail pages of a compound page, so we can simply account the tail page references there and transfer them to tail_page->_count in __split_huge_page_refcount() (in addition to the head_page->_mapcount). While debugging this s/_count/_mapcount/ change I also noticed get_page is called by direct-io.c on pages returned by get_user_pages. That wasn't entirely safe because the two atomic_inc in get_page weren't atomic. As opposed to other get_user_page users like secondary-MMU page fault to establish the shadow pagetables would never call any superflous get_page after get_user_page returns. It's safer to make get_page universally safe for tail pages and to use get_page_foll() within follow_page (inside get_user_pages()). get_page_foll() is safe to do the refcounting for tail pages without taking any locks because it is run within PT lock protected critical sections (PT lock for pte and page_table_lock for pmd_trans_huge). The standard get_page() as invoked by direct-io instead will now take the compound_lock but still only for tail pages. The direct-io paths are usually I/O bound and the compound_lock is per THP so very finegrined, so there's no risk of scalability issues with it. A simple direct-io benchmarks with all lockdep prove locking and spinlock debugging infrastructure enabled shows identical performance and no overhead. So it's worth it. Ideally direct-io should stop calling get_page() on pages returned by get_user_pages(). The spinlock in get_page() is already optimized away for no-THP builds but doing get_page() on tail pages returned by GUP is generally a rare operation and usually only run in I/O paths. This new refcounting on page_tail->_mapcount in addition to avoiding new RCU critical sections will also allow the working set estimation code to work without any further complexity associated to the tail page refcounting with THP. Signed-off-by: Andrea Arcangeli Reported-by: Michel Lespinasse Reviewed-by: Michel Lespinasse Reviewed-by: Minchan Kim Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index dbe34b9..3b5032a 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -114,8 +114,9 @@ static inline void get_huge_page_tail(struct page *page) * __split_huge_page_refcount() cannot run * from under us. */ - VM_BUG_ON(atomic_read(&page->_count) < 0); - atomic_inc(&page->_count); + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); } static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, -- cgit v1.1 From b35a35b556f5e6b7993ad0baf20173e75c09ce8c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:36 -0700 Subject: thp: share get_huge_page_tail() This avoids duplicating the function in every arch gup_fast. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 3b5032a..ea30585 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -108,17 +108,6 @@ static inline void get_head_page_multiple(struct page *page, int nr) SetPageReferenced(page); } -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { -- cgit v1.1 From 22f4521d664030e417f41953e922f61c65f2e189 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 12 Aug 2011 00:13:47 -0400 Subject: mrst pmu: update comment referenced MeeGo, in particular, but really means Linux, in general. Signed-off-by: Len Brown --- arch/x86/platform/mrst/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c index 9281da7..c0ac06d 100644 --- a/arch/x86/platform/mrst/pmu.c +++ b/arch/x86/platform/mrst/pmu.c @@ -70,7 +70,7 @@ static struct mrst_device mrst_devs[] = { /* 24 */ { 0x4110, 0 }, /* Lincroft */ }; -/* n.b. We ignore PCI-id 0x815 in LSS9 b/c MeeGo has no driver for it */ +/* n.b. We ignore PCI-id 0x815 in LSS9 b/c Linux has no driver for it */ static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0}; static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803, 0x0804, 0x0805, 0x080f, 0}; -- cgit v1.1 From 66f5ddf30a59f811818656cb2833c80da0340cfa Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Thu, 3 Nov 2011 11:46:47 -0700 Subject: x86/mce: Make mce_chrdev_ops 'static const' Arjan would like to make struct file_operations const, but mce-inject directly writes to the mce_chrdev_ops to install its write handler. In an ideal world mce-inject would have its own character device, but we have a sizable legacy of test scripts that hardwire "/dev/mcelog", so it would be painful to switch to a separate device now. Instead, this patch switches to a stub function in the mce code, with a registration helper that mce-inject can call when it is loaded. Note that this would also allow for a sane process to allow mce-inject to be unloaded again (with an unregister function, and appropriate module_{get,put}() calls), but that is left for potential future patches. Reported-by: Arjan van de Ven Signed-off-by: Tony Luck Link: http://lkml.kernel.org/r/4eb2e1971326651a3b@agluck-desktop.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 5 ++++- arch/x86/kernel/cpu/mcheck/mce-inject.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 25 ++++++++++++++++++++++--- 3 files changed, 27 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c9321f3..0e8ae57 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -201,7 +201,10 @@ int mce_notify_irq(void); void mce_notify_process(void); DECLARE_PER_CPU(struct mce, injectm); -extern struct file_operations mce_chrdev_ops; + +extern void register_mce_write_callback(ssize_t (*)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)); /* * Exception handler diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 6199232..319882e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -208,7 +208,7 @@ static int inject_init(void) if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); - mce_chrdev_ops.write = mce_write; + register_mce_write_callback(mce_write); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); return 0; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 362056a..2af127d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1634,16 +1634,35 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, } } -/* Modified in mce-inject.c, so not static or const */ -struct file_operations mce_chrdev_ops = { +static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off); + +void register_mce_write_callback(ssize_t (*fn)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)) +{ + mce_write = fn; +} +EXPORT_SYMBOL_GPL(register_mce_write_callback); + +ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + if (mce_write) + return mce_write(filp, ubuf, usize, off); + else + return -EINVAL; +} + +static const struct file_operations mce_chrdev_ops = { .open = mce_chrdev_open, .release = mce_chrdev_release, .read = mce_chrdev_read, + .write = mce_chrdev_write, .poll = mce_chrdev_poll, .unlocked_ioctl = mce_chrdev_ioctl, .llseek = no_llseek, }; -EXPORT_SYMBOL_GPL(mce_chrdev_ops); static struct miscdevice mce_chrdev_device = { MISC_MCELOG_MINOR, -- cgit v1.1 From cf8ff6b6ab0e99dd3058852f4ec76a6140abadec Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 10 Nov 2011 13:42:02 +0000 Subject: x86/platform: Add a wallclock_init func to x86_platforms ops Some wall clock devices use MMIO based HW register, this new function will give them a chance to do some initialization work before their get/set_time service get called. Signed-off-by: Feng Tang Signed-off-by: Jacob Pan Signed-off-by: Alan Cox Signed-off-by: Dirk Brandewie Signed-off-by: Ingo Molnar --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/setup.c | 2 ++ arch/x86/kernel/x86_init.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index d3d8590..f864fbe 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -152,6 +152,7 @@ struct x86_cpuinit_ops { /** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC + * @wallclock_init: init the wallclock device * @get_wallclock: get time from HW clock like RTC etc. * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic @@ -160,6 +161,7 @@ struct x86_cpuinit_ops { */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); + void (*wallclock_init)(void); unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long nowtime); void (*iommu_shutdown)(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index afaf384..cf0ef98 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1045,6 +1045,8 @@ void __init setup_arch(char **cmdline_p) x86_init.timers.wallclock_init(); + x86_platform.wallclock_init(); + mcheck_init(); arch_init_ideal_nops(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 6f164bd..701c7be 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -27,6 +27,7 @@ void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } +void wallclock_init_noop(void) { } /* * The platform setup functions are preset with the default functions @@ -97,6 +98,7 @@ static int default_i8042_detect(void) { return 1; }; struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, + .wallclock_init = wallclock_init_noop, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, -- cgit v1.1 From bb84ac2d3a603f8f6c7cc553a260e8ceaf871df2 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 10 Nov 2011 13:42:21 +0000 Subject: x86/apic: Do not clear nr_irqs_gsi if no legacy irqs nr_legacy_irqs is set in probe_nr_irqs_gsi, we should not clear it after that. Otherwise, the result is that MSI irqs will be allocated from the wrong range for the systems without legacy PIC. Signed-off-by: Jacob Pan Signed-off-by: Dirk Brandewie Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c31fa9..841b8da 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -193,10 +193,8 @@ int __init arch_early_irq_init(void) struct irq_cfg *cfg; int count, node, i; - if (!legacy_pic->nr_legacy_irqs) { - nr_irqs_gsi = 0; + if (!legacy_pic->nr_legacy_irqs) io_apic_irqs = ~0UL; - } for (i = 0; i < nr_ioapics; i++) { ioapics[i].saved_registers = -- cgit v1.1 From 1ade93efd0a3dda5b0c0afda8ab8f4bd12938c1b Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 10 Nov 2011 13:42:40 +0000 Subject: x86/apic: Allow use of lapic timer early calibration result lapic timer calibration can be combined with tsc in platform specific calibration functions. if such calibration result is obtained early, we can skip the redundant calibration loops. Signed-off-by: Jacob Pan Signed-off-by: Jacob Pan Signed-off-by: Alan Cox Signed-off-by: Dirk Brandewie Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 1 + arch/x86/kernel/apic/apic.c | 33 ++++++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 9b7273c..1a6c09a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -49,6 +49,7 @@ extern unsigned int apic_verbosity; extern int local_apic_timer_c2_ok; extern int disable_apic; +extern unsigned int lapic_timer_frequency; #ifdef CONFIG_SMP extern void __inquire_remote_apic(int apicid); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a2fd72e..f98d84c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -186,7 +186,7 @@ static struct resource lapic_resource = { .flags = IORESOURCE_MEM | IORESOURCE_BUSY, }; -static unsigned int calibration_result; +unsigned int lapic_timer_frequency = 0; static void apic_pm_activate(void); @@ -454,7 +454,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, switch (mode) { case CLOCK_EVT_MODE_PERIODIC: case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(calibration_result, + __setup_APIC_LVTT(lapic_timer_frequency, mode != CLOCK_EVT_MODE_PERIODIC, 1); break; case CLOCK_EVT_MODE_UNUSED: @@ -638,6 +638,25 @@ static int __init calibrate_APIC_clock(void) long delta, deltatsc; int pm_referenced = 0; + /** + * check if lapic timer has already been calibrated by platform + * specific routine, such as tsc calibration code. if so, we just fill + * in the clockevent structure and return. + */ + + if (lapic_timer_frequency) { + apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", + lapic_timer_frequency); + lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, + TICK_NSEC, lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + return 0; + } + local_irq_disable(); /* Replace the global interrupt handler */ @@ -679,12 +698,12 @@ static int __init calibrate_APIC_clock(void) lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); - calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - calibration_result); + lapic_timer_frequency); if (cpu_has_tsc) { apic_printk(APIC_VERBOSE, "..... CPU clock speed is " @@ -695,13 +714,13 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "..... host bus clock speed is " "%u.%04u MHz.\n", - calibration_result / (1000000 / HZ), - calibration_result % (1000000 / HZ)); + lapic_timer_frequency / (1000000 / HZ), + lapic_timer_frequency % (1000000 / HZ)); /* * Do a sanity check on the APIC calibration result */ - if (calibration_result < (1000000 / HZ)) { + if (lapic_timer_frequency < (1000000 / HZ)) { local_irq_enable(); pr_warning("APIC frequency too slow, disabling apic timer\n"); return -1; -- cgit v1.1 From 0a9153261d54c432bc0bdc88607f24c835ac729c Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Thu, 10 Nov 2011 13:42:53 +0000 Subject: x86/mrst: Add support for Penwell clock calibration Signed-off-by: Dirk Brandewie Signed-off-by: Alan Cox Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mrst.h | 7 +++++++ arch/x86/platform/mrst/mrst.c | 33 ++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 719f00b..e628312 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -44,6 +44,13 @@ enum mrst_timer_options { extern enum mrst_timer_options mrst_timer_options; +/* + * Penwell uses spread spectrum clock, so the freq number is not exactly + * the same as reported by MSR based on SDM. + */ +#define PENWELL_FSB_FREQ_83SKU 83200 +#define PENWELL_FSB_FREQ_100SKU 99840 + #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 6ed7afd..b7f14e5 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -187,11 +187,34 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) static unsigned long __init mrst_calibrate_tsc(void) { unsigned long flags, fast_calibrate; - - local_irq_save(flags); - fast_calibrate = apbt_quick_calibrate(); - local_irq_restore(flags); - + if (__mrst_cpu_chip == MRST_CPU_CHIP_PENWELL) { + u32 lo, hi, ratio, fsb; + + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); + ratio = (hi >> 8) & 0x1f; + pr_debug("ratio is %d\n", ratio); + if (!ratio) { + pr_err("read a zero ratio, should be incorrect!\n"); + pr_err("force tsc ratio to 16 ...\n"); + ratio = 16; + } + rdmsr(MSR_FSB_FREQ, lo, hi); + if ((lo & 0x7) == 0x7) + fsb = PENWELL_FSB_FREQ_83SKU; + else + fsb = PENWELL_FSB_FREQ_100SKU; + fast_calibrate = ratio * fsb; + pr_debug("read penwell tsc %lu khz\n", fast_calibrate); + lapic_timer_frequency = fsb * 1000 / HZ; + /* mark tsc clocksource as reliable */ + set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + } else { + local_irq_save(flags); + fast_calibrate = apbt_quick_calibrate(); + local_irq_restore(flags); + } + if (fast_calibrate) return fast_calibrate; -- cgit v1.1 From 064a59b6dd1f341cc478c212bb436e3da9cb8d04 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 10 Nov 2011 13:43:05 +0000 Subject: x86/mrst: Avoid reporting wrong nmi status Moorestown/Medfield platform does not have port 0x61 to report NMI status, nor does it have external NMI sources. The only NMI sources are from lapic, as results of perf counter overflow or IPI, e.g. NMI watchdog or spin lock debug. Reading port 0x61 on Moorestown will return 0xff which misled NMI handlers to false critical errors such memory parity error. The subsequent ioport access for NMI handling can also cause undefined behavior on Moorestown. This patch allows kernel process NMI due to watchdog or backrace dump without unnecessary hangs. Signed-off-by: Jacob Pan Signed-off-by: Ingo Molnar [hand applied] Signed-off-by: Alan Cox --- arch/x86/include/asm/mach_traps.h | 2 +- arch/x86/include/asm/x86_init.h | 1 + arch/x86/kernel/nmi.c | 2 +- arch/x86/kernel/x86_init.c | 2 ++ arch/x86/platform/mrst/mrst.c | 13 +++++++++++++ 5 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h index 72a8b52..a01e7ec7 100644 --- a/arch/x86/include/asm/mach_traps.h +++ b/arch/x86/include/asm/mach_traps.h @@ -17,7 +17,7 @@ #define NMI_REASON_CLEAR_IOCHK 0x08 #define NMI_REASON_CLEAR_MASK 0x0f -static inline unsigned char get_nmi_reason(void) +static inline unsigned char default_get_nmi_reason(void) { return inb(NMI_REASON_PORT); } diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f864fbe..1971e65 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -167,6 +167,7 @@ struct x86_platform_ops { void (*iommu_shutdown)(void); bool (*is_untracked_pat_range)(u64 start, u64 end); void (*nmi_init)(void); + unsigned char (*get_nmi_reason)(void); int (*i8042_detect)(void); }; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index b9c8628..27d1e7c 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -348,7 +348,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ raw_spin_lock(&nmi_reason_lock); - reason = get_nmi_reason(); + reason = x86_platform.get_nmi_reason(); if (reason & NMI_REASON_MASK) { if (reason & NMI_REASON_SERR) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 701c7be..c1d6cd5 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -21,6 +21,7 @@ #include #include #include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -104,6 +105,7 @@ struct x86_platform_ops x86_platform = { .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, + .get_nmi_reason = default_get_nmi_reason, .i8042_detect = default_i8042_detect }; diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index b7f14e5..9b9ee29 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -277,6 +277,17 @@ static void mrst_reboot(void) } /* + * Moorestown does not have external NMI source nor port 0x61 to report + * NMI status. The possible NMI sources are from pmu as a result of NMI + * watchdog or lock debug. Reading io port 0x61 results in 0xff which + * misled NMI handler. + */ +static unsigned char mrst_get_nmi_reason(void) +{ + return 0; +} + +/* * Moorestown specific x86_init function overrides and early setup * calls. */ @@ -297,6 +308,8 @@ void __init x86_mrst_early_setup(void) x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_platform.i8042_detect = mrst_i8042_detect; x86_init.timers.wallclock_init = mrst_rtc_init; + x86_platform.get_nmi_reason = mrst_get_nmi_reason; + x86_init.pci.init = pci_mrst_init; x86_init.pci.fixup_irqs = x86_init_noop; -- cgit v1.1 From 6fd36ba02132c61f67ebefff77fe710bd38ba95a Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 10 Nov 2011 13:45:24 +0000 Subject: x86, ioapic: Only print ioapic debug information for IRQs belonging to an ioapic chip with "apic=verbose" the print_IO_APIC() function tries to print IRQ to pin mappings for every active irq. It assumes chip_data is of type irq_cfg and may cause an oops if not. As the print_IO_APIC() is called from a late_initcall other chained irq chips may already be registered with custom chip_data information, causing an oops. This is the case with intel MID SoC devices with gpio demuxers registered as irq_chips. Signed-off-by: Mathias Nyman Signed-off-by: Alan Cox [ -v2: fixed build failure ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 5 +++++ arch/x86/kernel/nmi.c | 1 + 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 841b8da..6d939d7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1694,6 +1694,7 @@ __apicdebuginit(void) print_IO_APICs(void) int ioapic_idx; struct irq_cfg *cfg; unsigned int irq; + struct irq_chip *chip; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) @@ -1714,6 +1715,10 @@ __apicdebuginit(void) print_IO_APICs(void) for_each_active_irq(irq) { struct irq_pin_list *entry; + chip = irq_get_chip(irq); + if (chip != &ioapic_chip) + continue; + cfg = irq_get_chip_data(irq); if (!cfg) continue; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 27d1e7c..e88f37b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -29,6 +29,7 @@ #include #include #include +#include #define NMI_MAX_NAMELEN 16 struct nmiaction { -- cgit v1.1 From f2ee442115c9b6219083c019939a9cc0c9abb2f8 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 10 Nov 2011 13:21:10 +0000 Subject: ce4100: fix a build error Fix a build error. CE4100 with no serial errors because the alternate function is only a prototype not a null function as intended. Signed-off-by: Zhang Rui Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- arch/x86/platform/ce4100/ce4100.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 28071bb..4c61b52 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -109,7 +109,7 @@ static __init void sdv_serial_fixup(void) } #else -static inline void sdv_serial_fixup(void); +static inline void sdv_serial_fixup(void) {}; #endif static void __init sdv_arch_setup(void) -- cgit v1.1 From 57e6319dd61d5ca10fe8dd57bcce8c0e2c480799 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 10 Nov 2011 13:23:39 +0000 Subject: vrtc: change its year offset from 1960 to 1972 Real world year equals the value in vrtc YEAR register plus an offset. We used 1960 as the offset to make leap year consistent, but for a device's first use, its YEAR register is 0 and the system year will be parsed as 1960 which is not a valid UNIX time and will cause many applications to fail mysteriously. So we use 1972 instead to fix this issue. Updated patch which adds a sanity check suggested by Mathias This isn't a change in behaviour for systems, because 1972 is the one we actually use. It's the old version in upstream which is out of sync with all devices. Signed-off-by: Feng Tang Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- arch/x86/platform/mrst/vrtc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index a8ac6f1..225bd0f 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -76,8 +76,8 @@ unsigned long vrtc_get_time(void) spin_unlock_irqrestore(&rtc_lock, flags); - /* vRTC YEAR reg contains the offset to 1960 */ - year += 1960; + /* vRTC YEAR reg contains the offset to 1972 */ + year += 1972; printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d " "mon: %d year: %d\n", sec, min, hour, mday, mon, year); -- cgit v1.1 From 9f80d8b68fb244bbd69c55aced663b91098544fc Mon Sep 17 00:00:00 2001 From: William Douglas Date: Thu, 10 Nov 2011 13:50:38 +0000 Subject: bma023: Add SFI translation for this device This needed the sfi IRQ 0xFF fix to go in first. It simply plumbs in the bma023 driver with the firmware naming of it. Signed-off-by: William Douglas Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- arch/x86/platform/mrst/mrst.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 6ed7afd..541020d 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -608,6 +608,7 @@ static void *msic_ocd_platform_data(void *info) } static const struct devs_id __initconst device_ids[] = { + {"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data}, {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data}, {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, -- cgit v1.1 From 78345d2edc25e001558f3b7c85906f645d38d23c Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Thu, 27 Oct 2011 13:24:32 +0530 Subject: x86: Call stop_machine_text_poke() on all CPUs It appears that stop_machine_text_poke() wants to be called on all CPUs, like it's done from text_poke_smp(). Fix text_poke_smp_batch() to do this. Signed-off-by: Rabin Vincent Acked-by: Masami Hiramatsu Signed-off-by: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jason Baron Link: http://lkml.kernel.org/r/1319702072-32676-1-git-send-email-rabin@rab.in Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c638228..1f84794 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -738,5 +738,5 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) atomic_set(&stop_machine_first, 1); wrote_text = 0; - __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); } -- cgit v1.1 From cd12909cb576d37311fe35868780e82d5007d0c8 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 29 Sep 2011 16:53:32 +0100 Subject: xen: map foreign pages for shared rings by updating the PTEs directly When mapping a foreign page with xenbus_map_ring_valloc() with the GNTTABOP_map_grant_ref hypercall, set the GNTMAP_contains_pte flag and pass a pointer to the PTE (in init_mm). After the page is mapped, the usual fault mechanism can be used to update additional MMs. This allows the vmalloc_sync_all() to be removed from alloc_vm_area(). Signed-off-by: David Vrabel Acked-by: Andrew Morton [v1: Squashed fix by Michal for no-mmu case] Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Michal Simek --- arch/x86/xen/grant-table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 6bbfd7a..5a40d24 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -71,7 +71,7 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, if (shared == NULL) { struct vm_struct *area = - alloc_vm_area(PAGE_SIZE * max_nr_gframes); + alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL); BUG_ON(area == NULL); shared = area->addr; *__shared = shared; -- cgit v1.1 From 90d4f5534d14815bd94c10e8ceccc57287657ecc Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 27 Oct 2011 22:28:59 -0700 Subject: xen:pvhvm: enable PVHVM VCPU placement when using more than 32 CPUs. PVHVM running with more than 32 vcpus and pv_irq/pv_time enabled need VCPU placement to work, or else it will softlockup. CC: stable@kernel.org Acked-by: Stefano Stabellini Signed-off-by: Zhenzhong Duan Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index da8afd5..1f92865 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1356,7 +1356,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, int cpu = (long)hcpu; switch (action) { case CPU_UP_PREPARE: - per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + xen_vcpu_setup(cpu); if (xen_have_vector_callback) xen_init_lock_cpu(cpu); break; @@ -1386,7 +1386,6 @@ static void __init xen_hvm_guest_init(void) xen_hvm_smp_init(); register_cpu_notifier(&xen_hvm_cpu_notifier); xen_unplug_emulated_devices(); - have_vcpu_info_placement = 0; x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); -- cgit v1.1 From 8bf00a529967dafbbb210b377c38a15834d1e979 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 5 Oct 2011 14:01:22 +0200 Subject: KVM: VMX: add support for switching of PERF_GLOBAL_CTRL Some cpus have special support for switching PERF_GLOBAL_CTRL msr. Add logic to detect if such support exists and works properly and extend msr switching code to use it if available. Also extend number of generic msr switching entries to 8. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 93 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a0d6bd9..55e849b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -118,7 +118,7 @@ module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); -#define NR_AUTOLOAD_MSRS 1 +#define NR_AUTOLOAD_MSRS 8 #define VMCS02_POOL_SIZE 1 struct vmcs { @@ -622,6 +622,7 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static bool cpu_has_load_ia32_efer; +static bool cpu_has_load_perf_global_ctrl; static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -1191,15 +1192,34 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } +static void clear_atomic_switch_msr_special(unsigned long entry, + unsigned long exit) +{ + vmcs_clear_bits(VM_ENTRY_CONTROLS, entry); + vmcs_clear_bits(VM_EXIT_CONTROLS, exit); +} + static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) { unsigned i; struct msr_autoload *m = &vmx->msr_autoload; - if (msr == MSR_EFER && cpu_has_load_ia32_efer) { - vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); - vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); - return; + switch (msr) { + case MSR_EFER: + if (cpu_has_load_ia32_efer) { + clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + VM_EXIT_LOAD_IA32_EFER); + return; + } + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (cpu_has_load_perf_global_ctrl) { + clear_atomic_switch_msr_special( + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + return; + } + break; } for (i = 0; i < m->nr; ++i) @@ -1215,18 +1235,44 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); } +static void add_atomic_switch_msr_special(unsigned long entry, + unsigned long exit, unsigned long guest_val_vmcs, + unsigned long host_val_vmcs, u64 guest_val, u64 host_val) +{ + vmcs_write64(guest_val_vmcs, guest_val); + vmcs_write64(host_val_vmcs, host_val); + vmcs_set_bits(VM_ENTRY_CONTROLS, entry); + vmcs_set_bits(VM_EXIT_CONTROLS, exit); +} + static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, u64 guest_val, u64 host_val) { unsigned i; struct msr_autoload *m = &vmx->msr_autoload; - if (msr == MSR_EFER && cpu_has_load_ia32_efer) { - vmcs_write64(GUEST_IA32_EFER, guest_val); - vmcs_write64(HOST_IA32_EFER, host_val); - vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); - vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); - return; + switch (msr) { + case MSR_EFER: + if (cpu_has_load_ia32_efer) { + add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + VM_EXIT_LOAD_IA32_EFER, + GUEST_IA32_EFER, + HOST_IA32_EFER, + guest_val, host_val); + return; + } + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (cpu_has_load_perf_global_ctrl) { + add_atomic_switch_msr_special( + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, + GUEST_IA32_PERF_GLOBAL_CTRL, + HOST_IA32_PERF_GLOBAL_CTRL, + guest_val, host_val); + return; + } + break; } for (i = 0; i < m->nr; ++i) @@ -2455,6 +2501,42 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_LOAD_IA32_EFER); + cpu_has_load_perf_global_ctrl = + allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + + /* + * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL + * but due to arrata below it can't be used. Workaround is to use + * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. + * + * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] + * + * AAK155 (model 26) + * AAP115 (model 30) + * AAT100 (model 37) + * BC86,AAY89,BD102 (model 44) + * BA97 (model 46) + * + */ + if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { + switch (boot_cpu_data.x86_model) { + case 26: + case 30: + case 37: + case 44: + case 46: + cpu_has_load_perf_global_ctrl = false; + printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + "does not work properly. Using workaround\n"); + break; + default: + break; + } + } + return 0; } -- cgit v1.1 From d7cd97964ba6d70c558348bd2c87290dce885583 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 5 Oct 2011 14:01:23 +0200 Subject: KVM: VMX: Add support for guest/host-only profiling Support guest/host-only profiling by switch perf msrs on a guest entry if needed. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 55e849b..98f4b0b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "trace.h" @@ -6050,6 +6051,24 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } +static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) +{ + int i, nr_msrs; + struct perf_guest_switch_msr *msrs; + + msrs = perf_guest_get_msrs(&nr_msrs); + + if (!msrs) + return; + + for (i = 0; i < nr_msrs; i++) + if (msrs[i].host == msrs[i].guest) + clear_atomic_switch_msr(vmx, msrs[i].msr); + else + add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, + msrs[i].host); +} + #ifdef CONFIG_X86_64 #define R "r" #define Q "q" @@ -6099,6 +6118,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + atomic_switch_perf_msrs(vmx); + vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ -- cgit v1.1 From e7fc6f93b4242b2b566f0070709e27257d6da8a2 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 5 Oct 2011 14:01:24 +0200 Subject: KVM: VMX: Check for automatic switch msr table overflow Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 98f4b0b..579a0b5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1280,7 +1280,11 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, if (m->guest[i].index == msr) break; - if (i == m->nr) { + if (i == NR_AUTOLOAD_MSRS) { + printk_once(KERN_WARNING"Not enough mst switch entries. " + "Can't add msr %x\n", msr); + return; + } else if (i == m->nr) { ++m->nr; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); -- cgit v1.1 From 95ef1e52922cf75b1ea2eae54ef886f2cc47eecb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 15 Nov 2011 14:59:07 +0200 Subject: KVM guest: prevent tracing recursion with kvmclock Prevent tracing of preempt_disable() in get_cpu_var() in kvm_clock_read(). When CONFIG_DEBUG_PREEMPT is enabled, preempt_disable/enable() are traced and this causes the function_graph tracer to go into an infinite recursion. By open coding the preempt_disable() around the get_cpu_var(), we can use the notrace version which prevents preempt_disable/enable() from being traced and prevents the recursion. Based on a similar patch for Xen from Jeremy Fitzhardinge. Tested-by: Gleb Natapov Acked-by: Steven Rostedt Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index c1a0188..44842d7 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -74,9 +74,10 @@ static cycle_t kvm_clock_read(void) struct pvclock_vcpu_time_info *src; cycle_t ret; - src = &get_cpu_var(hv_clock); + preempt_disable_notrace(); + src = &__get_cpu_var(hv_clock); ret = pvclock_clocksource_read(src); - put_cpu_var(hv_clock); + preempt_enable_notrace(); return ret; } -- cgit v1.1 From cc11f9edd919002d8b3a03601a181a449150defd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Nov 2011 03:52:18 +0000 Subject: fix braino in um patchset (mea culpa) wrong register returned... Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/x86/um/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 118c143..2c32df6 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -11,7 +11,7 @@ #endif #define KSTK_EIP(tsk) KSTK_REG(tsk, HOST_IP) -#define KSTK_ESP(tsk) KSTK_REG(tsk, HOST_IP) +#define KSTK_ESP(tsk) KSTK_REG(tsk, HOST_SP) #define KSTK_EBP(tsk) KSTK_REG(tsk, HOST_BP) #define ARCH_IS_STACKGROW(address) \ -- cgit v1.1