From 9594a4986192f99c01a7c0a1779b5ac0eff8e208 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:53:25 +0900 Subject: KVM: MMU: Use __gfn_to_rmap() to clean up kvm_handle_hva() We can treat every level uniformly. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 28c8fbc..2beb95b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1281,14 +1281,14 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; gfn_t gfn = memslot->base_gfn + gfn_offset; - ret = handler(kvm, &memslot->rmap[gfn_offset], data); + ret = 0; - for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - struct kvm_lpage_info *linfo; + for (j = PT_PAGE_TABLE_LEVEL; + j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { + unsigned long *rmapp; - linfo = lpage_info_slot(gfn, memslot, - PT_DIRECTORY_LEVEL + j); - ret |= handler(kvm, &linfo->rmap_pde, data); + rmapp = __gfn_to_rmap(gfn, j, memslot); + ret |= handler(kvm, rmapp, data); } trace_kvm_age_page(hva, memslot, ret); retval |= ret; -- cgit v1.1 From d19a748b1c42b133e9263e9023c1d162efa6f4ad Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:54:30 +0900 Subject: KVM: Introduce hva_to_gfn_memslot() for kvm_handle_hva() This restricts hva handling in mmu code and makes it easier to extend kvm_handle_hva() so that it can treat a range of addresses later in this patch series. Signed-off-by: Takuya Yoshikawa Cc: Alexander Graf Cc: Paul Mackerras Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2beb95b..170a632 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1278,8 +1278,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { - gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - gfn_t gfn = memslot->base_gfn + gfn_offset; + gfn_t gfn = hva_to_gfn_memslot(hva, memslot); ret = 0; -- cgit v1.1 From 84504ef38673fa021b3d8f3da2b79cf878b33315 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:55:48 +0900 Subject: KVM: MMU: Make kvm_handle_hva() handle range of addresses When guest's memory is backed by THP pages, MMU notifier needs to call kvm_unmap_hva(), which in turn leads to kvm_handle_hva(), in a loop to invalidate a range of pages which constitute one huge page: for each page for each memslot if page is in memslot unmap using rmap This means although every page in that range is expected to be found in the same memslot, we are forced to check unrelated memslots many times. If the guest has more memslots, the situation will become worse. Furthermore, if the range does not include any pages in the guest's memory, the loop over the pages will just consume extra time. This patch, together with the following patches, solves this problem by introducing kvm_handle_hva_range() which makes the loop look like this: for each memslot for each page in memslot unmap using rmap In this new processing, the actual work is converted to a loop over rmap which is much more cache friendly than before. 
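For illustration only, not part of the patch itself: the new iteration order can be sketched in simplified C using the names from the diff below, with locking, tracing and error handling omitted:

	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		/* Clamp the requested [start, end) range to this memslot. */
		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
				   (memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;	/* slot does not intersect the range */

		/* Visit every intersecting gfn of this slot exactly once. */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
		for (; gfn < gfn_end; ++gfn)
			for (j = PT_PAGE_TABLE_LEVEL;
			     j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j)
				ret |= handler(kvm, __gfn_to_rmap(gfn, j, memslot), data);
	}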
Signed-off-by: Takuya Yoshikawa Cc: Alexander Graf Cc: Paul Mackerras Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 170a632..7235b0c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1259,10 +1259,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, return 0; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - unsigned long data, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, - unsigned long data)) +static int kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + unsigned long data, + int (*handler)(struct kvm *kvm, + unsigned long *rmapp, + unsigned long data)) { int j; int ret; @@ -1273,13 +1276,22 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { - unsigned long start = memslot->userspace_addr; - unsigned long end; + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; - end = start + (memslot->npages << PAGE_SHIFT); - if (hva >= start && hva < end) { - gfn_t gfn = hva_to_gfn_memslot(hva, memslot); + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn, gfn+1, ..., gfn_end-1}. + */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + for (; gfn < gfn_end; ++gfn) { ret = 0; for (j = PT_PAGE_TABLE_LEVEL; @@ -1289,7 +1301,9 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, rmapp = __gfn_to_rmap(gfn, j, memslot); ret |= handler(kvm, rmapp, data); } - trace_kvm_age_page(hva, memslot, ret); + trace_kvm_age_page(memslot->userspace_addr + + (gfn - memslot->base_gfn) * PAGE_SIZE, + memslot, ret); retval |= ret; } } @@ -1297,6 +1311,14 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return retval; } +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + unsigned long data, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long data)) +{ + return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); +} + int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) { return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); -- cgit v1.1 From b3ae2096974b12c3af2ad1a4e7716b084949867f Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:56:33 +0900 Subject: KVM: Introduce kvm_unmap_hva_range() for kvm_mmu_notifier_invalidate_range_start() When we tested KVM under memory pressure, with THP enabled on the host, we noticed that MMU notifier took a long time to invalidate huge pages. Since the invalidation was done with mmu_lock held, it not only wasted the CPU but also made the host harder to respond. This patch mitigates this by using kvm_handle_hva_range(). 
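The MMU-notifier hook that consumes the new helper lives in generic code (virt/kvm/kvm_main.c) and is therefore outside this arch/x86-limited log; a minimal sketch of the expected caller, under that assumption, looks like:

	static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
							    struct mm_struct *mm,
							    unsigned long start,
							    unsigned long end)
	{
		struct kvm *kvm = mmu_notifier_to_kvm(mn);
		int need_tlb_flush, idx;

		idx = srcu_read_lock(&kvm->srcu);
		spin_lock(&kvm->mmu_lock);
		kvm->mmu_notifier_count++;
		/* One call now covers the whole range instead of a per-page loop. */
		need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
		if (need_tlb_flush)
			kvm_flush_remote_tlbs(kvm);
		spin_unlock(&kvm->mmu_lock);
		srcu_read_unlock(&kvm->srcu, idx);
	}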
Signed-off-by: Takuya Yoshikawa Cc: Alexander Graf Cc: Paul Mackerras Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a3e9409..d4aab86 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -944,6 +944,7 @@ extern bool kvm_rebooting; #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_age_hva(struct kvm *kvm, unsigned long hva); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7235b0c..d2855f8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1324,6 +1324,11 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); } +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); +} + void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); -- cgit v1.1 From 77d11309b3a10e1ce112058ec2c9b7b979bcf311 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:57:17 +0900 Subject: KVM: Separate rmap_pde from kvm_lpage_info->write_count This makes it possible to loop over rmap_pde arrays in the same way as we do over rmap so that we can optimize kvm_handle_hva_range() easily in the following patch. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 6 +++--- arch/x86/kvm/x86.c | 11 +++++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d4aab86..4f98da9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -500,11 +500,11 @@ struct kvm_vcpu_arch { }; struct kvm_lpage_info { - unsigned long rmap_pde; int write_count; }; struct kvm_arch_memory_slot { + unsigned long *rmap_pde[KVM_NR_PAGE_SIZES - 1]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d2855f8..3b3f5ae 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -960,13 +960,13 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, struct kvm_memory_slot *slot) { - struct kvm_lpage_info *linfo; + unsigned long idx; if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - linfo = lpage_info_slot(gfn, slot, level); - return &linfo->rmap_pde; + idx = gfn_to_index(gfn, slot->base_gfn, level); + return &slot->arch.rmap_pde[level - PT_DIRECTORY_LEVEL][idx]; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59b5950..829b4e9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6314,6 +6314,10 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, int i; for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->arch.rmap_pde[i] != dont->arch.rmap_pde[i]) { + kvm_kvfree(free->arch.rmap_pde[i]); + free->arch.rmap_pde[i] = NULL; + } if (!dont || free->arch.lpage_info[i] != 
dont->arch.lpage_info[i]) { kvm_kvfree(free->arch.lpage_info[i]); free->arch.lpage_info[i] = NULL; @@ -6333,6 +6337,11 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; + slot->arch.rmap_pde[i] = + kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap_pde[i])); + if (!slot->arch.rmap_pde[i]) + goto out_free; + slot->arch.lpage_info[i] = kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); if (!slot->arch.lpage_info[i]) @@ -6361,7 +6370,9 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) out_free: for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + kvm_kvfree(slot->arch.rmap_pde[i]); kvm_kvfree(slot->arch.lpage_info[i]); + slot->arch.rmap_pde[i] = NULL; slot->arch.lpage_info[i] = NULL; } return -ENOMEM; -- cgit v1.1 From 048212d0bc0b1769a4bbecd7ace8c8d237577d1b Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:57:59 +0900 Subject: KVM: MMU: Add memslot parameter to hva handlers This is needed to push trace_kvm_age_page() into kvm_age_rmapp() in the following patch. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3b3f5ae..dfd7a9a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1200,7 +1200,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1218,7 +1218,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1265,6 +1265,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, unsigned long data, int (*handler)(struct kvm *kvm, unsigned long *rmapp, + struct kvm_memory_slot *slot, unsigned long data)) { int j; @@ -1299,7 +1300,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, unsigned long *rmapp; rmapp = __gfn_to_rmap(gfn, j, memslot); - ret |= handler(kvm, rmapp, data); + ret |= handler(kvm, rmapp, memslot, data); } trace_kvm_age_page(memslot->userspace_addr + (gfn - memslot->base_gfn) * PAGE_SIZE, @@ -1314,6 +1315,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data, int (*handler)(struct kvm *kvm, unsigned long *rmapp, + struct kvm_memory_slot *slot, unsigned long data)) { return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); @@ -1335,7 +1337,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) } static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { u64 *sptep; struct rmap_iterator uninitialized_var(iter); @@ -1350,7 +1352,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, * out actively used pages or breaking up actively used hugepages. 
*/ if (!shadow_accessed_mask) - return kvm_unmap_rmapp(kvm, rmapp, data); + return kvm_unmap_rmapp(kvm, rmapp, slot, data); for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { @@ -1367,7 +1369,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, } static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) + struct kvm_memory_slot *slot, unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1405,7 +1407,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); - kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); + kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0); kvm_flush_remote_tlbs(vcpu->kvm); } -- cgit v1.1 From f395302e09ef783b8f82d1160510a95aa8c66dbc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:58:48 +0900 Subject: KVM: MMU: Push trace_kvm_age_page() into kvm_age_rmapp() This restricts the tracing to page aging and makes it possible to optimize kvm_handle_hva_range() further in the following patch. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dfd7a9a..58adec3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1269,8 +1269,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, unsigned long data)) { int j; - int ret; - int retval = 0; + int ret = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1293,8 +1292,6 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); for (; gfn < gfn_end; ++gfn) { - ret = 0; - for (j = PT_PAGE_TABLE_LEVEL; j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { unsigned long *rmapp; @@ -1302,14 +1299,10 @@ static int kvm_handle_hva_range(struct kvm *kvm, rmapp = __gfn_to_rmap(gfn, j, memslot); ret |= handler(kvm, rmapp, memslot, data); } - trace_kvm_age_page(memslot->userspace_addr + - (gfn - memslot->base_gfn) * PAGE_SIZE, - memslot, ret); - retval |= ret; } } - return retval; + return ret; } static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, @@ -1351,8 +1344,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) - return kvm_unmap_rmapp(kvm, rmapp, slot, data); + if (!shadow_accessed_mask) { + young = kvm_unmap_rmapp(kvm, rmapp, slot, data); + goto out; + } for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { @@ -1364,7 +1359,9 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, (unsigned long *)sptep); } } - +out: + /* @data has hva passed to kvm_age_hva(). 
*/ + trace_kvm_age_page(data, slot, young); return young; } @@ -1413,7 +1410,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); + return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp); } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -- cgit v1.1 From bcd3ef58283a471d6b65855b83f78bd39eb55391 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:59:33 +0900 Subject: KVM: MMU: Avoid handling same rmap_pde in kvm_handle_hva_range() When we invalidate a THP page, we call the handler with the same rmap_pde argument 512 times in the following loop: for each guest page in the range for each level unmap using rmap This patch avoids these extra handler calls by changing the loop order like this: for each level for each rmap in the range unmap using rmap With the preceding patches in the patch series, this made THP page invalidation more than 5 times faster on our x86 host: the host became more responsive during swapping the guest's memory as a result. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 58adec3..a5d6ef7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1277,7 +1277,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, kvm_for_each_memslot(memslot, slots) { unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; + gfn_t gfn_start, gfn_end; hva_start = max(start, memslot->userspace_addr); hva_end = min(end, memslot->userspace_addr + @@ -1286,19 +1286,27 @@ static int kvm_handle_hva_range(struct kvm *kvm, continue; /* * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn, gfn+1, ..., gfn_end-1}. + * {gfn_start, gfn_start+1, ..., gfn_end-1}. */ - gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_start = hva_to_gfn_memslot(hva_start, memslot); gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - for (; gfn < gfn_end; ++gfn) { - for (j = PT_PAGE_TABLE_LEVEL; - j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { - unsigned long *rmapp; + for (j = PT_PAGE_TABLE_LEVEL; + j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { + unsigned long idx, idx_end; + unsigned long *rmapp; - rmapp = __gfn_to_rmap(gfn, j, memslot); - ret |= handler(kvm, rmapp, memslot, data); - } + /* + * {idx(page_j) | page_j intersects with + * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}. + */ + idx = gfn_to_index(gfn_start, memslot->base_gfn, j); + idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j); + + rmapp = __gfn_to_rmap(gfn_start, j, memslot); + + for (; idx <= idx_end; ++idx) + ret |= handler(kvm, rmapp++, memslot, data); } } -- cgit v1.1 From 2f5f6ad9390c1ebbf738d130dbfe80b60eaa167e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 8 Aug 2011 16:57:47 -0400 Subject: ftrace: Pass ftrace_ops as third parameter to function trace callback Currently the function trace callback receives only the ip and parent_ip of the function that it traced. It would be more powerful to also return the ops that registered the function as well. This allows the same function to act differently depending on what ftrace_ops registered it. 
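As a hedged example (the handler below is hypothetical; the prototype is defined by the ftrace core, not by this arch patch), a callback written against the extended signature can use the third parameter to identify its own registration:

	static void my_callback(unsigned long ip, unsigned long parent_ip,
				struct ftrace_ops *op)
	{
		/* op tells us which ftrace_ops registration fired, so one
		 * callback shared by several ftrace_ops can still tell them
		 * apart. */
		trace_printk("ip=%pS parent=%pS ops=%p\n",
			     (void *)ip, (void *)parent_ip, op);
	}

	static struct ftrace_ops my_callback_ops = {
		.func = my_callback,
	};

	/* registered as usual with register_ftrace_function(&my_callback_ops) */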
Link: http://lkml.kernel.org/r/20120612225424.267254552@goodmis.org Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 4 ++++ arch/x86/kernel/entry_64.S | 1 + 2 files changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index b0767bc..783b107 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -32,6 +32,10 @@ #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ +#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_X86_64) +#define ARCH_SUPPORTS_FTRACE_OPS 1 +#endif + #ifndef __ASSEMBLY__ extern void mcount(void); extern atomic_t modifying_ftrace_code; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7d65133..2b4f94c 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -79,6 +79,7 @@ ENTRY(ftrace_caller) MCOUNT_SAVE_FRAME + leaq function_trace_op, %rdx movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi subq $MCOUNT_INSN_SIZE, %rdi -- cgit v1.1 From 28fb5dfa783c25dbeeb25a72663f8066a3a517f5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 10 Aug 2011 22:00:55 -0400 Subject: ftrace/x86_32: Push ftrace_ops in as 3rd parameter to function tracer Add support of passing the current ftrace_ops into the 3rd parameter of the callback to the function tracer. Link: http://lkml.kernel.org/r/20120612225424.942411318@goodmis.org Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 2 +- arch/x86/kernel/entry_32.S | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 783b107..b3bb1f3 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -32,7 +32,7 @@ #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ -#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_X86_64) +#ifdef CONFIG_DYNAMIC_FTRACE #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 623f288..e3e17a0 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1111,6 +1111,7 @@ ENTRY(ftrace_caller) pushl %edx movl 0xc(%esp), %eax movl 0x4(%ebp), %edx + leal function_trace_op, %ecx subl $MCOUNT_INSN_SIZE, %eax .globl ftrace_call -- cgit v1.1 From 08f6fba503111e0336f2b4d6915a4a18f9b60e51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 30 Apr 2012 16:20:23 -0400 Subject: ftrace/x86: Add separate function to save regs Add a way to have different functions calling different trampolines. If a ftrace_ops wants regs saved on the return, then have only the functions with ops registered to save regs. Functions registered by other ops would not be affected, unless the functions overlap. If one ftrace_ops registered functions A, B and C and another ops registered fucntions to save regs on A, and D, then only functions A and D would be saving regs. Function B and C would work as normal. Although A is registered by both ops: normal and saves regs; this is fine as saving the regs is needed to satisfy one of the ops that calls it but the regs are ignored by the other ops function. x86_64 implements the full regs saving, and i386 just passes a NULL for regs to satisfy the ftrace_ops passing. Where an arch must supply both regs and ftrace_ops parameters, even if regs is just NULL. It is OK for an arch to pass NULL regs. 
All function trace users that require regs passing must add the flag FTRACE_OPS_FL_SAVE_REGS when registering the ftrace_ops. If the arch does not support saving regs then the ftrace_ops will fail to register. The flag FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED may be set that will prevent the ftrace_ops from failing to register. In this case, the handler may either check if regs is not NULL or check if ARCH_SUPPORTS_FTRACE_SAVE_REGS. If the arch supports passing regs it will set this macro and pass regs for ops that request them. All other archs will just pass NULL. Link: Link: http://lkml.kernel.org/r/20120711195745.107705970@goodmis.org Cc: Alexander van Heukelum Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 47 +++++++++++++--------- arch/x86/kernel/entry_32.S | 4 +- arch/x86/kernel/entry_64.S | 94 +++++++++++++++++++++++++++++++++++++++---- arch/x86/kernel/ftrace.c | 77 +++++++++++++++++++++++++++++++++-- 4 files changed, 190 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index b3bb1f3..a847501 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -3,27 +3,33 @@ #ifdef __ASSEMBLY__ - .macro MCOUNT_SAVE_FRAME - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + /* skip is set if the stack was already partially adjusted */ + .macro MCOUNT_SAVE_FRAME skip=0 + /* + * We add enough stack to save all regs. + */ + subq $(SS+8-\skip), %rsp + movq %rax, RAX(%rsp) + movq %rcx, RCX(%rsp) + movq %rdx, RDX(%rsp) + movq %rsi, RSI(%rsp) + movq %rdi, RDI(%rsp) + movq %r8, R8(%rsp) + movq %r9, R9(%rsp) + /* Move RIP to its proper location */ + movq SS+8(%rsp), %rdx + movq %rdx, RIP(%rsp) .endm - .macro MCOUNT_RESTORE_FRAME - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + .macro MCOUNT_RESTORE_FRAME skip=0 + movq R9(%rsp), %r9 + movq R8(%rsp), %r8 + movq RDI(%rsp), %rdi + movq RSI(%rsp), %rsi + movq RDX(%rsp), %rdx + movq RCX(%rsp), %rcx + movq RAX(%rsp), %rax + addq $(SS+8-\skip), %rsp .endm #endif @@ -34,6 +40,9 @@ #ifdef CONFIG_DYNAMIC_FTRACE #define ARCH_SUPPORTS_FTRACE_OPS 1 +#ifdef CONFIG_X86_64 +#define ARCH_SUPPORTS_FTRACE_SAVE_REGS +#endif #endif #ifndef __ASSEMBLY__ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index e3e17a0..5da11d1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1109,7 +1109,8 @@ ENTRY(ftrace_caller) pushl %eax pushl %ecx pushl %edx - movl 0xc(%esp), %eax + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax movl 0x4(%ebp), %edx leal function_trace_op, %ecx subl $MCOUNT_INSN_SIZE, %eax @@ -1118,6 +1119,7 @@ ENTRY(ftrace_caller) ftrace_call: call ftrace_stub + addl $4,%esp /* skip NULL pointer */ popl %edx popl %ecx popl %eax diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 2b4f94c..52bda2e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -73,21 +73,34 @@ ENTRY(mcount) retq END(mcount) +/* skip is set if stack has been adjusted */ +.macro ftrace_caller_setup skip=0 + MCOUNT_SAVE_FRAME \skip + + /* Load the ftrace_ops into the 3rd parameter */ + leaq function_trace_op, %rdx + + /* Load ip into the first parameter */ + movq RIP(%rsp), %rdi + subq 
$MCOUNT_INSN_SIZE, %rdi + /* Load the parent_ip into the second parameter */ + movq 8(%rbp), %rsi +.endm + ENTRY(ftrace_caller) + /* Check if tracing was disabled (quick check) */ cmpl $0, function_trace_stop jne ftrace_stub - MCOUNT_SAVE_FRAME - - leaq function_trace_op, %rdx - movq 0x38(%rsp), %rdi - movq 8(%rbp), %rsi - subq $MCOUNT_INSN_SIZE, %rdi + ftrace_caller_setup + /* regs go into 4th parameter (but make it NULL) */ + movq $0, %rcx GLOBAL(ftrace_call) call ftrace_stub MCOUNT_RESTORE_FRAME +ftrace_return: #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -98,6 +111,71 @@ GLOBAL(ftrace_stub) retq END(ftrace_caller) +ENTRY(ftrace_regs_caller) + /* Save the current flags before compare (in SS location)*/ + pushfq + + /* Check if tracing was disabled (quick check) */ + cmpl $0, function_trace_stop + jne ftrace_restore_flags + + /* skip=8 to skip flags saved in SS */ + ftrace_caller_setup 8 + + /* Save the rest of pt_regs */ + movq %r15, R15(%rsp) + movq %r14, R14(%rsp) + movq %r13, R13(%rsp) + movq %r12, R12(%rsp) + movq %r11, R11(%rsp) + movq %r10, R10(%rsp) + movq %rbp, RBP(%rsp) + movq %rbx, RBX(%rsp) + /* Copy saved flags */ + movq SS(%rsp), %rcx + movq %rcx, EFLAGS(%rsp) + /* Kernel segments */ + movq $__KERNEL_DS, %rcx + movq %rcx, SS(%rsp) + movq $__KERNEL_CS, %rcx + movq %rcx, CS(%rsp) + /* Stack - skipping return address */ + leaq SS+16(%rsp), %rcx + movq %rcx, RSP(%rsp) + + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + /* Copy flags back to SS, to restore them */ + movq EFLAGS(%rsp), %rax + movq %rax, SS(%rsp) + + /* restore the rest of pt_regs */ + movq R15(%rsp), %r15 + movq R14(%rsp), %r14 + movq R13(%rsp), %r13 + movq R12(%rsp), %r12 + movq R10(%rsp), %r10 + movq RBP(%rsp), %rbp + movq RBX(%rsp), %rbx + + /* skip=8 to skip flags saved in SS */ + MCOUNT_RESTORE_FRAME 8 + + /* Restore flags */ + popfq + + jmp ftrace_return +ftrace_restore_flags: + popfq + jmp ftrace_stub + +END(ftrace_regs_caller) + + #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) cmpl $0, function_trace_stop @@ -120,7 +198,7 @@ GLOBAL(ftrace_stub) trace: MCOUNT_SAVE_FRAME - movq 0x38(%rsp), %rdi + movq RIP(%rsp), %rdi movq 8(%rbp), %rsi subq $MCOUNT_INSN_SIZE, %rdi @@ -141,7 +219,7 @@ ENTRY(ftrace_graph_caller) MCOUNT_SAVE_FRAME leaq 8(%rbp), %rdi - movq 0x38(%rsp), %rsi + movq RIP(%rsp), %rsi movq (%rbp), %rdx subq $MCOUNT_INSN_SIZE, %rsi diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c3a7cb4..b90eb1a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -206,6 +206,23 @@ static int ftrace_modify_code(unsigned long ip, unsigned const char *old_code, unsigned const char *new_code); +#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +/* + * Should never be called: + * As it is only called by __ftrace_replace_code() which is called by + * ftrace_replace_code() that x86 overrides, and by ftrace_update_code() + * which is called to turn mcount into nops or nops into function calls + * but not to convert a function from not using regs to one that uses + * regs, which ftrace_modify_call() is for. 
+ */ +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + WARN_ON(1); + return -EINVAL; +} +#endif + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); @@ -220,6 +237,16 @@ int ftrace_update_ftrace_func(ftrace_func_t func) ret = ftrace_modify_code(ip, old, new); +#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS + /* Also update the regs callback function */ + if (!ret) { + ip = (unsigned long)(&ftrace_regs_call); + memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + } +#endif + atomic_dec(&modifying_ftrace_code); return ret; @@ -299,6 +326,32 @@ static int add_brk_on_nop(struct dyn_ftrace *rec) return add_break(rec->ip, old); } +/* + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + */ +static unsigned long get_ftrace_addr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +/* + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + */ +static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS_EN) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + static int add_breakpoints(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_addr; @@ -306,7 +359,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable) ret = ftrace_test_record(rec, enable); - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: @@ -316,6 +369,10 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable) /* converting nop to call */ return add_brk_on_nop(rec); + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: + ftrace_addr = get_ftrace_old_addr(rec); + /* fall through */ case FTRACE_UPDATE_MAKE_NOP: /* converting a call to a nop */ return add_brk_on_call(rec, ftrace_addr); @@ -360,13 +417,21 @@ static int remove_breakpoint(struct dyn_ftrace *rec) * If not, don't touch the breakpoint, we make just create * a disaster. 
*/ - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); + nop = ftrace_call_replace(ip, ftrace_addr); + + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0) + goto update; + + /* Check both ftrace_addr and ftrace_old_addr */ + ftrace_addr = get_ftrace_old_addr(rec); nop = ftrace_call_replace(ip, ftrace_addr); if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) return -EINVAL; } + update: return probe_kernel_write((void *)ip, &nop[0], 1); } @@ -405,12 +470,14 @@ static int add_update(struct dyn_ftrace *rec, int enable) ret = ftrace_test_record(rec, enable); - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ return add_update_call(rec, ftrace_addr); @@ -455,12 +522,14 @@ static int finish_update(struct dyn_ftrace *rec, int enable) ret = ftrace_update_record(rec, enable); - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ return finish_update_call(rec, ftrace_addr); -- cgit v1.1 From 4de72395ff4cf48e23b61986dbc90b99a7c4ed97 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jun 2012 20:00:11 -0400 Subject: ftrace/x86: Add save_regs for i386 function calls Add saving full regs for function tracing on i386. The saving of regs was influenced by patches sent out by Masami Hiramatsu. Link: Link: http://lkml.kernel.org/r/20120711195745.379060003@goodmis.org Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 2 -- arch/x86/kernel/entry_32.S | 68 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ftrace.c | 4 --- 3 files changed, 68 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index a847501..a6cae0c 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -40,10 +40,8 @@ #ifdef CONFIG_DYNAMIC_FTRACE #define ARCH_SUPPORTS_FTRACE_OPS 1 -#ifdef CONFIG_X86_64 #define ARCH_SUPPORTS_FTRACE_SAVE_REGS #endif -#endif #ifndef __ASSEMBLY__ extern void mcount(void); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 5da11d1..46caa56 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1123,6 +1123,7 @@ ftrace_call: popl %edx popl %ecx popl %eax +ftrace_ret: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call ftrace_graph_call: @@ -1134,6 +1135,73 @@ ftrace_stub: ret END(ftrace_caller) +ENTRY(ftrace_regs_caller) + pushf /* push flags before compare (in cs location) */ + cmpl $0, function_trace_stop + jne ftrace_restore_flags + + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, ®s->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. We move the return ip into the + * ip location, and move flags into the return ip location. 
+ */ + pushl 4(%esp) /* save return ip into ip slot */ + subl $MCOUNT_INSN_SIZE, (%esp) /* Adjust ip */ + + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 13*4(%esp), %eax /* Get the saved flags */ + movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ + /* clobbering return ip */ + movl $__KERNEL_CS,13*4(%esp) + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + movl 0x4(%ebp), %edx /* Load parent ip (2cd parameter) */ + lea (%esp), %ecx + pushl %ecx /* Save pt_regs as 4th parameter */ + leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + movl 14*4(%esp), %eax /* Move flags back into cs */ + movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ + movl 12*4(%esp), %eax /* Get return ip from regs->ip */ + addl $MCOUNT_INSN_SIZE, %eax + movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + addl $8, %esp /* Skip orig_ax and ip */ + popf /* Pop flags at end (no addl to corrupt flags) */ + jmp ftrace_ret + +ftrace_restore_flags: + popf + jmp ftrace_stub #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b90eb1a..1d41402 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -206,7 +206,6 @@ static int ftrace_modify_code(unsigned long ip, unsigned const char *old_code, unsigned const char *new_code); -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS /* * Should never be called: * As it is only called by __ftrace_replace_code() which is called by @@ -221,7 +220,6 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, WARN_ON(1); return -EINVAL; } -#endif int ftrace_update_ftrace_func(ftrace_func_t func) { @@ -237,7 +235,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) ret = ftrace_modify_code(ip, old, new); -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS /* Also update the regs callback function */ if (!ret) { ip = (unsigned long)(&ftrace_regs_call); @@ -245,7 +242,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) new = ftrace_call_replace(ip, (unsigned long)func); ret = ftrace_modify_code(ip, old, new); } -#endif atomic_dec(&modifying_ftrace_code); -- cgit v1.1 From 9d3c92af47d853d4e31ee971dba7bc086275b7b3 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:50:48 +0800 Subject: KVM: x86: remove unnecessary mark_page_dirty fix: [ 132.474633] 3.5.0-rc1+ #50 Not tainted [ 132.474634] ------------------------------- [ 132.474635] include/linux/kvm_host.h:369 suspicious rcu_dereference_check() usage! 
[ 132.474636] [ 132.474636] other info that might help us debug this: [ 132.474636] [ 132.474638] [ 132.474638] rcu_scheduler_active = 1, debug_locks = 1 [ 132.474640] 1 lock held by qemu-kvm/2832: [ 132.474657] #0: (&vcpu->mutex){+.+.+.}, at: [] vcpu_load+0x1e/0x91 [kvm] [ 132.474658] [ 132.474658] stack backtrace: [ 132.474660] Pid: 2832, comm: qemu-kvm Not tainted 3.5.0-rc1+ #50 [ 132.474661] Call Trace: [ 132.474665] [] lockdep_rcu_suspicious+0xfc/0x105 [ 132.474675] [] kvm_memslots+0x6d/0x75 [kvm] [ 132.474683] [] gfn_to_memslot+0x14/0x4c [kvm] [ 132.474693] [] mark_page_dirty+0x17/0x2a [kvm] [ 132.474706] [] kvm_arch_vcpu_ioctl+0xbcf/0xc07 [kvm] Actually, we do not write vcpu->arch.time at this time, mark_page_dirty should be removed. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 829b4e9..ecc71dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2632,7 +2632,6 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) if (!vcpu->arch.time_page) return -EINVAL; src->flags |= PVCLOCK_GUEST_STOPPED; - mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT); kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return 0; } -- cgit v1.1 From 86fde74cf5b829627b37ca86322acfdd99b524b8 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:52:52 +0800 Subject: KVM: MMU: track the refcount when unmap the page It will trigger a WARN_ON if the page has been freed but it is still used in mmu, it can help us to detect mm bug early Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a5d6ef7..685a485 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep) return 0; pfn = spte_to_pfn(old_spte); + + /* + * KVM does not hold the refcount of the page used by + * kvm mmu, before reclaiming the page, we should + * unmap it from mmu first. 
+ */ + WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn))); + if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) -- cgit v1.1 From 903816fa4d016e20ec71a1a97700cfcdda115580 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:54:11 +0800 Subject: KVM: using get_fault_pfn to get the fault pfn Using get_fault_pfn to cleanup the code Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 685a485..f85cc21 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2513,10 +2513,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, unsigned long hva; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); - if (!slot) { - get_page(fault_page); - return page_to_pfn(fault_page); - } + if (!slot) + return get_fault_pfn(); hva = gfn_to_hva_memslot(slot, gfn); -- cgit v1.1 From d566104853361cc377c61f70e41c1ad3d44b86c6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:56:16 +0800 Subject: KVM: remove the unused parameter of gfn_to_pfn_memslot The parameter, 'kvm', is not used in gfn_to_pfn_memslot, we can happily remove it Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f85cc21..4f77f7a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2518,7 +2518,7 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, hva = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn_atomic(vcpu->kvm, hva); + return hva_to_pfn_atomic(hva); } static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, -- cgit v1.1 From 0fa060714753ef65aef294b3462efb0212520933 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:16:19 +0800 Subject: KVM: VMX: Fix typos Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c39b607..2300e53 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1343,7 +1343,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) guest_efer = vmx->vcpu.arch.efer; /* - * NX is emulated; LMA and LME handled by hardware; SCE meaninless + * NX is emulated; LMA and LME handled by hardware; SCE meaningless * outside long mode */ ignore_bits = EFER_NX | EFER_SCE; @@ -3261,7 +3261,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, * qemu binaries. * IA32 arch specifies that at the time of processor reset the * "Accessed" bit in the AR field of segment registers is 1. And qemu - * is setting it to 0 in the usedland code. This causes invalid guest + * is setting it to 0 in the userland code. This causes invalid guest * state vmexit when "unrestricted guest" mode is turned on. * Fix for this setup issue in cpu_reset is being pushed in the qemu * tree. Newer qemu binaries with that qemu fix would not need this @@ -4446,7 +4446,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -/* called to set cr0 as approriate for a mov-to-cr0 exit. */ +/* called to set cr0 as appropriate for a mov-to-cr0 exit. 
*/ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { if (to_vmx(vcpu)->nested.vmxon && -- cgit v1.1 From c5ec2e56d0a654797ee43a31001738ccd05eef0b Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:16:43 +0800 Subject: KVM: SVM: Fix typos Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index baead95..687d0c3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2063,7 +2063,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) if (svm->nested.intercept & 1ULL) { /* * The #vmexit can't be emulated here directly because this - * code path runs with irqs and preemtion disabled. A + * code path runs with irqs and preemption disabled. A * #vmexit emulation might sleep. Only signal request for * the #vmexit here. */ @@ -2409,7 +2409,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { /* * This function merges the msr permission bitmaps of kvm and the - * nested vmcb. It is omptimized in that it only merges the parts where + * nested vmcb. It is optimized in that it only merges the parts where * the kvm msr permission bitmap may contain zero bits */ int i; -- cgit v1.1 From 4a9699807c491740c4dfe7b6a06703e1d262e802 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:17:27 +0800 Subject: KVM: x86: Fix typos in x86.c Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ecc71dd..3d9d08ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1093,7 +1093,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) * For each generation, we track the original measured * nanosecond time, offset, and write, so if TSCs are in * sync, we can match exact offset, and if not, we can match - * exact software computaion in compute_guest_tsc() + * exact software computation in compute_guest_tsc() * * These values are tracked in kvm->arch.cur_xxx variables. */ @@ -1500,7 +1500,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 2:5 are resrved, Should be zero */ + /* Bits 2:5 are reserved, Should be zero */ if (data & 0x3c) return 1; @@ -1723,7 +1723,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) * Ignore all writes to this no longer documented MSR. * Writes are only relevant for old K7 processors, * all pre-dating SVM, but a recommended workaround from - * AMD for these chips. It is possible to speicify the + * AMD for these chips. It is possible to specify the * affected processor models on the command line, hence * the need to ignore the workaround. */ @@ -4491,7 +4491,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) /* * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-entetr the + * and it failed try to unshadow page and re-enter the * guest to let CPU execute the instruction. */ if (kvm_mmu_unprotect_page_virt(vcpu, gva)) @@ -5587,7 +5587,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) /* * We are here if userspace calls get_regs() in the middle of * instruction emulation. Registers state needs to be copied - * back from emulation context to vcpu. Usrapace shouldn't do + * back from emulation context to vcpu. 
Userspace shouldn't do * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ @@ -6116,7 +6116,7 @@ int kvm_arch_hardware_enable(void *garbage) * as we reset last_host_tsc on all VCPUs to stop this from being * called multiple times (one for each physical CPU bringup). * - * Platforms with unnreliable TSCs don't have to deal with this, they + * Platforms with unreliable TSCs don't have to deal with this, they * will be compensated by the logic in vcpu_load, which sets the TSC to * catchup mode. This will catchup all VCPUs to real time, but cannot * guarantee that they stay in perfect synchronization. @@ -6391,7 +6391,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, map_flags = MAP_SHARED | MAP_ANONYMOUS; /*To keep backward compatibility with older userspace, - *x86 needs to hanlde !user_alloc case. + *x86 needs to handle !user_alloc case. */ if (!user_alloc) { if (npages && !old.rmap) { -- cgit v1.1 From fc0586807dc4e307da6d3ba4ed5c927b6d27276c Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:19:51 +0800 Subject: KVM: x86: Fix typos in emulate.c Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 97d9a99..85b611e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -642,7 +642,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) goto bad; } else { - /* exapand-down segment */ + /* expand-down segment */ if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) goto bad; lim = desc.d ? 0xffffffff : 0xffff; @@ -1383,7 +1383,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, err_code = selector & 0xfffc; err_vec = GP_VECTOR; - /* can't load system descriptor into segment selecor */ + /* can't load system descriptor into segment selector */ if (seg <= VCPU_SREG_GS && !seg_desc.s) goto exception; @@ -2398,7 +2398,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); /* - * Now load segment descriptors. If fault happenes at this stage + * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); @@ -2640,7 +2640,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, * * 1. jmp/call/int to task gate: Check against DPL of the task gate * 2. Exception/IRQ/iret: No check is performed - * 3. jmp/call to TSS: Check agains DPL of the TSS + * 3. 
jmp/call to TSS: Check against DPL of the TSS */ if (reason == TASK_SWITCH_GATE) { if (idt_index != -1) { @@ -2681,7 +2681,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; /* set back link to prev task only if NT bit is set in eflags - note that old_tss_sel is not used afetr this point */ + note that old_tss_sel is not used after this point */ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) old_tss_sel = 0xffff; -- cgit v1.1 From bbbda79510b6e5d399fae76bdb9b999286eb1b59 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:20:58 +0800 Subject: KVM: x86: Fix typos in cpuid.c Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0595f13..b496da6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } case 7: { entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* Mask ebx against host capbability word 9 */ + /* Mask ebx against host capability word 9 */ if (index == 0) { entry->ebx &= kvm_supported_word9_x86_features; cpuid_mask(&entry->ebx, 9); -- cgit v1.1 From d5b0b5b196b474ef26f46f2036c8fbaa8cca2cf5 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:22:57 +0800 Subject: KVM: x86: Fix typos in lapic.c Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ce87878..fff7173 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -719,7 +719,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, { unsigned char alignment = offset & 0xf; u32 result; - /* this bitmask has a bit cleared for each reserver register */ + /* this bitmask has a bit cleared for each reserved register */ static const u64 rmask = 0x43ff01ffffffe70cULL; if ((alignment + len) > 4) { @@ -792,7 +792,7 @@ static void start_apic_timer(struct kvm_lapic *apic) atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - /* lapic timer in oneshot or peroidic mode */ + /* lapic timer in oneshot or periodic mode */ now = apic->lapic_timer.timer.base->get_time(); apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; -- cgit v1.1 From c7a7062fa00db7dc66280a72cd9dad0f3595bc66 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Thu, 28 Jun 2012 15:23:08 +0800 Subject: KVM: x86: Fix typos in pmu.c Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 2e88438..db206a4 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -1,5 +1,5 @@ /* - * Kernel-based Virtual Machine -- Performane Monitoring Unit support + * Kernel-based Virtual Machine -- Performance Monitoring Unit support * * Copyright 2011 Red Hat, Inc. and/or its affiliates. * -- cgit v1.1 From 93b6547e2219784b2df790353e083e0bdbebd2c2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 19 Jul 2012 13:55:53 +0300 Subject: KVM: switch to symbolic name for irq_states size Use PIC_NUM_PINS instead of hard-coded 16 for pic pins. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2086f2b..2d03568 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -70,7 +70,7 @@ struct kvm_pic { struct kvm_io_device dev_slave; struct kvm_io_device dev_eclr; void (*ack_notifier)(void *opaque, int irq); - unsigned long irq_states[16]; + unsigned long irq_states[PIC_NUM_PINS]; }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); -- cgit v1.1 From f2a743473194a1ad44a85f8b63aeef9d63e5bf47 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 18 Jul 2012 19:07:32 +0530 Subject: KVM: Add config to support ple or cpu relax optimzation Suggested-by: Avi Kivity Signed-off-by: Raghavendra K T Reviewed-by: Marcelo Tosatti Reviewed-by: Rik van Riel Tested-by: Christian Borntraeger # on s390x Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a28f338..45c044f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -37,6 +37,7 @@ config KVM select TASK_DELAY_ACCT select PERF_EVENTS select HAVE_KVM_MSI + select HAVE_KVM_CPU_RELAX_INTERCEPT ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent -- cgit v1.1 From 3b2bd2f800ba9488d9ad493216a0c07d71055b56 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 26 Jul 2012 11:57:43 +0800 Subject: KVM: MMU: use kvm_release_pfn_clean to release pfn The current code depends on the fact that fault_page is the normal page, however, we will use the error code instead of these dummy pages in the later patch, so we use kvm_release_pfn_clean to release pfn which will release the error code properly Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2419934..a9a2052 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3275,7 +3275,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!async) return false; /* *pfn has correct page already */ - put_page(pfn_to_page(*pfn)); + kvm_release_pfn_clean(*pfn); if (!prefault && can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(gva, gfn); -- cgit v1.1 From f23b070e662ca55a8fdaaa28537af06cab664499 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 26 Jul 2012 13:12:22 +0800 Subject: KVM: x86 emulator: simplify read_emulated No need split mmio read region into 8-bits pieces since we do it in emulator_read_write_onepage Changelog: Add a WARN_ON to check read-cache overflow Acked-by: Marcelo Tosatti Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 85b611e..2c5d1e6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1166,24 +1166,21 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, int rc; struct read_cache *mc = &ctxt->mem_read; - while (size) { - int n = min(size, 8u); - size -= n; - if (mc->pos < mc->end) - goto read_cached; - - rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n, - &ctxt->exception); - if (rc != X86EMUL_CONTINUE) - return rc; - mc->end += n; + 
if (mc->pos < mc->end) + goto read_cached; - read_cached: - memcpy(dest, mc->data + mc->pos, n); - mc->pos += n; - dest += n; - addr += n; - } + WARN_ON((mc->end + size) >= sizeof(mc->data)); + + rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size, + &ctxt->exception); + if (rc != X86EMUL_CONTINUE) + return rc; + + mc->end += size; + +read_cached: + memcpy(dest, mc->data + mc->pos, size); + mc->pos += size; return X86EMUL_CONTINUE; } -- cgit v1.1 From 99245b507dc3b1b2815d6a6cb4e94a6b7018a24b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 25 Jul 2012 15:49:42 +0300 Subject: KVM: x86 emulator: drop unneeded call to get_segment() setup_syscalls_segments() calls get_segment() and than overwrites all but one of the structure fields and this one should also be overwritten anyway, so we can drop call to get_segment() and avoid a couple of vmreads on vmx. Also drop zeroing ss/cs structures since most of the fields are set anyway. Just set those that were not set explicitly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2c5d1e6..10f0136 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2035,12 +2035,6 @@ static void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct desc_struct *cs, struct desc_struct *ss) { - u16 selector; - - memset(cs, 0, sizeof(struct desc_struct)); - ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); - memset(ss, 0, sizeof(struct desc_struct)); - cs->l = 0; /* will be adjusted later */ set_desc_base(cs, 0); /* flat segment */ cs->g = 1; /* 4kb granularity */ @@ -2050,6 +2044,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, cs->dpl = 0; /* will be adjusted later */ cs->p = 1; cs->d = 1; + cs->avl = 0; set_desc_base(ss, 0); /* flat segment */ set_desc_limit(ss, 0xfffff); /* 4GB limit */ @@ -2059,6 +2054,8 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, ss->d = 1; /* 32bit stack segment */ ss->dpl = 0; ss->p = 1; + ss->l = 0; + ss->avl = 0; } static bool vendor_intel(struct x86_emulate_ctxt *ctxt) -- cgit v1.1 From 23d43cf998275bc97437931c0cdee1df2c1aa3ca Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 24 Jul 2012 08:51:20 -0400 Subject: KVM: Move KVM_IRQ_LINE to arch-generic code Handle KVM_IRQ_LINE and KVM_IRQ_LINE_STATUS in the generic kvm_vm_ioctl() function and call into kvm_vm_ioctl_irq_line(). This is even more relevant when KVM/ARM also uses this ioctl. 
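As a rough illustration of the userspace side of this interface (not part of the patch; vm_fd and the GSI number below are placeholders), the ioctl is driven like this:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative sketch only: pulse a guest interrupt line through
 * KVM_IRQ_LINE.  An in-kernel irqchip (KVM_CREATE_IRQCHIP) must exist,
 * otherwise the handler added by this patch returns -ENXIO. */
static int pulse_irq(int vm_fd, unsigned int gsi)
{
	struct kvm_irq_level irq = { .irq = gsi, .level = 1 };

	if (ioctl(vm_fd, KVM_IRQ_LINE, &irq) < 0)
		return -1;

	irq.level = 0;
	return ioctl(vm_fd, KVM_IRQ_LINE, &irq);
}

KVM_IRQ_LINE_STATUS is used the same way, but additionally reports the delivery status back through the status field of struct kvm_irq_level.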
Signed-off-by: Christoffer Dall Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d9d08ed..b6379e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3165,6 +3165,16 @@ out: return r; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3271,29 +3281,6 @@ long kvm_arch_vm_ioctl(struct file *filp, create_pit_unlock: mutex_unlock(&kvm->slots_lock); break; - case KVM_IRQ_LINE_STATUS: - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - r = -ENXIO; - if (irqchip_in_kernel(kvm)) { - __s32 status; - status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); - if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; - irq_event.status = status; - if (copy_to_user(argp, &irq_event, - sizeof irq_event)) - goto out; - } - r = 0; - } - break; - } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip *chip; -- cgit v1.1 From 7d0642b93780a7309d2954de6f6126d6ceb526f0 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 11 Jul 2012 15:03:18 -0400 Subject: xen/perf: Define .glob for the different hypercalls. This allows us in perf to have this: 99.67% [kernel] [k] xen_hypercall_sched_op 0.11% [kernel] [k] xen_hypercall_xen_version instead of the borring ever-encompassing: 99.13% [kernel] [k] hypercall_page [v2: Use a macro to define the name and skip] [v3: Use balign per Jan's suggestion] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/xen-head.S | 56 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index aaa7291..7faed58 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -28,9 +28,61 @@ ENTRY(startup_xen) __FINIT .pushsection .text - .align PAGE_SIZE + .balign PAGE_SIZE ENTRY(hypercall_page) - .skip PAGE_SIZE +#define NEXT_HYPERCALL(x) \ + ENTRY(xen_hypercall_##x) \ + .skip 32 + +NEXT_HYPERCALL(set_trap_table) +NEXT_HYPERCALL(mmu_update) +NEXT_HYPERCALL(set_gdt) +NEXT_HYPERCALL(stack_switch) +NEXT_HYPERCALL(set_callbacks) +NEXT_HYPERCALL(fpu_taskswitch) +NEXT_HYPERCALL(sched_op_compat) +NEXT_HYPERCALL(platform_op) +NEXT_HYPERCALL(set_debugreg) +NEXT_HYPERCALL(get_debugreg) +NEXT_HYPERCALL(update_descriptor) +NEXT_HYPERCALL(ni) +NEXT_HYPERCALL(memory_op) +NEXT_HYPERCALL(multicall) +NEXT_HYPERCALL(update_va_mapping) +NEXT_HYPERCALL(set_timer_op) +NEXT_HYPERCALL(event_channel_op_compat) +NEXT_HYPERCALL(xen_version) +NEXT_HYPERCALL(console_io) +NEXT_HYPERCALL(physdev_op_compat) +NEXT_HYPERCALL(grant_table_op) +NEXT_HYPERCALL(vm_assist) +NEXT_HYPERCALL(update_va_mapping_otherdomain) +NEXT_HYPERCALL(iret) +NEXT_HYPERCALL(vcpu_op) +NEXT_HYPERCALL(set_segment_base) +NEXT_HYPERCALL(mmuext_op) +NEXT_HYPERCALL(xsm_op) +NEXT_HYPERCALL(nmi_op) +NEXT_HYPERCALL(sched_op) +NEXT_HYPERCALL(callback_op) +NEXT_HYPERCALL(xenoprof_op) +NEXT_HYPERCALL(event_channel_op) +NEXT_HYPERCALL(physdev_op) +NEXT_HYPERCALL(hvm_op) +NEXT_HYPERCALL(sysctl) 
+NEXT_HYPERCALL(domctl) +NEXT_HYPERCALL(kexec_op) +NEXT_HYPERCALL(tmem_op) /* 38 */ +ENTRY(xen_hypercall_rsvr) + .skip 320 +NEXT_HYPERCALL(mca) /* 48 */ +NEXT_HYPERCALL(arch_1) +NEXT_HYPERCALL(arch_2) +NEXT_HYPERCALL(arch_3) +NEXT_HYPERCALL(arch_4) +NEXT_HYPERCALL(arch_5) +NEXT_HYPERCALL(arch_6) + .balign PAGE_SIZE .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") -- cgit v1.1 From 4a4541a40e1fe145c72c4b959fac524a5600d9fb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 22 Jul 2012 17:41:00 +0300 Subject: KVM: Don't update PPR on any APIC read The current code will update the PPR on almost any APIC read; however that's only required if we read the PPR. kvm_update_ppr() shows up in some profiles, albeit with a low usage (~1%). This should reduce it further (it will still be called during interrupt processing). Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fff7173..ad7fff7 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -696,12 +696,14 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) val = apic_get_tmcct(apic); break; - + case APIC_PROCPRI: + apic_update_ppr(apic); + val = apic_get_reg(apic, offset); + break; case APIC_TASKPRI: report_tpr_access(apic, false); /* fall thru */ default: - apic_update_ppr(apic); val = apic_get_reg(apic, offset); break; } -- cgit v1.1 From e4ea3f6b1bf3d489674a3660db652636e50186f9 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 19 Jul 2012 13:04:47 -0400 Subject: ftrace/x86_32: Simplify parameter setup for ftrace_regs_caller The final position of the stack after saving regs and setting up the parameters for ftrace_regs_call, is the position of the pt_regs needed for the 4th parameter. Instead of saving it into a temporary reg and pushing the reg, simply push the stack pointer. Link: http://lkml.kernel.org/r/1342702344.12353.16.camel@gandalf.stny.rr.com Reviewed-by: Masami Hiramatsu Signed-off-by: Uros Bizjak Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_32.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 46caa56..4dc3017 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1169,10 +1169,9 @@ ENTRY(ftrace_regs_caller) movl $__KERNEL_CS,13*4(%esp) movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - movl 0x4(%ebp), %edx /* Load parent ip (2cd parameter) */ - lea (%esp), %ecx - pushl %ecx /* Save pt_regs as 4th parameter */ + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ GLOBAL(ftrace_regs_call) call ftrace_stub -- cgit v1.1 From 5767cfeaa9ec7b67c802143394f3ad9f8b174eb8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Jul 2012 16:16:09 -0400 Subject: ftrace/x86: Remove function_trace_stop check from graph caller The graph caller is called by the mcount callers, which already does the check against the function_trace_stop variable. No reason to check it again. 
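In C-like terms the redundancy can be modeled roughly as follows; the function names are illustrative stand-ins for the assembly stubs in entry_32.S/entry_64.S, not real kernel symbols:

/* Toy model, for illustration only: the stop flag is already tested on
 * the single path that can reach the graph caller, so a second test
 * inside the graph caller can never change the outcome. */
static int function_trace_stop;

static void graph_caller_model(void)
{
	/* the duplicated function_trace_stop test removed by this patch
	 * would sit here, but this point is unreachable when the flag
	 * is set */
}

static void mcount_caller_model(void)
{
	if (function_trace_stop)	/* single shared check */
		return;
	graph_caller_model();
}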
Link: http://lkml.kernel.org/r/20120711195745.588538769@goodmis.org Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_32.S | 3 --- arch/x86/kernel/entry_64.S | 3 --- 2 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 4dc3017..061ac17 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1241,9 +1241,6 @@ END(mcount) #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) - cmpl $0, function_trace_stop - jne ftrace_stub - pushl %eax pushl %ecx pushl %edx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 52bda2e..38308fa 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -213,9 +213,6 @@ END(mcount) #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) - cmpl $0, function_trace_stop - jne ftrace_stub - MCOUNT_SAVE_FRAME leaq 8(%rbp), %rdi -- cgit v1.1 From e52538965119319447c0800c534da73142c27be2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 19:28:38 +0900 Subject: kprobes/x86: ftrace based optimization for x86 Add function tracer based kprobe optimization support handlers on x86. This allows kprobes to use function tracer for probing on mcount call. Link: http://lkml.kernel.org/r/20120605102838.27845.26317.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu [ Updated to new port of ftrace save regs functions ] Signed-off-by: Steven Rostedt --- arch/x86/include/asm/kprobes.h | 1 + arch/x86/kernel/kprobes.c | 48 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 5478825..d3ddd17 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -27,6 +27,7 @@ #include #define __ARCH_WANT_KPROBES_INSN_SLOT +#define ARCH_SUPPORTS_KPROBES_ON_FTRACE struct pt_regs; struct kprobe; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e2f751e..47ae102 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1052,6 +1052,54 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } +#ifdef KPROBES_CAN_USE_FTRACE +/* Ftrace callback handler for kprobes */ +void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + struct kprobe *p; + struct kprobe_ctlblk *kcb; + unsigned long flags; + + /* Disable irq for emulating a breakpoint and avoiding preempt */ + local_irq_save(flags); + + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + goto end; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + regs->ip += sizeof(kprobe_opcode_t); + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (p->pre_handler) + p->pre_handler(p, regs); + + if (unlikely(p->post_handler)) { + /* Emulate singlestep as if there is a 5byte nop */ + regs->ip = ip + MCOUNT_INSN_SIZE; + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + __this_cpu_write(current_kprobe, NULL); + regs->ip = ip; /* Recover for next callback */ + } +end: + local_irq_restore(flags); +} + +int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + 
p->ainsn.boostable = -1; + return 0; +} +#endif + int __init arch_init_kprobes(void) { return arch_init_optprobes(); -- cgit v1.1 From e9d90d472da97e1b1560bffb89578ba082c88a69 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Jul 2012 18:01:50 +0300 Subject: KVM: Remove internal timer abstraction kvm_timer_fn(), the sole inhabitant of timer.c, is only used by lapic.c. Move it there to make it easier to hack on it. struct kvm_timer is a thin wrapper around hrtimer, and only adds obfuscation. Move near its two users (with different names) to prepare for simplification. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/i8254.c | 8 ++++---- arch/x86/kvm/i8254.h | 18 +++++++++++++++++- arch/x86/kvm/kvm_timer.h | 18 ------------------ arch/x86/kvm/lapic.c | 30 +++++++++++++++++++++++++++++- arch/x86/kvm/lapic.h | 17 ++++++++++++++++- arch/x86/kvm/timer.c | 47 ----------------------------------------------- 7 files changed, 67 insertions(+), 73 deletions(-) delete mode 100644 arch/x86/kvm/kvm_timer.h delete mode 100644 arch/x86/kvm/timer.c (limited to 'arch/x86') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 4f579e8..04d3040 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o cpuid.o pmu.o + i8254.o cpuid.o pmu.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index adba28f..1d8e757 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -272,14 +272,14 @@ static void destroy_pit_timer(struct kvm_pit *pit) flush_kthread_work(&pit->expired); } -static bool kpit_is_periodic(struct kvm_timer *ktimer) +static bool kpit_is_periodic(struct kvm_pit_timer *ktimer) { struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, pit_timer); return ps->is_periodic; } -static struct kvm_timer_ops kpit_ops = { +static struct kvm_pit_timer_ops kpit_ops = { .is_periodic = kpit_is_periodic, }; @@ -322,7 +322,7 @@ static void pit_do_work(struct kthread_work *work) static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_pit_timer *ktimer = container_of(data, struct kvm_pit_timer, timer); struct kvm_pit *pt = ktimer->kvm->arch.vpit; if (ktimer->reinject || !atomic_read(&ktimer->pending)) { @@ -340,7 +340,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) { struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - struct kvm_timer *pt = &ps->pit_timer; + struct kvm_pit_timer *pt = &ps->pit_timer; s64 interval; if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index fdf4042..3351816 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -21,10 +21,26 @@ struct kvm_kpit_channel_state { ktime_t count_load_time; }; +struct kvm_pit_timer { + struct hrtimer timer; + s64 period; /* unit: ns */ + u32 timer_mode_mask; + u64 tscdeadline; + atomic_t pending; /* accumulated triggered timers */ + bool reinject; + struct kvm_pit_timer_ops *t_ops; + struct kvm *kvm; + struct kvm_vcpu *vcpu; +}; + +struct kvm_pit_timer_ops { + bool (*is_periodic)(struct kvm_pit_timer *); +}; + struct 
kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; u32 flags; - struct kvm_timer pit_timer; + struct kvm_pit_timer pit_timer; bool is_periodic; u32 speaker_data_on; struct mutex lock; diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h deleted file mode 100644 index 497dbaa..0000000 --- a/arch/x86/kvm/kvm_timer.h +++ /dev/null @@ -1,18 +0,0 @@ - -struct kvm_timer { - struct hrtimer timer; - s64 period; /* unit: ns */ - u32 timer_mode_mask; - u64 tscdeadline; - atomic_t pending; /* accumulated triggered timers */ - bool reinject; - struct kvm_timer_ops *t_ops; - struct kvm *kvm; - struct kvm_vcpu *vcpu; -}; - -struct kvm_timer_ops { - bool (*is_periodic)(struct kvm_timer *); -}; - -enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ad7fff7..61ed32c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1262,6 +1262,34 @@ static const struct kvm_io_device_ops apic_mmio_ops = { .write = apic_mmio_write, }; +static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) +{ + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_vcpu *vcpu = ktimer->vcpu; + wait_queue_head_t *q = &vcpu->wq; + + /* + * There is a race window between reading and incrementing, but we do + * not care about potentially losing timer events in the !reinject + * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked + * in vcpu_enter_guest. + */ + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + /* FIXME: this code should not know anything about vcpus */ + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + } + + if (waitqueue_active(q)) + wake_up_interruptible(q); + + if (ktimer->t_ops->is_periodic(ktimer)) { + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + return HRTIMER_RESTART; + } else + return HRTIMER_NORESTART; +} + int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -1285,7 +1313,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - apic->lapic_timer.timer.function = kvm_timer_fn; + apic->lapic_timer.timer.function = apic_timer_fn; apic->lapic_timer.t_ops = &lapic_timer_ops; apic->lapic_timer.kvm = vcpu->kvm; apic->lapic_timer.vcpu = vcpu; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 4af5405..d7251c9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -2,10 +2,25 @@ #define __KVM_X86_LAPIC_H #include "iodev.h" -#include "kvm_timer.h" #include +struct kvm_timer { + struct hrtimer timer; + s64 period; /* unit: ns */ + u32 timer_mode_mask; + u64 tscdeadline; + atomic_t pending; /* accumulated triggered timers */ + bool reinject; + struct kvm_timer_ops *t_ops; + struct kvm *kvm; + struct kvm_vcpu *vcpu; +}; + +struct kvm_timer_ops { + bool (*is_periodic)(struct kvm_timer *); +}; + struct kvm_lapic { unsigned long base_address; struct kvm_io_device dev; diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c deleted file mode 100644 index 6b85cc6..0000000 --- a/arch/x86/kvm/timer.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * timer support - * - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- */ - -#include -#include -#include -#include -#include "kvm_timer.h" - -enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) -{ - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - struct kvm_vcpu *vcpu = ktimer->vcpu; - wait_queue_head_t *q = &vcpu->wq; - - /* - * There is a race window between reading and incrementing, but we do - * not care about potentially losing timer events in the !reinject - * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked - * in vcpu_enter_guest. - */ - if (ktimer->reinject || !atomic_read(&ktimer->pending)) { - atomic_inc(&ktimer->pending); - /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - } - - if (waitqueue_active(q)) - wake_up_interruptible(q); - - if (ktimer->t_ops->is_periodic(ktimer)) { - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); - return HRTIMER_RESTART; - } else - return HRTIMER_NORESTART; -} -- cgit v1.1 From 2a6eac9638a92b61de04bac4233d8ca665ae96af Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Jul 2012 18:01:51 +0300 Subject: KVM: Simplify kvm_timer 'reinject' is never initialized 't_ops' only serves as indirection to lapic_is_periodic; call that directly instead 'kvm' is never used 'vcpu' can be derived via container_of Remove these fields. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 18 +++++------------- arch/x86/kvm/lapic.h | 8 -------- 2 files changed, 5 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 61ed32c..0cd431c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1214,10 +1214,8 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu) *---------------------------------------------------------------------- */ -static bool lapic_is_periodic(struct kvm_timer *ktimer) +static bool lapic_is_periodic(struct kvm_lapic *apic) { - struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, - lapic_timer); return apic_lvtt_period(apic); } @@ -1253,10 +1251,6 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) kvm_apic_local_deliver(apic, APIC_LVT0); } -static struct kvm_timer_ops lapic_timer_ops = { - .is_periodic = lapic_is_periodic, -}; - static const struct kvm_io_device_ops apic_mmio_ops = { .read = apic_mmio_read, .write = apic_mmio_write, @@ -1265,7 +1259,8 @@ static const struct kvm_io_device_ops apic_mmio_ops = { static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) { struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - struct kvm_vcpu *vcpu = ktimer->vcpu; + struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer); + struct kvm_vcpu *vcpu = apic->vcpu; wait_queue_head_t *q = &vcpu->wq; /* @@ -1274,7 +1269,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked * in vcpu_enter_guest. 
*/ - if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + if (!atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); /* FIXME: this code should not know anything about vcpus */ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); @@ -1283,7 +1278,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) if (waitqueue_active(q)) wake_up_interruptible(q); - if (ktimer->t_ops->is_periodic(ktimer)) { + if (lapic_is_periodic(apic)) { hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); return HRTIMER_RESTART; } else @@ -1314,9 +1309,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = apic_timer_fn; - apic->lapic_timer.t_ops = &lapic_timer_ops; - apic->lapic_timer.kvm = vcpu->kvm; - apic->lapic_timer.vcpu = vcpu; apic->base_address = APIC_DEFAULT_PHYS_BASE; vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index d7251c9..166766f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -11,14 +11,6 @@ struct kvm_timer { u32 timer_mode_mask; u64 tscdeadline; atomic_t pending; /* accumulated triggered timers */ - bool reinject; - struct kvm_timer_ops *t_ops; - struct kvm *kvm; - struct kvm_vcpu *vcpu; -}; - -struct kvm_timer_ops { - bool (*is_periodic)(struct kvm_timer *); }; struct kvm_lapic { -- cgit v1.1 From 9d9d2239bdecd525ce3eb6cbfe4abb925c98208c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Jul 2012 18:01:52 +0300 Subject: KVM: Simplify kvm_pit_timer 'timer_mode_mask' is unused 'tscdeadline' is unused 't_ops' only adds needless indirection 'vcpu' is unused Remove. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 14 +------------- arch/x86/kvm/i8254.h | 8 -------- 2 files changed, 1 insertion(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 1d8e757..a9e187a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -272,17 +272,6 @@ static void destroy_pit_timer(struct kvm_pit *pit) flush_kthread_work(&pit->expired); } -static bool kpit_is_periodic(struct kvm_pit_timer *ktimer) -{ - struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, - pit_timer); - return ps->is_periodic; -} - -static struct kvm_pit_timer_ops kpit_ops = { - .is_periodic = kpit_is_periodic, -}; - static void pit_do_work(struct kthread_work *work) { struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); @@ -330,7 +319,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) queue_kthread_work(&pt->worker, &pt->expired); } - if (ktimer->t_ops->is_periodic(ktimer)) { + if (pt->pit_state.is_periodic) { hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); return HRTIMER_RESTART; } else @@ -357,7 +346,6 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) ps->is_periodic = is_period; pt->timer.function = pit_timer_fn; - pt->t_ops = &kpit_ops; pt->kvm = ps->pit->kvm; atomic_set(&pt->pending, 0); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 3351816..c9bbcb8 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -24,17 +24,9 @@ struct kvm_kpit_channel_state { struct kvm_pit_timer { struct hrtimer timer; s64 period; /* unit: ns */ - u32 timer_mode_mask; - u64 tscdeadline; atomic_t pending; /* accumulated triggered timers */ bool reinject; - struct kvm_pit_timer_ops *t_ops; struct kvm *kvm; - struct kvm_vcpu *vcpu; -}; - -struct kvm_pit_timer_ops { - 
bool (*is_periodic)(struct kvm_pit_timer *); }; struct kvm_kpit_state { -- cgit v1.1 From 26ef19242f6e4d747a61b5fd8da72343838864e4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Jul 2012 18:01:53 +0300 Subject: KVM: fold kvm_pit_timer into kvm_kpit_state One structure nests inside the other, providing no value at all. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 52 +++++++++++++++++++++++++--------------------------- arch/x86/kvm/i8254.h | 14 +++++--------- arch/x86/kvm/x86.c | 2 +- 3 files changed, 31 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a9e187a..11300d2 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -108,7 +108,7 @@ static s64 __kpit_elapsed(struct kvm *kvm) ktime_t remaining; struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - if (!ps->pit_timer.period) + if (!ps->period) return 0; /* @@ -120,9 +120,9 @@ static s64 __kpit_elapsed(struct kvm *kvm) * itself with the initial count and continues counting * from there. */ - remaining = hrtimer_get_remaining(&ps->pit_timer.timer); - elapsed = ps->pit_timer.period - ktime_to_ns(remaining); - elapsed = mod_64(elapsed, ps->pit_timer.period); + remaining = hrtimer_get_remaining(&ps->timer); + elapsed = ps->period - ktime_to_ns(remaining); + elapsed = mod_64(elapsed, ps->period); return elapsed; } @@ -238,12 +238,12 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) int value; spin_lock(&ps->inject_lock); - value = atomic_dec_return(&ps->pit_timer.pending); + value = atomic_dec_return(&ps->pending); if (value < 0) /* spurious acks can be generated if, for example, the * PIC is being reset. Handle it gracefully here */ - atomic_inc(&ps->pit_timer.pending); + atomic_inc(&ps->pending); else if (value > 0) /* in this case, we had multiple outstanding pit interrupts * that we needed to inject. 
Reinject @@ -261,14 +261,14 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) if (!kvm_vcpu_is_bsp(vcpu) || !pit) return; - timer = &pit->pit_state.pit_timer.timer; + timer = &pit->pit_state.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } static void destroy_pit_timer(struct kvm_pit *pit) { - hrtimer_cancel(&pit->pit_state.pit_timer.timer); + hrtimer_cancel(&pit->pit_state.timer); flush_kthread_work(&pit->expired); } @@ -311,16 +311,16 @@ static void pit_do_work(struct kthread_work *work) static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { - struct kvm_pit_timer *ktimer = container_of(data, struct kvm_pit_timer, timer); - struct kvm_pit *pt = ktimer->kvm->arch.vpit; + struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); + struct kvm_pit *pt = ps->kvm->arch.vpit; - if (ktimer->reinject || !atomic_read(&ktimer->pending)) { - atomic_inc(&ktimer->pending); + if (ps->reinject || !atomic_read(&ps->pending)) { + atomic_inc(&ps->pending); queue_kthread_work(&pt->worker, &pt->expired); } - if (pt->pit_state.is_periodic) { - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + if (ps->is_periodic) { + hrtimer_add_expires_ns(&ps->timer, ps->period); return HRTIMER_RESTART; } else return HRTIMER_NORESTART; @@ -329,7 +329,6 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) { struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - struct kvm_pit_timer *pt = &ps->pit_timer; s64 interval; if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) @@ -340,18 +339,18 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) pr_debug("create pit timer, interval is %llu nsec\n", interval); /* TODO The new value only affected after the retriggered */ - hrtimer_cancel(&pt->timer); + hrtimer_cancel(&ps->timer); flush_kthread_work(&ps->pit->expired); - pt->period = interval; + ps->period = interval; ps->is_periodic = is_period; - pt->timer.function = pit_timer_fn; - pt->kvm = ps->pit->kvm; + ps->timer.function = pit_timer_fn; + ps->kvm = ps->pit->kvm; - atomic_set(&pt->pending, 0); + atomic_set(&ps->pending, 0); ps->irq_ack = 1; - hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), + hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval), HRTIMER_MODE_ABS); } @@ -627,7 +626,7 @@ void kvm_pit_reset(struct kvm_pit *pit) } mutex_unlock(&pit->pit_state.lock); - atomic_set(&pit->pit_state.pit_timer.pending, 0); + atomic_set(&pit->pit_state.pending, 0); pit->pit_state.irq_ack = 1; } @@ -636,7 +635,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); if (!mask) { - atomic_set(&pit->pit_state.pit_timer.pending, 0); + atomic_set(&pit->pit_state.pending, 0); pit->pit_state.irq_ack = 1; } } @@ -694,12 +693,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit_state = &pit->pit_state; pit_state->pit = pit; - hrtimer_init(&pit_state->pit_timer.timer, - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); - pit_state->pit_timer.reinject = true; + pit_state->reinject = true; mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); @@ -749,7 +747,7 @@ void kvm_free_pit(struct kvm *kvm) 
kvm_unregister_irq_ack_notifier(kvm, &kvm->arch.vpit->pit_state.irq_ack_notifier); mutex_lock(&kvm->arch.vpit->pit_state.lock); - timer = &kvm->arch.vpit->pit_state.pit_timer.timer; + timer = &kvm->arch.vpit->pit_state.timer; hrtimer_cancel(timer); flush_kthread_work(&kvm->arch.vpit->expired); kthread_stop(kvm->arch.vpit->worker_task); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index c9bbcb8..dd1b16b 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -21,19 +21,15 @@ struct kvm_kpit_channel_state { ktime_t count_load_time; }; -struct kvm_pit_timer { - struct hrtimer timer; - s64 period; /* unit: ns */ - atomic_t pending; /* accumulated triggered timers */ - bool reinject; - struct kvm *kvm; -}; - struct kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; u32 flags; - struct kvm_pit_timer pit_timer; bool is_periodic; + s64 period; /* unit: ns */ + struct hrtimer timer; + atomic_t pending; /* accumulated triggered timers */ + bool reinject; + struct kvm *kvm; u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b6379e5..3a53bcc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3082,7 +3082,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, if (!kvm->arch.vpit) return -ENXIO; mutex_lock(&kvm->arch.vpit->pit_state.lock); - kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + kvm->arch.vpit->pit_state.reinject = control->pit_reinject; mutex_unlock(&kvm->arch.vpit->pit_state.lock); return 0; } -- cgit v1.1 From 7af6c2456851eb08d51d95de38ae8302994031e9 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 11 Jul 2012 14:20:51 +0300 Subject: crypto: arch/x86 - cleanup - remove unneeded crypto_alg.cra_list initializations Initialization of cra_list is currently mixed, most ciphers initialize this field and most shashes do not. Initialization however is not needed at all since cra_list is initialized/overwritten in __crypto_register_alg() with list_add(). Therefore perform cleanup to remove all unneeded initializations of this field in 'arch/x86/crypto/'. 
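For reference, an abridged sketch of the registration step that makes those per-driver initializations redundant (the real __crypto_register_alg() does more; only the list handling is shown, and crypto_alg_list here stands in for the global list kept in crypto/api.c):

#include <linux/list.h>
#include <linux/crypto.h>

static LIST_HEAD(crypto_alg_list);

/* Sketch: list_add() rewrites cra_list.next/cra_list.prev unconditionally,
 * so any LIST_HEAD_INIT() the individual glue modules did beforehand is
 * simply overwritten at registration time. */
static int register_alg_sketch(struct crypto_alg *alg)
{
	list_add(&alg->cra_list, &crypto_alg_list);
	return 0;
}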
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/aes_glue.c | 1 - arch/x86/crypto/aesni-intel_glue.c | 5 +---- arch/x86/crypto/blowfish_glue.c | 4 ---- arch/x86/crypto/camellia_glue.c | 6 ------ arch/x86/crypto/ghash-clmulni-intel_glue.c | 2 -- arch/x86/crypto/salsa20_glue.c | 1 - arch/x86/crypto/serpent_avx_glue.c | 10 ---------- arch/x86/crypto/serpent_sse2_glue.c | 10 ---------- arch/x86/crypto/twofish_avx_glue.c | 10 ---------- arch/x86/crypto/twofish_glue.c | 1 - arch/x86/crypto/twofish_glue_3way.c | 5 ----- 11 files changed, 1 insertion(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index 59b37de..aafe8ce 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -40,7 +40,6 @@ static struct crypto_alg aes_alg = { .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), .cra_u = { .cipher = { .cia_min_keysize = AES_MIN_KEY_SIZE, diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 34fdcff..648347a 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1118,7 +1118,7 @@ MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); static int __init aesni_init(void) { - int err, i; + int err; if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; @@ -1127,9 +1127,6 @@ static int __init aesni_init(void) if (err) return err; - for (i = 0; i < ARRAY_SIZE(aesni_algs); i++) - INIT_LIST_HEAD(&aesni_algs[i].cra_list); - return crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); } diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 7967474..50ec333 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -367,7 +367,6 @@ static struct crypto_alg bf_algs[4] = { { .cra_ctxsize = sizeof(struct bf_ctx), .cra_alignmask = 0, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(bf_algs[0].cra_list), .cra_u = { .cipher = { .cia_min_keysize = BF_MIN_KEY_SIZE, @@ -387,7 +386,6 @@ static struct crypto_alg bf_algs[4] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(bf_algs[1].cra_list), .cra_u = { .blkcipher = { .min_keysize = BF_MIN_KEY_SIZE, @@ -407,7 +405,6 @@ static struct crypto_alg bf_algs[4] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(bf_algs[2].cra_list), .cra_u = { .blkcipher = { .min_keysize = BF_MIN_KEY_SIZE, @@ -428,7 +425,6 @@ static struct crypto_alg bf_algs[4] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(bf_algs[3].cra_list), .cra_u = { .blkcipher = { .min_keysize = BF_MIN_KEY_SIZE, diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index eeb2b3b..7a74d7b 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1601,7 +1601,6 @@ static struct crypto_alg camellia_algs[6] = { { .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(camellia_algs[0].cra_list), .cra_u = { .cipher = { .cia_min_keysize = CAMELLIA_MIN_KEY_SIZE, @@ -1621,7 +1620,6 @@ static struct crypto_alg camellia_algs[6] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(camellia_algs[1].cra_list), .cra_u 
= { .blkcipher = { .min_keysize = CAMELLIA_MIN_KEY_SIZE, @@ -1641,7 +1639,6 @@ static struct crypto_alg camellia_algs[6] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(camellia_algs[2].cra_list), .cra_u = { .blkcipher = { .min_keysize = CAMELLIA_MIN_KEY_SIZE, @@ -1662,7 +1659,6 @@ static struct crypto_alg camellia_algs[6] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(camellia_algs[3].cra_list), .cra_u = { .blkcipher = { .min_keysize = CAMELLIA_MIN_KEY_SIZE, @@ -1683,7 +1679,6 @@ static struct crypto_alg camellia_algs[6] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(camellia_algs[4].cra_list), .cra_exit = lrw_exit_tfm, .cra_u = { .blkcipher = { @@ -1707,7 +1702,6 @@ static struct crypto_alg camellia_algs[6] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(camellia_algs[5].cra_list), .cra_u = { .blkcipher = { .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index b4bf0a6..6759dd1 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -150,7 +150,6 @@ static struct shash_alg ghash_alg = { .cra_blocksize = GHASH_BLOCK_SIZE, .cra_ctxsize = sizeof(struct ghash_ctx), .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list), }, }; @@ -288,7 +287,6 @@ static struct ahash_alg ghash_async_alg = { .cra_blocksize = GHASH_BLOCK_SIZE, .cra_type = &crypto_ahash_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list), .cra_init = ghash_async_init_tfm, .cra_exit = ghash_async_exit_tfm, }, diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index bccb76d..a3a3c02 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -97,7 +97,6 @@ static struct crypto_alg alg = { .cra_ctxsize = sizeof(struct salsa20_ctx), .cra_alignmask = 3, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(alg.cra_list), .cra_u = { .blkcipher = { .setkey = setkey, diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index b36bdac..3f543a0 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -390,7 +390,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[0].cra_list), .cra_u = { .blkcipher = { .min_keysize = SERPENT_MIN_KEY_SIZE, @@ -410,7 +409,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[1].cra_list), .cra_u = { .blkcipher = { .min_keysize = SERPENT_MIN_KEY_SIZE, @@ -430,7 +428,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[2].cra_list), .cra_u = { .blkcipher = { .min_keysize = SERPENT_MIN_KEY_SIZE, @@ -451,7 +448,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[3].cra_list), .cra_exit = lrw_exit_tfm, .cra_u = { 
.blkcipher = { @@ -475,7 +471,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[4].cra_list), .cra_u = { .blkcipher = { .min_keysize = SERPENT_MIN_KEY_SIZE * 2, @@ -496,7 +491,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[5].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -518,7 +512,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[6].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -541,7 +534,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[7].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -565,7 +557,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[8].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -590,7 +581,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[9].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index d679c86..9107a99 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -393,7 +393,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[0].cra_list), .cra_u = { .blkcipher = { .min_keysize = SERPENT_MIN_KEY_SIZE, @@ -413,7 +412,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[1].cra_list), .cra_u = { .blkcipher = { .min_keysize = SERPENT_MIN_KEY_SIZE, @@ -433,7 +431,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[2].cra_list), .cra_u = { .blkcipher = { .min_keysize = SERPENT_MIN_KEY_SIZE, @@ -454,7 +451,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[3].cra_list), .cra_exit = lrw_exit_tfm, .cra_u = { .blkcipher = { @@ -478,7 +474,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[4].cra_list), .cra_u = { .blkcipher = { .min_keysize = SERPENT_MIN_KEY_SIZE * 2, @@ -499,7 +494,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[5].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -521,7 +515,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, 
.cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[6].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -544,7 +537,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[7].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -568,7 +560,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[8].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -593,7 +584,6 @@ static struct crypto_alg serpent_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(serpent_algs[9].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 782b67d..e7708b5 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -378,7 +378,6 @@ static struct crypto_alg twofish_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(twofish_algs[0].cra_list), .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE, @@ -398,7 +397,6 @@ static struct crypto_alg twofish_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(twofish_algs[1].cra_list), .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE, @@ -418,7 +416,6 @@ static struct crypto_alg twofish_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(twofish_algs[2].cra_list), .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE, @@ -439,7 +436,6 @@ static struct crypto_alg twofish_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(twofish_algs[3].cra_list), .cra_exit = lrw_twofish_exit_tfm, .cra_u = { .blkcipher = { @@ -463,7 +459,6 @@ static struct crypto_alg twofish_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(twofish_algs[4].cra_list), .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE * 2, @@ -484,7 +479,6 @@ static struct crypto_alg twofish_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(twofish_algs[5].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -506,7 +500,6 @@ static struct crypto_alg twofish_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(twofish_algs[6].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -529,7 +522,6 @@ static struct crypto_alg twofish_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(twofish_algs[7].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -553,7 +545,6 @@ static struct crypto_alg twofish_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(twofish_algs[8].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { @@ -578,7 +569,6 @@ static 
struct crypto_alg twofish_algs[10] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(twofish_algs[9].cra_list), .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index 359ae08..0a52023 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -70,7 +70,6 @@ static struct crypto_alg alg = { .cra_ctxsize = sizeof(struct twofish_ctx), .cra_alignmask = 0, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(alg.cra_list), .cra_u = { .cipher = { .cia_min_keysize = TF_MIN_KEY_SIZE, diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 15f9347..aa3eb35 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -342,7 +342,6 @@ static struct crypto_alg tf_algs[5] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(tf_algs[0].cra_list), .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE, @@ -362,7 +361,6 @@ static struct crypto_alg tf_algs[5] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(tf_algs[1].cra_list), .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE, @@ -383,7 +381,6 @@ static struct crypto_alg tf_algs[5] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(tf_algs[2].cra_list), .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE, @@ -404,7 +401,6 @@ static struct crypto_alg tf_algs[5] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(tf_algs[3].cra_list), .cra_exit = lrw_twofish_exit_tfm, .cra_u = { .blkcipher = { @@ -426,7 +422,6 @@ static struct crypto_alg tf_algs[5] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(tf_algs[4].cra_list), .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE * 2, -- cgit v1.1 From 4d6d6a2c850f89bc9283d02519cb536baba72032 Mon Sep 17 00:00:00 2001 From: Johannes Goetzfried Date: Wed, 11 Jul 2012 19:37:37 +0200 Subject: crypto: cast5 - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Cast5 block cipher. The implementation processes sixteen blocks in parallel (four 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the functions from the generic module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) cast5-avx-x86_64 vs. 
cast5-generic 64bit key: size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec 16B 0.99x 0.99x 1.00x 1.00x 1.02x 1.01x 64B 1.00x 1.00x 0.98x 1.00x 1.01x 1.02x 256B 2.03x 2.01x 0.95x 2.11x 2.12x 2.13x 1024B 2.30x 2.24x 0.95x 2.29x 2.35x 2.35x 8192B 2.31x 2.27x 0.95x 2.31x 2.39x 2.39x 128bit key: size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec 16B 0.99x 0.99x 1.00x 1.00x 1.01x 1.01x 64B 1.00x 1.00x 0.98x 1.01x 1.02x 1.01x 256B 2.17x 2.13x 0.96x 2.19x 2.19x 2.19x 1024B 2.29x 2.32x 0.95x 2.34x 2.37x 2.38x 8192B 2.35x 2.32x 0.95x 2.35x 2.39x 2.39x Signed-off-by: Johannes Goetzfried Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 322 ++++++++++++++++++ arch/x86/crypto/cast5_avx_glue.c | 530 ++++++++++++++++++++++++++++++ 3 files changed, 854 insertions(+) create mode 100644 arch/x86/crypto/cast5-avx-x86_64-asm_64.S create mode 100644 arch/x86/crypto/cast5_avx_glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index e908e5d..565e82b 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o +obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o @@ -32,6 +33,7 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o +cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S new file mode 100644 index 0000000..94693c8 --- /dev/null +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -0,0 +1,322 @@ +/* + * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64) + * + * Copyright (C) 2012 Johannes Goetzfried + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "cast5-avx-x86_64-asm_64.S" +.text + +.extern cast5_s1 +.extern cast5_s2 +.extern cast5_s3 +.extern cast5_s4 + +/* structure of crypto context */ +#define km 0 +#define kr (16*4) +#define rr ((16*4)+16) + +/* s-boxes */ +#define s1 cast5_s1 +#define s2 cast5_s2 +#define s3 cast5_s3 +#define s4 cast5_s4 + +/********************************************************************** + 16-way AVX cast5 + **********************************************************************/ +#define CTX %rdi + +#define RL1 %xmm0 +#define RR1 %xmm1 +#define RL2 %xmm2 +#define RR2 %xmm3 +#define RL3 %xmm4 +#define RR3 %xmm5 +#define RL4 %xmm6 +#define RR4 %xmm7 + +#define RX %xmm8 + +#define RKM %xmm9 +#define RKRF %xmm10 +#define RKRR %xmm11 + +#define RTMP %xmm12 +#define RMASK %xmm13 +#define R32 %xmm14 + +#define RID1 %rax +#define RID1b %al +#define RID2 %rbx +#define RID2b %bl + +#define RGI1 %rdx +#define RGI1bl %dl +#define RGI1bh %dh +#define RGI2 %rcx +#define RGI2bl %cl +#define RGI2bh %ch + +#define RFS1 %r8 +#define RFS1d %r8d +#define RFS2 %r9 +#define RFS2d %r9d +#define RFS3 %r10 +#define RFS3d %r10d + + +#define lookup_32bit(src, dst, op1, op2, op3) \ + movb src ## bl, RID1b; \ + movb src ## bh, RID2b; \ + movl s1(, RID1, 4), dst ## d; \ + op1 s2(, RID2, 4), dst ## d; \ + shrq $16, src; \ + movb src ## bl, RID1b; \ + movb src ## bh, RID2b; \ + op2 s3(, RID1, 4), dst ## d; \ + op3 s4(, RID2, 4), dst ## d; + +#define F(a, x, op0, op1, op2, op3) \ + op0 a, RKM, x; \ + vpslld RKRF, x, RTMP; \ + vpsrld RKRR, x, x; \ + vpor RTMP, x, x; \ + \ + vpshufb RMASK, x, x; \ + vmovq x, RGI1; \ + vpsrldq $8, x, x; \ + vmovq x, RGI2; \ + \ + lookup_32bit(RGI1, RFS1, op1, op2, op3); \ + shrq $16, RGI1; \ + lookup_32bit(RGI1, RFS2, op1, op2, op3); \ + shlq $32, RFS2; \ + orq RFS1, RFS2; \ + \ + lookup_32bit(RGI2, RFS1, op1, op2, op3); \ + shrq $16, RGI2; \ + lookup_32bit(RGI2, RFS3, op1, op2, op3); \ + shlq $32, RFS3; \ + orq RFS1, RFS3; \ + \ + vmovq RFS2, x; \ + vpinsrq $1, RFS3, x, x; + +#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl) +#define F2(b, x) F(b, x, vpxor, subl, addl, xorl) +#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl) + +#define subround(a, b, x, n, f) \ + F ## f(b, x); \ + vpxor a, x, a; + +#define round(l, r, n, f) \ + vbroadcastss (km+(4*n))(CTX), RKM; \ + vpinsrb $0, (kr+n)(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + subround(l ## 1, r ## 1, RX, n, f); \ + subround(l ## 2, r ## 2, RX, n, f); \ + subround(l ## 3, r ## 3, RX, n, f); \ + subround(l ## 4, r ## 4, RX, n, f); + + +#define transpose_2x4(x0, x1, t0, t1) \ + vpunpckldq x1, x0, t0; \ + vpunpckhdq x1, x0, t1; \ + \ + vpunpcklqdq t1, t0, x0; \ + vpunpckhqdq t1, t0, x1; + +#define inpack_blocks(in, x0, x1, t0, t1) \ + vmovdqu (0*4*4)(in), x0; \ + vmovdqu (1*4*4)(in), x1; \ + vpshufb RMASK, x0, x0; \ + vpshufb RMASK, x1, x1; \ + \ + transpose_2x4(x0, x1, t0, t1) + +#define outunpack_blocks(out, x0, x1, t0, t1) \ + transpose_2x4(x0, x1, t0, t1) \ + \ + vpshufb RMASK, x0, x0; \ + vpshufb RMASK, x1, x1; \ + vmovdqu x0, (0*4*4)(out); \ + vmovdqu x1, (1*4*4)(out); + +#define outunpack_xor_blocks(out, x0, x1, t0, t1) \ + transpose_2x4(x0, x1, t0, t1) \ + \ + vpshufb RMASK, x0, x0; \ + vpshufb RMASK, x1, x1; \ + vpxor (0*4*4)(out), x0, x0; \ + vmovdqu x0, (0*4*4)(out); \ + vpxor (1*4*4)(out), x1, x1; \ + 
vmovdqu x1, (1*4*4)(out); + +.align 16 +.Lbswap_mask: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.L32_mask: + .byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0 + +.align 16 +.global __cast5_enc_blk_16way +.type __cast5_enc_blk_16way,@function; + +__cast5_enc_blk_16way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + + pushq %rbx; + pushq %rcx; + + vmovdqu .Lbswap_mask, RMASK; + vmovdqu .L32_mask, R32; + vpxor RKRF, RKRF, RKRF; + + inpack_blocks(%rdx, RL1, RR1, RTMP, RX); + leaq (2*4*4)(%rdx), %rax; + inpack_blocks(%rax, RL2, RR2, RTMP, RX); + leaq (2*4*4)(%rax), %rax; + inpack_blocks(%rax, RL3, RR3, RTMP, RX); + leaq (2*4*4)(%rax), %rax; + inpack_blocks(%rax, RL4, RR4, RTMP, RX); + + xorq RID1, RID1; + xorq RID2, RID2; + + round(RL, RR, 0, 1); + round(RR, RL, 1, 2); + round(RL, RR, 2, 3); + round(RR, RL, 3, 1); + round(RL, RR, 4, 2); + round(RR, RL, 5, 3); + round(RL, RR, 6, 1); + round(RR, RL, 7, 2); + round(RL, RR, 8, 3); + round(RR, RL, 9, 1); + round(RL, RR, 10, 2); + round(RR, RL, 11, 3); + + movb rr(CTX), %al; + testb %al, %al; + jnz __skip_enc; + + round(RL, RR, 12, 1); + round(RR, RL, 13, 2); + round(RL, RR, 14, 3); + round(RR, RL, 15, 1); + +__skip_enc: + popq %rcx; + popq %rbx; + + testb %cl, %cl; + jnz __enc_xor16; + + outunpack_blocks(%rsi, RR1, RL1, RTMP, RX); + leaq (2*4*4)(%rsi), %rax; + outunpack_blocks(%rax, RR2, RL2, RTMP, RX); + leaq (2*4*4)(%rax), %rax; + outunpack_blocks(%rax, RR3, RL3, RTMP, RX); + leaq (2*4*4)(%rax), %rax; + outunpack_blocks(%rax, RR4, RL4, RTMP, RX); + + ret; + +__enc_xor16: + outunpack_xor_blocks(%rsi, RR1, RL1, RTMP, RX); + leaq (2*4*4)(%rsi), %rax; + outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX); + leaq (2*4*4)(%rax), %rax; + outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX); + leaq (2*4*4)(%rax), %rax; + outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX); + + ret; + +.align 16 +.global cast5_dec_blk_16way +.type cast5_dec_blk_16way,@function; + +cast5_dec_blk_16way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + pushq %rbx; + + vmovdqu .Lbswap_mask, RMASK; + vmovdqu .L32_mask, R32; + vpxor RKRF, RKRF, RKRF; + + inpack_blocks(%rdx, RL1, RR1, RTMP, RX); + leaq (2*4*4)(%rdx), %rax; + inpack_blocks(%rax, RL2, RR2, RTMP, RX); + leaq (2*4*4)(%rax), %rax; + inpack_blocks(%rax, RL3, RR3, RTMP, RX); + leaq (2*4*4)(%rax), %rax; + inpack_blocks(%rax, RL4, RR4, RTMP, RX); + + xorq RID1, RID1; + xorq RID2, RID2; + + movb rr(CTX), %al; + testb %al, %al; + jnz __skip_dec; + + round(RL, RR, 15, 1); + round(RR, RL, 14, 3); + round(RL, RR, 13, 2); + round(RR, RL, 12, 1); + +__skip_dec: + round(RL, RR, 11, 3); + round(RR, RL, 10, 2); + round(RL, RR, 9, 1); + round(RR, RL, 8, 3); + round(RL, RR, 7, 2); + round(RR, RL, 6, 1); + round(RL, RR, 5, 3); + round(RR, RL, 4, 2); + round(RL, RR, 3, 1); + round(RR, RL, 2, 3); + round(RL, RR, 1, 2); + round(RR, RL, 0, 1); + + popq %rbx; + + outunpack_blocks(%rsi, RR1, RL1, RTMP, RX); + leaq (2*4*4)(%rsi), %rax; + outunpack_blocks(%rax, RR2, RL2, RTMP, RX); + leaq (2*4*4)(%rax), %rax; + outunpack_blocks(%rax, RR3, RL3, RTMP, RX); + leaq (2*4*4)(%rax), %rax; + outunpack_blocks(%rax, RR4, RL4, RTMP, RX); + + ret; diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c new file mode 100644 index 0000000..445aab0 --- /dev/null +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -0,0 +1,530 @@ +/* + * Glue Code for the AVX assembler implemention of the Cast5 Cipher + * + * Copyright (C) 2012 Johannes Goetzfried + * + * + * This 
program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CAST5_PARALLEL_BLOCKS 16 + +asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst, + const u8 *src) +{ + __cast5_enc_blk_16way(ctx, dst, src, false); +} + +static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst, + const u8 *src) +{ + __cast5_enc_blk_16way(ctx, dst, src, true); +} + +static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst, + const u8 *src) +{ + cast5_dec_blk_16way(ctx, dst, src); +} + + +static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS, + NULL, fpu_enabled, nbytes); +} + +static inline void cast5_fpu_end(bool fpu_enabled) +{ + return glue_fpu_end(fpu_enabled); +} + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + bool enc) +{ + bool fpu_enabled = false; + struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = CAST5_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); + + /* Process multi-block batch */ + if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { + do { + if (enc) + cast5_enc_blk_xway(ctx, wdst, wsrc); + else + cast5_dec_blk_xway(ctx, wdst, wsrc); + + wsrc += bsize * CAST5_PARALLEL_BLOCKS; + wdst += bsize * CAST5_PARALLEL_BLOCKS; + nbytes -= bsize * CAST5_PARALLEL_BLOCKS; + } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (enc) + __cast5_encrypt(ctx, wdst, wsrc); + else + __cast5_decrypt(ctx, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + cast5_fpu_end(fpu_enabled); + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, true); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, false); +} + +static unsigned int 
__cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = CAST5_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 *iv = (u64 *)walk->iv; + + do { + *dst = *src ^ *iv; + __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + *(u64 *)walk->iv ^= *iv; + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = CAST5_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 ivs[CAST5_PARALLEL_BLOCKS - 1]; + u64 last_iv; + int i; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process multi-block batch */ + if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { + do { + nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1); + src -= CAST5_PARALLEL_BLOCKS - 1; + dst -= CAST5_PARALLEL_BLOCKS - 1; + + for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++) + ivs[i] = src[i]; + + cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + + for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++) + *(dst + (i + 1)) ^= *(ivs + i); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + *dst ^= *(src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + for (;;) { + __cast5_decrypt(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + *dst ^= *(src - 1); + src -= 1; + dst -= 1; + } + +done: + *dst ^= *(u64 *)walk->iv; + *(u64 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk.nbytes)) { + fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); + nbytes = __cbc_decrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + cast5_fpu_end(fpu_enabled); + return err; +} + +static void ctr_crypt_final(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 *ctrblk = walk->iv; + u8 keystream[CAST5_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + __cast5_encrypt(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + + crypto_inc(ctrblk, CAST5_BLOCK_SIZE); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct cast5_ctx *ctx = 
crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = CAST5_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); + __be64 ctrblocks[CAST5_PARALLEL_BLOCKS]; + int i; + + /* Process multi-block batch */ + if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { + do { + /* create ctrblks for parallel encrypt */ + for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; + + ctrblocks[i] = cpu_to_be64(ctrblk++); + } + + cast5_enc_blk_xway_xor(ctx, (u8 *)dst, + (u8 *)ctrblocks); + + src += CAST5_PARALLEL_BLOCKS; + dst += CAST5_PARALLEL_BLOCKS; + nbytes -= bsize * CAST5_PARALLEL_BLOCKS; + } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (dst != src) + *dst = *src; + + ctrblocks[0] = cpu_to_be64(ctrblk++); + + __cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); + *dst ^= ctrblocks[0]; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + *(__be64 *)walk->iv = cpu_to_be64(ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, CAST5_BLOCK_SIZE); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { + fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + cast5_fpu_end(fpu_enabled); + + if (walk.nbytes) { + ctr_crypt_final(desc, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + + +static struct crypto_alg cast5_algs[6] = { { + .cra_name = "__ecb-cast5-avx", + .cra_driver_name = "__driver-ecb-cast5-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAST5_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct cast5_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAST5_MIN_KEY_SIZE, + .max_keysize = CAST5_MAX_KEY_SIZE, + .setkey = cast5_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-cast5-avx", + .cra_driver_name = "__driver-cbc-cast5-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAST5_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct cast5_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAST5_MIN_KEY_SIZE, + .max_keysize = CAST5_MAX_KEY_SIZE, + .setkey = cast5_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "__ctr-cast5-avx", + .cra_driver_name = "__driver-ctr-cast5-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cast5_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAST5_MIN_KEY_SIZE, + .max_keysize = CAST5_MAX_KEY_SIZE, + .ivsize = CAST5_BLOCK_SIZE, + .setkey = cast5_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "ecb(cast5)", + .cra_driver_name = "ecb-cast5-avx", + 
.cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAST5_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST5_MIN_KEY_SIZE, + .max_keysize = CAST5_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(cast5)", + .cra_driver_name = "cbc-cast5-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAST5_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST5_MIN_KEY_SIZE, + .max_keysize = CAST5_MAX_KEY_SIZE, + .ivsize = CAST5_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "ctr(cast5)", + .cra_driver_name = "ctr-cast5-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST5_MIN_KEY_SIZE, + .max_keysize = CAST5_MAX_KEY_SIZE, + .ivsize = CAST5_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +} }; + +static int __init cast5_init(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) { + pr_info("AVX instructions are not detected.\n"); + return -ENODEV; + } + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + pr_info("AVX detected but unusable.\n"); + return -ENODEV; + } + + return crypto_register_algs(cast5_algs, ARRAY_SIZE(cast5_algs)); +} + +static void __exit cast5_exit(void) +{ + crypto_unregister_algs(cast5_algs, ARRAY_SIZE(cast5_algs)); +} + +module_init(cast5_init); +module_exit(cast5_exit); + +MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("cast5"); -- cgit v1.1 From 4ea1277d301eb776e321684cd4ea95116b4e8847 Mon Sep 17 00:00:00 2001 From: Johannes Goetzfried Date: Wed, 11 Jul 2012 19:38:57 +0200 Subject: crypto: cast6 - add x86_64/avx assembler implementation This patch adds a x86_64/avx assembler implementation of the Cast6 block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the functions from the generic module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) cast6-avx-x86_64 vs. 
cast6-generic 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.97x 1.00x 1.01x 1.01x 0.99x 0.97x 0.98x 1.01x 0.96x 0.98x 64B 0.98x 0.99x 1.02x 1.01x 0.99x 1.00x 1.01x 0.99x 1.00x 0.99x 256B 1.77x 1.84x 0.99x 1.85x 1.77x 1.77x 1.70x 1.74x 1.69x 1.72x 1024B 1.93x 1.95x 0.99x 1.96x 1.93x 1.93x 1.84x 1.85x 1.89x 1.87x 8192B 1.91x 1.95x 0.99x 1.97x 1.95x 1.91x 1.86x 1.87x 1.93x 1.90x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.97x 0.99x 1.02x 1.01x 0.98x 0.99x 1.00x 1.00x 0.98x 0.98x 64B 0.98x 0.99x 1.01x 1.00x 1.00x 1.00x 1.01x 1.01x 0.97x 1.00x 256B 1.77x 1.83x 1.00x 1.86x 1.79x 1.78x 1.70x 1.76x 1.71x 1.69x 1024B 1.92x 1.95x 0.99x 1.96x 1.93x 1.93x 1.83x 1.86x 1.89x 1.87x 8192B 1.94x 1.95x 0.99x 1.97x 1.95x 1.95x 1.87x 1.87x 1.93x 1.91x Signed-off-by: Johannes Goetzfried Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 335 +++++++++++++++ arch/x86/crypto/cast6_avx_glue.c | 648 ++++++++++++++++++++++++++++++ 3 files changed, 985 insertions(+) create mode 100644 arch/x86/crypto/cast6-avx-x86_64-asm_64.S create mode 100644 arch/x86/crypto/cast6_avx_glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 565e82b..5bacb4a 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o +obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o @@ -34,6 +35,7 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o +cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S new file mode 100644 index 0000000..d258ce0 --- /dev/null +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -0,0 +1,335 @@ +/* + * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64) + * + * Copyright (C) 2012 Johannes Goetzfried + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "cast6-avx-x86_64-asm_64.S" +.text + +.extern cast6_s1 +.extern cast6_s2 +.extern cast6_s3 +.extern cast6_s4 + +/* structure of crypto context */ +#define km 0 +#define kr (12*4*4) + +/* s-boxes */ +#define s1 cast6_s1 +#define s2 cast6_s2 +#define s3 cast6_s3 +#define s4 cast6_s4 + +/********************************************************************** + 8-way AVX cast6 + **********************************************************************/ +#define CTX %rdi + +#define RA1 %xmm0 +#define RB1 %xmm1 +#define RC1 %xmm2 +#define RD1 %xmm3 + +#define RA2 %xmm4 +#define RB2 %xmm5 +#define RC2 %xmm6 +#define RD2 %xmm7 + +#define RX %xmm8 + +#define RKM %xmm9 +#define RKRF %xmm10 +#define RKRR %xmm11 + +#define RTMP %xmm12 +#define RMASK %xmm13 +#define R32 %xmm14 + +#define RID1 %rax +#define RID1b %al +#define RID2 %rbx +#define RID2b %bl + +#define RGI1 %rdx +#define RGI1bl %dl +#define RGI1bh %dh +#define RGI2 %rcx +#define RGI2bl %cl +#define RGI2bh %ch + +#define RFS1 %r8 +#define RFS1d %r8d +#define RFS2 %r9 +#define RFS2d %r9d +#define RFS3 %r10 +#define RFS3d %r10d + + +#define lookup_32bit(src, dst, op1, op2, op3) \ + movb src ## bl, RID1b; \ + movb src ## bh, RID2b; \ + movl s1(, RID1, 4), dst ## d; \ + op1 s2(, RID2, 4), dst ## d; \ + shrq $16, src; \ + movb src ## bl, RID1b; \ + movb src ## bh, RID2b; \ + op2 s3(, RID1, 4), dst ## d; \ + op3 s4(, RID2, 4), dst ## d; + +#define F(a, x, op0, op1, op2, op3) \ + op0 a, RKM, x; \ + vpslld RKRF, x, RTMP; \ + vpsrld RKRR, x, x; \ + vpor RTMP, x, x; \ + \ + vpshufb RMASK, x, x; \ + vmovq x, RGI1; \ + vpsrldq $8, x, x; \ + vmovq x, RGI2; \ + \ + lookup_32bit(RGI1, RFS1, op1, op2, op3); \ + shrq $16, RGI1; \ + lookup_32bit(RGI1, RFS2, op1, op2, op3); \ + shlq $32, RFS2; \ + orq RFS1, RFS2; \ + \ + lookup_32bit(RGI2, RFS1, op1, op2, op3); \ + shrq $16, RGI2; \ + lookup_32bit(RGI2, RFS3, op1, op2, op3); \ + shlq $32, RFS3; \ + orq RFS1, RFS3; \ + \ + vmovq RFS2, x; \ + vpinsrq $1, RFS3, x, x; + +#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl) +#define F2(b, x) F(b, x, vpxor, subl, addl, xorl) +#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl) + +#define qop(in, out, x, f) \ + F ## f(in ## 1, x); \ + vpxor out ## 1, x, out ## 1; \ + F ## f(in ## 2, x); \ + vpxor out ## 2, x, out ## 2; \ + +#define Q(n) \ + vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RD, RC, RX, 1); \ + \ + vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RC, RB, RX, 2); \ + \ + vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RB, RA, RX, 3); \ + \ + vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RA, RD, RX, 1); + +#define QBAR(n) \ + vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RA, RD, RX, 1); \ + \ + vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RB, RA, RX, 3); \ + \ + vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ 
+ qop(RC, RB, RX, 2); \ + \ + vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \ + vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + qop(RD, RC, RX, 1); + + +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + vpunpckldq x1, x0, t0; \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x3; \ + \ + vpunpcklqdq t1, t0, x0; \ + vpunpckhqdq t1, t0, x1; \ + vpunpcklqdq x3, t2, x2; \ + vpunpckhqdq x3, t2, x3; + +#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ + vmovdqu (0*4*4)(in), x0; \ + vmovdqu (1*4*4)(in), x1; \ + vmovdqu (2*4*4)(in), x2; \ + vmovdqu (3*4*4)(in), x3; \ + vpshufb RMASK, x0, x0; \ + vpshufb RMASK, x1, x1; \ + vpshufb RMASK, x2, x2; \ + vpshufb RMASK, x3, x3; \ + \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vpshufb RMASK, x0, x0; \ + vpshufb RMASK, x1, x1; \ + vpshufb RMASK, x2, x2; \ + vpshufb RMASK, x3, x3; \ + vmovdqu x0, (0*4*4)(out); \ + vmovdqu x1, (1*4*4)(out); \ + vmovdqu x2, (2*4*4)(out); \ + vmovdqu x3, (3*4*4)(out); + +#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vpshufb RMASK, x0, x0; \ + vpshufb RMASK, x1, x1; \ + vpshufb RMASK, x2, x2; \ + vpshufb RMASK, x3, x3; \ + vpxor (0*4*4)(out), x0, x0; \ + vmovdqu x0, (0*4*4)(out); \ + vpxor (1*4*4)(out), x1, x1; \ + vmovdqu x1, (1*4*4)(out); \ + vpxor (2*4*4)(out), x2, x2; \ + vmovdqu x2, (2*4*4)(out); \ + vpxor (3*4*4)(out), x3, x3; \ + vmovdqu x3, (3*4*4)(out); + +.align 16 +.Lbswap_mask: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.L32_mask: + .byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0 + +.align 16 +.global __cast6_enc_blk_8way +.type __cast6_enc_blk_8way,@function; + +__cast6_enc_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + + pushq %rbx; + pushq %rcx; + + vmovdqu .Lbswap_mask, RMASK; + vmovdqu .L32_mask, R32; + vpxor RKRF, RKRF, RKRF; + + leaq (4*4*4)(%rdx), %rax; + inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM); + inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + + xorq RID1, RID1; + xorq RID2, RID2; + + Q(0); + Q(1); + Q(2); + Q(3); + Q(4); + Q(5); + QBAR(6); + QBAR(7); + QBAR(8); + QBAR(9); + QBAR(10); + QBAR(11); + + popq %rcx; + popq %rbx; + + leaq (4*4*4)(%rsi), %rax; + + testb %cl, %cl; + jnz __enc_xor8; + + outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); + outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + + ret; + +__enc_xor8: + outunpack_xor_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); + outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + + ret; + +.align 16 +.global cast6_dec_blk_8way +.type cast6_dec_blk_8way,@function; + +cast6_dec_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + pushq %rbx; + + vmovdqu .Lbswap_mask, RMASK; + vmovdqu .L32_mask, R32; + vpxor RKRF, RKRF, RKRF; + + leaq (4*4*4)(%rdx), %rax; + inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM); + inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + + xorq RID1, RID1; + xorq RID2, RID2; + + Q(11); + Q(10); + Q(9); + Q(8); + Q(7); + Q(6); + QBAR(5); + QBAR(4); + QBAR(3); + QBAR(2); + QBAR(1); + QBAR(0); + + popq %rbx; + + leaq (4*4*4)(%rsi), %rax; + outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); + outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + + ret; diff --git a/arch/x86/crypto/cast6_avx_glue.c 
b/arch/x86/crypto/cast6_avx_glue.c new file mode 100644 index 0000000..15e5f85 --- /dev/null +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -0,0 +1,648 @@ +/* + * Glue Code for the AVX assembler implemention of the Cast6 Cipher + * + * Copyright (C) 2012 Johannes Goetzfried + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CAST6_PARALLEL_BLOCKS 8 + +asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst, + const u8 *src) +{ + __cast6_enc_blk_8way(ctx, dst, src, false); +} + +static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst, + const u8 *src) +{ + __cast6_enc_blk_8way(ctx, dst, src, true); +} + +static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst, + const u8 *src) +{ + cast6_dec_blk_8way(ctx, dst, src); +} + + +static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) +{ + u128 ivs[CAST6_PARALLEL_BLOCKS - 1]; + unsigned int j; + + for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++) + ivs[j] = src[j]; + + cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + + for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++) + u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); +} + +static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +{ + be128 ctrblk; + + u128_to_be128(&ctrblk, iv); + u128_inc(iv); + + __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); + u128_xor(dst, src, (u128 *)&ctrblk); +} + +static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, + u128 *iv) +{ + be128 ctrblks[CAST6_PARALLEL_BLOCKS]; + unsigned int i; + + for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; + + u128_to_be128(&ctrblks[i], iv); + u128_inc(iv); + } + + cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); +} + +static const struct common_glue_ctx cast6_enc = { + .num_funcs = 2, + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAST6_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) } + } } +}; + +static const struct common_glue_ctx cast6_ctr = { + .num_funcs = 2, + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAST6_PARALLEL_BLOCKS, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) } + } } +}; + +static const struct common_glue_ctx cast6_dec = { + .num_funcs = 2, + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, + + .funcs = 
{ { + .num_blocks = CAST6_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) } + } } +}; + +static const struct common_glue_ctx cast6_dec_cbc = { + .num_funcs = 2, + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAST6_PARALLEL_BLOCKS, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } + } } +}; + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&cast6_enc, desc, dst, src, nbytes); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&cast6_dec, desc, dst, src, nbytes); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__cast6_encrypt), desc, + dst, src, nbytes); +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_decrypt_128bit(&cast6_dec_cbc, desc, dst, src, + nbytes); +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ctr_crypt_128bit(&cast6_ctr, desc, dst, src, nbytes); +} + +static inline bool cast6_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + return glue_fpu_begin(CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS, + NULL, fpu_enabled, nbytes); +} + +static inline void cast6_fpu_end(bool fpu_enabled) +{ + glue_fpu_end(fpu_enabled); +} + +struct crypt_priv { + struct cast6_ctx *ctx; + bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = CAST6_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { + cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __cast6_encrypt(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = CAST6_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { + cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __cast6_decrypt(ctx->ctx, srcdst, srcdst); +} + +struct cast6_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct cast6_ctx cast6_ctx; +}; + +static int lrw_cast6_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct cast6_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = __cast6_setkey(&ctx->cast6_ctx, key, keylen - CAST6_BLOCK_SIZE, + &tfm->crt_flags); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, key + keylen - CAST6_BLOCK_SIZE); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct cast6_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAST6_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->cast6_ctx, + 
.fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + cast6_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct cast6_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAST6_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->cast6_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + cast6_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static void lrw_exit_tfm(struct crypto_tfm *tfm) +{ + struct cast6_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +struct cast6_xts_ctx { + struct cast6_ctx tweak_ctx; + struct cast6_ctx crypt_ctx; +}; + +static int xts_cast6_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct cast6_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, + flags); +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAST6_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + cast6_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAST6_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + cast6_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static struct crypto_alg cast6_algs[10] = { { + .cra_name = "__ecb-cast6-avx", + .cra_driver_name = "__driver-ecb-cast6-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct 
cast6_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .setkey = cast6_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-cast6-avx", + .cra_driver_name = "__driver-cbc-cast6-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct cast6_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .setkey = cast6_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "__ctr-cast6-avx", + .cra_driver_name = "__driver-ctr-cast6-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cast6_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = cast6_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "__lrw-cast6-avx", + .cra_driver_name = "__driver-lrw-cast6-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct cast6_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE + + CAST6_BLOCK_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE + + CAST6_BLOCK_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = lrw_cast6_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-cast6-avx", + .cra_driver_name = "__driver-xts-cast6-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct cast6_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE * 2, + .max_keysize = CAST6_MAX_KEY_SIZE * 2, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = xts_cast6_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "ecb(cast6)", + .cra_driver_name = "ecb-cast6-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(cast6)", + .cra_driver_name = "cbc-cast6-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = 
CAST6_MAX_KEY_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "ctr(cast6)", + .cra_driver_name = "ctr-cast6-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}, { + .cra_name = "lrw(cast6)", + .cra_driver_name = "lrw-cast6-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE + + CAST6_BLOCK_SIZE, + .max_keysize = CAST6_MAX_KEY_SIZE + + CAST6_BLOCK_SIZE, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "xts(cast6)", + .cra_driver_name = "xts-cast6-avx", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAST6_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAST6_MIN_KEY_SIZE * 2, + .max_keysize = CAST6_MAX_KEY_SIZE * 2, + .ivsize = CAST6_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +} }; + +static int __init cast6_init(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) { + pr_info("AVX instructions are not detected.\n"); + return -ENODEV; + } + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + pr_info("AVX detected but unusable.\n"); + return -ENODEV; + } + + return crypto_register_algs(cast6_algs, ARRAY_SIZE(cast6_algs)); +} + +static void __exit cast6_exit(void) +{ + crypto_unregister_algs(cast6_algs, ARRAY_SIZE(cast6_algs)); +} + +module_init(cast6_init); +module_exit(cast6_exit); + +MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("cast6"); -- cgit v1.1 From e115676e042f4d9268c6b6d8cb7dc962aa6cfd7d Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 1 Aug 2012 17:01:42 +0300 Subject: KVM: x86: update KVM_SAVE_MSRS_BEGIN to correct value When MSR_KVM_PV_EOI_EN was added to msrs_to_save array KVM_SAVE_MSRS_BEGIN was not updated accordingly. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a53bcc..a87c82a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -806,7 +806,7 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * kvm-specific. Those are put in the beginning of the list. 
*/ -#define KVM_SAVE_MSRS_BEGIN 9 +#define KVM_SAVE_MSRS_BEGIN 10 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, -- cgit v1.1 From ea22571c8fd912f28e2525f7112bbb84b474ff3a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2012 13:59:37 -0400 Subject: x86: mce: Disable preemption when calling raise_local() raise_mce() has a code path which does not disable preemption when the raise_local() is called. The per cpu variable access in raise_local() depends on preemption being disabled to be functional. So that code path was either never tested or never tested with CONFIG_DEBUG_PREEMPT enabled. Add the missing preempt_disable/enable() pair around the call. Signed-off-by: Thomas Gleixner Signed-off-by: Chen Gong Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index fc4beb3..753746f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -194,7 +194,11 @@ static void raise_mce(struct mce *m) put_online_cpus(); } else #endif + { + preempt_disable(); raise_local(); + preempt_enable(); + } } /* Error injection interface */ -- cgit v1.1 From b5975917a3e5f93b5d1c95561aab0aa44327baea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2012 13:59:38 -0400 Subject: x86: mce: Serialize mce injection raise_mce() fiddles with global state, but lacks any kind of serialization. Add a mutex around the raise_mce() call, so concurrent writers do not stomp on each other toes. Signed-off-by: Thomas Gleixner Signed-off-by: Chen Gong Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 753746f..ddc72f8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) } static cpumask_var_t mce_inject_cpumask; +static DEFINE_MUTEX(mce_inject_mutex); static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) { @@ -229,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, * so do it a jiffie or two later everywhere. */ schedule_timeout(2); + + mutex_lock(&mce_inject_mutex); raise_mce(&m); + mutex_unlock(&mce_inject_mutex); return usize; } -- cgit v1.1 From 26c3c283c5b08dd250279c06ba3ab5b094bbacc3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2012 13:59:39 -0400 Subject: x86: mce: Split timer init Split timer init function into the init and the start part, so the start part can replace the open coded version in CPU_DOWN_FAILED. 
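The split described above keeps the one-time setup_timer() call in the init path and moves the expiry calculation and arming into a helper that the hotplug code can reuse. Below is only a simplified sketch of that pattern; the example_* names (timer, interval, callback) are placeholders, and the real mce.c change is the diff that follows.

static void example_start_timer(unsigned int cpu, struct timer_list *t)
{
	unsigned long iv = example_interval * HZ;	/* 0 means periodic polling is off */

	if (!iv)
		return;

	t->expires = round_jiffies(jiffies + iv);
	add_timer_on(t, cpu);				/* arm on the target CPU */
}

static void example_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(example_timer);
	unsigned int cpu = smp_processor_id();

	setup_timer(t, example_timer_fn, cpu);		/* one-time setup stays here */
	example_start_timer(cpu, t);			/* arming is now a reusable helper */
}

With this shape the CPU_DOWN_FAILED handler can simply call example_start_timer(cpu, t) instead of open coding the round_jiffies() arithmetic.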
Signed-off-by: Thomas Gleixner Signed-off-by: Chen Gong Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5e095f8..bd3a5e8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1557,23 +1557,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } -static void __mcheck_cpu_init_timer(void) +static void mce_start_timer(unsigned int cpu, struct timer_list *t) { - struct timer_list *t = &__get_cpu_var(mce_timer); unsigned long iv = check_interval * HZ; - setup_timer(t, mce_timer_fn, smp_processor_id()); + __this_cpu_write(mce_next_interval, iv); - if (mce_ignore_ce) + if (mce_ignore_ce || !iv) return; - __this_cpu_write(mce_next_interval, iv); - if (!iv) - return; t->expires = round_jiffies(jiffies + iv); add_timer_on(t, smp_processor_id()); } +static void __mcheck_cpu_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned int cpu = smp_processor_id(); + + setup_timer(t, mce_timer_fn, cpu); + mce_start_timer(cpu, t); +} + /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { @@ -2277,12 +2282,8 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - if (!mce_ignore_ce && check_interval) { - t->expires = round_jiffies(jiffies + - per_cpu(mce_next_interval, cpu)); - add_timer_on(t, cpu); - } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + mce_start_timer(cpu, t); break; case CPU_POST_DEAD: /* intentionally ignoring frozen here */ -- cgit v1.1 From 1a65f970d10ace7a1e399f9061a65679c0ae57d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2012 13:59:40 -0400 Subject: x86: mce: Remove the frozen cases in the hotplug code No point in having double cases if we can simply mask the FROZEN bit out. 
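The masking is a one-line trick: clearing CPU_TASKS_FROZEN from the action before the switch lets a single case label cover both the normal and the _FROZEN variant of each hotplug event. A condensed sketch of the pattern follows; the example_* helpers are placeholders, and the full mce notifier is in the diff below.

static int example_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:			/* also covers CPU_ONLINE_FROZEN */
		example_online(cpu);
		break;
	case CPU_DEAD:				/* also covers CPU_DEAD_FROZEN */
		example_dead(cpu);
		break;
	}

	/*
	 * CPU_POST_DEAD is still compared unmasked: its frozen variant is
	 * intentionally ignored, preserving the original behaviour.
	 */
	if (action == CPU_POST_DEAD)
		example_post_dead(cpu);

	return NOTIFY_OK;
}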
Signed-off-by: Thomas Gleixner Signed-off-by: Chen Gong Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index bd3a5e8..b4dde15 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2262,34 +2262,32 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: mce_device_create(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; case CPU_DEAD: - case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); mce_device_remove(cpu); break; case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: del_timer_sync(t); smp_call_function_single(cpu, mce_disable_cpu, &action, 1); break; case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); mce_start_timer(cpu, t); break; - case CPU_POST_DEAD: + } + + if (action == CPU_POST_DEAD) { /* intentionally ignoring frozen here */ cmci_rediscover(cpu); - break; } + return NOTIFY_OK; } -- cgit v1.1 From 439793d4b3c99e550daebd868bbd58967c93d0b3 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 1 Aug 2012 17:01:42 +0300 Subject: KVM: x86: update KVM_SAVE_MSRS_BEGIN to correct value When MSR_KVM_PV_EOI_EN was added to msrs_to_save array KVM_SAVE_MSRS_BEGIN was not updated accordingly. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42bce48..dce75b76 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -806,7 +806,7 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 9 +#define KVM_SAVE_MSRS_BEGIN 10 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, -- cgit v1.1 From aab2eb7a38e0e510874acca01838f5badbca6c7e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 1 Aug 2012 18:01:10 +0900 Subject: KVM: Stop checking rmap to see if slot is being created Instead, check npages consistently. This helps to make rmap architecture specific in a later patch. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3ca90d7..abc039d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6385,7 +6385,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, *x86 needs to handle !user_alloc case. 
*/ if (!user_alloc) { - if (npages && !old.rmap) { + if (npages && !old.npages) { unsigned long userspace_addr; userspace_addr = vm_mmap(NULL, 0, @@ -6413,7 +6413,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; - if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + if (!user_alloc && !old.user_alloc && old.npages && !npages) { int ret; ret = vm_munmap(old.userspace_addr, -- cgit v1.1 From 65fbe37c42ed75604c9a770639209dcee162ebe7 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 1 Aug 2012 18:02:01 +0900 Subject: KVM: MMU: Use gfn_to_rmap() instead of directly reading rmap array This helps to make rmap architecture specific in a later patch. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 ++- arch/x86/kvm/mmu_audit.c | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a9a2052..ee768bb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1181,7 +1181,8 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, unsigned long *rmapp; while (mask) { - rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; + rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); /* clear the first set bit */ diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 7d7d0b9..ca403f9 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -190,7 +190,6 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { - struct kvm_memory_slot *slot; unsigned long *rmapp; u64 *sptep; struct rmap_iterator iter; @@ -198,8 +197,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->role.direct || sp->unsync || sp->role.invalid) return; - slot = gfn_to_memslot(kvm, sp->gfn); - rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; + rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL); for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { -- cgit v1.1 From d89cc617b954aff4030fce178f7d86f59aaf713d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 1 Aug 2012 18:03:28 +0900 Subject: KVM: Push rmap into kvm_arch_memory_slot Two reasons: - x86 can integrate rmap and rmap_pde and remove heuristics in __gfn_to_rmap(). - Some architectures do not need rmap. Since rmap is one of the most memory consuming stuff in KVM, ppc'd better restrict the allocation to Book3S HV. 
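With every page-size level stored under slot->arch.rmap[], the PT_PAGE_TABLE_LEVEL special case in __gfn_to_rmap() disappears and the lookup reduces to a single indexing step. The snippet below simply restates the resulting function from the diff that follows, for readability:

static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
				    struct kvm_memory_slot *slot)
{
	unsigned long idx;

	/*
	 * All levels, including PT_PAGE_TABLE_LEVEL, now live in arch.rmap[],
	 * so no 4K-page fast path is needed anymore.
	 */
	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
}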
Signed-off-by: Takuya Yoshikawa Acked-by: Paul Mackerras Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 5 +--- arch/x86/kvm/x86.c | 55 ++++++++++++++++++++++++----------------- 3 files changed, 34 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 48e7131..1309e69 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -504,7 +504,7 @@ struct kvm_lpage_info { }; struct kvm_arch_memory_slot { - unsigned long *rmap_pde[KVM_NR_PAGE_SIZES - 1]; + unsigned long *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ee768bb..aa9a987 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -970,11 +970,8 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, { unsigned long idx; - if (likely(level == PT_PAGE_TABLE_LEVEL)) - return &slot->rmap[gfn - slot->base_gfn]; - idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.rmap_pde[level - PT_DIRECTORY_LEVEL][idx]; + return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index abc039d..ebf2109 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6303,14 +6303,18 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, { int i; - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - if (!dont || free->arch.rmap_pde[i] != dont->arch.rmap_pde[i]) { - kvm_kvfree(free->arch.rmap_pde[i]); - free->arch.rmap_pde[i] = NULL; + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { + kvm_kvfree(free->arch.rmap[i]); + free->arch.rmap[i] = NULL; } - if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { - kvm_kvfree(free->arch.lpage_info[i]); - free->arch.lpage_info[i] = NULL; + if (i == 0) + continue; + + if (!dont || free->arch.lpage_info[i - 1] != + dont->arch.lpage_info[i - 1]) { + kvm_kvfree(free->arch.lpage_info[i - 1]); + free->arch.lpage_info[i - 1] = NULL; } } } @@ -6319,28 +6323,30 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) { int i; - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { unsigned long ugfn; int lpages; - int level = i + 2; + int level = i + 1; lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; - slot->arch.rmap_pde[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap_pde[i])); - if (!slot->arch.rmap_pde[i]) + slot->arch.rmap[i] = + kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); + if (!slot->arch.rmap[i]) goto out_free; + if (i == 0) + continue; - slot->arch.lpage_info[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); - if (!slot->arch.lpage_info[i]) + slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages * + sizeof(*slot->arch.lpage_info[i - 1])); + if (!slot->arch.lpage_info[i - 1]) goto out_free; if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i][0].write_count = 1; + slot->arch.lpage_info[i - 1][0].write_count = 1; if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i][lpages - 1].write_count = 1; + slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1; ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each @@ -6352,18 +6358,21 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned 
long npages) unsigned long j; for (j = 0; j < lpages; ++j) - slot->arch.lpage_info[i][j].write_count = 1; + slot->arch.lpage_info[i - 1][j].write_count = 1; } } return 0; out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - kvm_kvfree(slot->arch.rmap_pde[i]); - kvm_kvfree(slot->arch.lpage_info[i]); - slot->arch.rmap_pde[i] = NULL; - slot->arch.lpage_info[i] = NULL; + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + kvm_kvfree(slot->arch.rmap[i]); + slot->arch.rmap[i] = NULL; + if (i == 0) + continue; + + kvm_kvfree(slot->arch.lpage_info[i - 1]); + slot->arch.lpage_info[i - 1] = NULL; } return -ENOMEM; } -- cgit v1.1 From 6c8ee57be9350c5c2cafdd6a99d0462d528676e2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:37:54 +0800 Subject: KVM: introduce KVM_PFN_ERR_FAULT After that, the exported and un-inline function, get_fault_pfn, can be removed Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa9a987..9cf90c8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2512,7 +2512,7 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) - return get_fault_pfn(); + return KVM_PFN_ERR_FAULT; hva = gfn_to_hva_memslot(slot, gfn); -- cgit v1.1 From e6c1502b3f933ace20c711ce34ab696f5a67086d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:38:36 +0800 Subject: KVM: introduce KVM_PFN_ERR_HWPOISON Then, get_hwpoison_pfn and is_hwpoison_pfn can be removed Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9cf90c8..d3cdf69 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2649,7 +2649,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) { kvm_release_pfn_clean(pfn); - if (is_hwpoison_pfn(pfn)) { + if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); return 0; } -- cgit v1.1 From cb9aaa30b133574b646d9d4766ef08a843211393 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:42:10 +0800 Subject: KVM: do not release the error pfn After commit a2766325cf9f9, the error pfn is replaced by the error code, it need not be released anymore [ The patch has been compiling tested for powerpc ] Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 7 +++---- arch/x86/kvm/mmu_audit.c | 4 +--- arch/x86/kvm/paging_tmpl.h | 8 ++------ 3 files changed, 6 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d3cdf69..9651c2cd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2496,7 +2496,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, rmap_recycle(vcpu, sptep, gfn); } } - kvm_release_pfn_clean(pfn); + + if (!is_error_pfn(pfn)) + kvm_release_pfn_clean(pfn); } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) @@ -2648,7 +2650,6 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) { - kvm_release_pfn_clean(pfn); if (pfn == KVM_PFN_ERR_HWPOISON) { 
kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); return 0; @@ -3273,8 +3274,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!async) return false; /* *pfn has correct page already */ - kvm_release_pfn_clean(*pfn); - if (!prefault && can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(gva, gfn); if (kvm_find_async_pf_gfn(vcpu, gfn)) { diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index ca403f9..daff69e 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -116,10 +116,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); + if (is_error_pfn(pfn)) return; - } hpa = pfn << PAGE_SHIFT; if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bb7cf01..bf8c42b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -370,10 +370,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); - if (mmu_invalid_pfn(pfn)) { - kvm_release_pfn_clean(pfn); + if (mmu_invalid_pfn(pfn)) return; - } /* * we call mmu_set_spte() with host_writable = true because that @@ -448,10 +446,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, pte_access & ACC_WRITE_MASK); - if (mmu_invalid_pfn(pfn)) { - kvm_release_pfn_clean(pfn); + if (mmu_invalid_pfn(pfn)) break; - } mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, NULL, PT_PAGE_TABLE_LEVEL, gfn, -- cgit v1.1 From 32cad84f44d186654492f1a50a1424c8906ccbd9 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:42:52 +0800 Subject: KVM: do not release the error page After commit a2766325cf9f9, the error page is replaced by the error code, it need not be released anymore [ The patch has been compiling tested for powerpc ] Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 1 - arch/x86/kvm/vmx.c | 5 ++--- arch/x86/kvm/x86.c | 9 +++------ 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 687d0c3..31be4a5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2105,7 +2105,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) return kmap(page); error: - kvm_release_page_clean(page); kvm_inject_gp(&svm->vcpu, 0); return NULL; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d6e4cbc..cc8ad98 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -596,10 +596,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) { struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); + if (is_error_page(page)) return NULL; - } + return page; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ebf2109..7953a9e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1639,10 +1639,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> 
PAGE_SHIFT); - if (is_error_page(vcpu->arch.time_page)) { - kvm_release_page_clean(vcpu->arch.time_page); + if (is_error_page(vcpu->arch.time_page)) vcpu->arch.time_page = NULL; - } + break; } case MSR_KVM_ASYNC_PF_EN: @@ -3945,10 +3944,8 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, goto emul_write; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); + if (is_error_page(page)) goto emul_write; - } kaddr = kmap_atomic(page); kaddr += offset_in_page(gpa); -- cgit v1.1 From 8a5a87d9b7aef24bcdba763d8ee14982477b0a2e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:26 +0300 Subject: KVM: clean up kvm_(set|get)_apic_base kvm_get_apic_base() needlessly checks irqchip_in_kernel although it does the same no matter what result of the check is. kvm_set_apic_base() also checks for irqchip_in_kernel, but kvm_lapic_set_base() can handle this case. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7953a9e..3cafbb1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -246,20 +246,14 @@ static void drop_user_return_notifiers(void *ignore) u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { - if (irqchip_in_kernel(vcpu->kvm)) - return vcpu->arch.apic_base; - else - return vcpu->arch.apic_base; + return vcpu->arch.apic_base; } EXPORT_SYMBOL_GPL(kvm_get_apic_base); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) { /* TODO: reserve bits check */ - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_base(vcpu, data); - else - vcpu->arch.apic_base = data; + kvm_lapic_set_base(vcpu, data); } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -- cgit v1.1 From 5dbc8f3fed0b4cb04dfb276150294f21c5f0fc66 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:27 +0300 Subject: KVM: use kvm_lapic_set_base() to change apic_base Do not change apic_base directly. Use kvm_lapic_set_base() instead. 
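For reference, a simplified sketch of what the setter keeps in sync (an illustration only, not the kernel function verbatim; the real helper also handles the BSP bit and x2APIC mode):

	/* Simplified sketch: the point of going through the helper is that the
	 * MSR value and the cached MMIO base address always move together. */
	void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
	{
		struct kvm_lapic *apic = vcpu->arch.apic;

		vcpu->arch.apic_base = value;
		if (apic)
			apic->base_address = value & MSR_IA32_APICBASE_BASE;
	}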
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0cd431c..49f4ac0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1185,7 +1185,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); if (kvm_vcpu_is_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + kvm_lapic_set_base(vcpu, + vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); @@ -1310,8 +1311,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = apic_timer_fn; - apic->base_address = APIC_DEFAULT_PHYS_BASE; - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; + kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE); kvm_lapic_reset(vcpu); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -1380,8 +1380,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - apic->base_address = vcpu->arch.apic_base & - MSR_IA32_APICBASE_BASE; + kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); kvm_apic_set_version(vcpu); apic_update_ppr(apic); -- cgit v1.1 From 6aed64a8a440360be875f31eabeacaddb83ef25f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:28 +0300 Subject: KVM: mark apic enabled on start up According to SDM apic is enabled on start up. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 49f4ac0..c3f14fe 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1311,7 +1311,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = apic_timer_fn; - kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE); + kvm_lapic_set_base(vcpu, + APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); kvm_lapic_reset(vcpu); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); -- cgit v1.1 From c5cc421ba3219b90f11d151bc55f1608c12830fa Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:30 +0300 Subject: KVM: use jump label to optimize checking for HW enabled APIC in APIC_BASE MSR Usually all APICs are HW enabled so the check can be optimized out. 
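For readers unfamiliar with jump labels, the idiom the patch relies on looks like the minimal sketch below (illustrative names; do_expensive_check() is a stand-in, not a kernel API). While the deferred key count is zero, static_key_false() compiles to a patched no-op, so the common case pays nothing for the check.

	static struct static_key_deferred slow_path_needed __read_mostly;

	static inline bool fast_check(void)
	{
		/* Patched no-op while the key count is zero; the branch body only
		 * becomes reachable after static_key_slow_inc() flips the key. */
		if (static_key_false(&slow_path_needed.key))
			return do_expensive_check();	/* hypothetical slow path */
		return true;				/* fast path: assume enabled */
	}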
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 29 ++++++++++++++++++++++++++++- arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/x86.c | 1 + 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c3f14fe..5b46cab 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" @@ -117,9 +118,13 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap) return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +struct static_key_deferred apic_hw_disabled __read_mostly; + static inline int apic_hw_enabled(struct kvm_lapic *apic) { - return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; + if (static_key_false(&apic_hw_disabled.key)) + return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; + return MSR_IA32_APICBASE_ENABLE; } static inline int apic_sw_enabled(struct kvm_lapic *apic) @@ -1055,6 +1060,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); + if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) + static_key_slow_dec_deferred(&apic_hw_disabled); + if (vcpu->arch.apic->regs) free_page((unsigned long)vcpu->arch.apic->regs); @@ -1125,6 +1133,14 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) return; } + /* update jump label if enable bit changes */ + if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) { + if (value & MSR_IA32_APICBASE_ENABLE) + static_key_slow_dec_deferred(&apic_hw_disabled); + else + static_key_slow_inc(&apic_hw_disabled.key); + } + if (!kvm_vcpu_is_bsp(apic->vcpu)) value &= ~MSR_IA32_APICBASE_BSP; @@ -1311,6 +1327,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = apic_timer_fn; + /* + * APIC is created enabled. This will prevent kvm_lapic_set_base from + * thinking that APIC satet has changed. + */ + vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); @@ -1598,3 +1619,9 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, addr); } + +void kvm_lapic_init(void) +{ + /* do not patch jump label more than once per second */ + jump_label_rate_limit(&apic_hw_disabled, HZ); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 166766f..73fa299 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -78,4 +78,5 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) } int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); +void kvm_lapic_init(void); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3cafbb1..29fa18d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4903,6 +4903,7 @@ int kvm_arch_init(void *opaque) if (cpu_has_xsave) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + kvm_lapic_init(); return 0; out: -- cgit v1.1 From f8c1ea103947038b7197bdd4c8451886a58af0c0 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:31 +0300 Subject: KVM: use jump label to optimize checking for SW enabled apic in spurious interrupt register Usually all APICs are SW enabled so the check can be optimized out. 
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5b46cab..7f77e96 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -127,9 +127,24 @@ static inline int apic_hw_enabled(struct kvm_lapic *apic) return MSR_IA32_APICBASE_ENABLE; } -static inline int apic_sw_enabled(struct kvm_lapic *apic) +struct static_key_deferred apic_sw_disabled __read_mostly; + +static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) +{ + if ((apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) { + if (val & APIC_SPIV_APIC_ENABLED) + static_key_slow_dec_deferred(&apic_sw_disabled); + else + static_key_slow_inc(&apic_sw_disabled.key); + } + apic_set_reg(apic, APIC_SPIV, val); +} + +static inline int apic_sw_enabled(struct kvm_lapic *apic) { - return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; + if (static_key_false(&apic_sw_disabled.key)) + return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; + return APIC_SPIV_APIC_ENABLED; } static inline int apic_enabled(struct kvm_lapic *apic) @@ -918,7 +933,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) u32 mask = 0x3ff; if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) mask |= APIC_SPIV_DIRECTED_EOI; - apic_set_reg(apic, APIC_SPIV, val & mask); + apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; u32 lvt_val; @@ -1055,18 +1070,23 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); void kvm_free_lapic(struct kvm_vcpu *vcpu) { + struct kvm_lapic *apic = vcpu->arch.apic; + if (!vcpu->arch.apic) return; - hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); + hrtimer_cancel(&apic->lapic_timer.timer); if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) static_key_slow_dec_deferred(&apic_hw_disabled); - if (vcpu->arch.apic->regs) - free_page((unsigned long)vcpu->arch.apic->regs); + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED)) + static_key_slow_dec_deferred(&apic_sw_disabled); - kfree(vcpu->arch.apic); + if (apic->regs) + free_page((unsigned long)apic->regs); + + kfree(apic); } /* @@ -1182,7 +1202,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_set_reg(apic, APIC_DFR, 0xffffffffU); - apic_set_reg(apic, APIC_SPIV, 0xff); + apic_set_spiv(apic, 0xff); apic_set_reg(apic, APIC_TASKPRI, 0); apic_set_reg(apic, APIC_LDR, 0); apic_set_reg(apic, APIC_ESR, 0); @@ -1335,6 +1355,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); + static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_lapic_reset(vcpu); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -1404,6 +1425,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); kvm_apic_set_version(vcpu); + apic_set_spiv(apic, apic_get_reg(apic, APIC_SPIV)); apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); @@ -1624,4 +1646,5 @@ void kvm_lapic_init(void) { /* do not patch jump label more than once per second */ jump_label_rate_limit(&apic_hw_disabled, HZ); + jump_label_rate_limit(&apic_sw_disabled, HZ); } -- cgit v1.1 From 54e9818f3903902a4ea3046035739b8770880092 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 15:58:32 +0300 Subject: KVM: use jump label to optimize checking for in kernel local 
apic presence Usually all vcpus have local apic pointer initialized, so the check may be completely skipped. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 62 ++++++++++++++++++++++++++++------------------------ arch/x86/kvm/x86.c | 7 +++++- arch/x86/kvm/x86.h | 1 + 3 files changed, 41 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7f77e96..650379b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -152,6 +152,13 @@ static inline int apic_enabled(struct kvm_lapic *apic) return apic_sw_enabled(apic) && apic_hw_enabled(apic); } +static inline bool vcpu_has_lapic(struct kvm_vcpu *vcpu) +{ + if (static_key_false(&kvm_no_apic_vcpu)) + return vcpu->arch.apic; + return true; +} + #define LVT_MASK \ (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) @@ -204,7 +211,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *feat; u32 v = APIC_VERSION; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!vcpu_has_lapic(vcpu)) return; feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); @@ -305,7 +312,6 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; /* This may race with setting of irr in __apic_accept_irq() and @@ -313,9 +319,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) * will cause vmexit immediately and the value will be recalculated * on the next vmentry. */ - if (!apic) + if (!vcpu_has_lapic(vcpu)) return 0; - highest_irr = apic_find_highest_irr(apic); + highest_irr = apic_find_highest_irr(vcpu->arch.apic); return highest_irr; } @@ -1061,9 +1067,7 @@ static int apic_mmio_write(struct kvm_io_device *this, void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - - if (apic) + if (vcpu_has_lapic(vcpu)) apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); @@ -1098,10 +1102,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - return 0; - if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + if (!vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + apic_lvtt_period(apic)) return 0; return apic->lapic_timer.tscdeadline; @@ -1110,10 +1113,9 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - return; - if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + if (!vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + apic_lvtt_period(apic)) return; hrtimer_cancel(&apic->lapic_timer.timer); @@ -1125,20 +1127,21 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) + if (!vcpu_has_lapic(vcpu)) return; + apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | (apic_get_reg(apic, APIC_TASKPRI) & 4)); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; u64 tpr; - if (!apic) + if (!vcpu_has_lapic(vcpu)) return 0; - tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); + + tpr = (u64) apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; } @@ -1237,7 +1240,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) bool kvm_apic_present(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic); + 
return vcpu_has_lapic(vcpu) && apic_hw_enabled(vcpu->arch.apic); } int kvm_lapic_enabled(struct kvm_vcpu *vcpu) @@ -1258,10 +1261,11 @@ static bool lapic_is_periodic(struct kvm_lapic *apic) int apic_has_pending_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *lapic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->arch.apic; - if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) - return atomic_read(&lapic->lapic_timer.pending); + if (vcpu_has_lapic(vcpu) && apic_enabled(apic) && + apic_lvt_enabled(apic, APIC_LVTT)) + return atomic_read(&apic->lapic_timer.pending); return 0; } @@ -1371,7 +1375,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; - if (!apic || !apic_enabled(apic)) + if (!vcpu_has_lapic(vcpu) || !apic_enabled(apic)) return -1; apic_update_ppr(apic); @@ -1399,7 +1403,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { + if (!vcpu_has_lapic(vcpu)) + return; + + if (atomic_read(&apic->lapic_timer.pending) > 0) { if (kvm_apic_local_deliver(apic, APIC_LVTT)) atomic_dec(&apic->lapic_timer.pending); } @@ -1439,13 +1446,12 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; struct hrtimer *timer; - if (!apic) + if (!vcpu_has_lapic(vcpu)) return; - timer = &apic->lapic_timer.timer; + timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } @@ -1602,7 +1608,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!vcpu_has_lapic(vcpu)) return 1; /* if this is ICR write vector before command */ @@ -1616,7 +1622,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 low, high = 0; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!vcpu_has_lapic(vcpu)) return 1; if (apic_reg_read(apic, reg, 4, &low)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 29fa18d..8ebf65c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6152,6 +6152,8 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); } +struct static_key kvm_no_apic_vcpu __read_mostly; + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; @@ -6184,7 +6186,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; - } + } else + static_key_slow_inc(&kvm_no_apic_vcpu); vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, GFP_KERNEL); @@ -6224,6 +6227,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); + if (!irqchip_in_kernel(vcpu->kvm)) + static_key_slow_dec(&kvm_no_apic_vcpu); } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 3d1134d..2b5219c 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -124,4 +124,5 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, extern u64 host_xcr0; +extern struct static_key kvm_no_apic_vcpu; #endif -- cgit v1.1 From c48f14966cc41957d88c66dfe49a439e708ab7b8 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Aug 2012 
15:58:33 +0300 Subject: KVM: inline kvm_apic_present() and kvm_lapic_enabled() Those functions are used during interrupt injection. When inlined they become nops on the fast path. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 143 +++++++++++++++++++-------------------------------- arch/x86/kvm/lapic.h | 45 +++++++++++++++- 2 files changed, 96 insertions(+), 92 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 650379b..333c27f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -73,11 +73,6 @@ static unsigned int min_timer_period_us = 500; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); -static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) -{ - return *((u32 *) (apic->regs + reg_off)); -} - static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { *((u32 *) (apic->regs + reg_off)) = val; @@ -119,19 +114,11 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap) } struct static_key_deferred apic_hw_disabled __read_mostly; - -static inline int apic_hw_enabled(struct kvm_lapic *apic) -{ - if (static_key_false(&apic_hw_disabled.key)) - return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; - return MSR_IA32_APICBASE_ENABLE; -} - struct static_key_deferred apic_sw_disabled __read_mostly; static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) { - if ((apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) { + if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) { if (val & APIC_SPIV_APIC_ENABLED) static_key_slow_dec_deferred(&apic_sw_disabled); else @@ -140,23 +127,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) apic_set_reg(apic, APIC_SPIV, val); } -static inline int apic_sw_enabled(struct kvm_lapic *apic) -{ - if (static_key_false(&apic_sw_disabled.key)) - return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; - return APIC_SPIV_APIC_ENABLED; -} - static inline int apic_enabled(struct kvm_lapic *apic) { - return apic_sw_enabled(apic) && apic_hw_enabled(apic); -} - -static inline bool vcpu_has_lapic(struct kvm_vcpu *vcpu) -{ - if (static_key_false(&kvm_no_apic_vcpu)) - return vcpu->arch.apic; - return true; + return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic); } #define LVT_MASK \ @@ -168,34 +141,34 @@ static inline bool vcpu_has_lapic(struct kvm_vcpu *vcpu) static inline int kvm_apic_id(struct kvm_lapic *apic) { - return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; + return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { - return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); + return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); } static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) { - return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; + return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; } static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) { - return ((apic_get_reg(apic, APIC_LVTT) & + return ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); } static inline int apic_lvtt_period(struct kvm_lapic *apic) { - return ((apic_get_reg(apic, APIC_LVTT) & + return ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); } static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) { - return ((apic_get_reg(apic, APIC_LVTT) & + 
return ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_TSCDEADLINE); } @@ -211,7 +184,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *feat; u32 v = APIC_VERSION; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return; feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); @@ -319,7 +292,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) * will cause vmexit immediately and the value will be recalculated * on the next vmentry. */ - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return 0; highest_irr = apic_find_highest_irr(vcpu->arch.apic); @@ -404,8 +377,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) u32 tpr, isrv, ppr, old_ppr; int isr; - old_ppr = apic_get_reg(apic, APIC_PROCPRI); - tpr = apic_get_reg(apic, APIC_TASKPRI); + old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI); + tpr = kvm_apic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? isr : 0; @@ -441,13 +414,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) u32 logical_id; if (apic_x2apic_mode(apic)) { - logical_id = apic_get_reg(apic, APIC_LDR); + logical_id = kvm_apic_get_reg(apic, APIC_LDR); return logical_id & mda; } - logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); + logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR)); - switch (apic_get_reg(apic, APIC_DFR)) { + switch (kvm_apic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: if (logical_id & mda) result = 1; @@ -459,7 +432,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) break; default: apic_debug("Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); + apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); break; } @@ -617,7 +590,7 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && + if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { int trigger_mode; if (apic_test_vector(vector, apic->regs + APIC_TMR)) @@ -632,8 +605,8 @@ static int apic_set_eoi(struct kvm_lapic *apic) static void apic_send_ipi(struct kvm_lapic *apic) { - u32 icr_low = apic_get_reg(apic, APIC_ICR); - u32 icr_high = apic_get_reg(apic, APIC_ICR2); + u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); + u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2); struct kvm_lapic_irq irq; irq.vector = icr_low & APIC_VECTOR_MASK; @@ -668,7 +641,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) ASSERT(apic != NULL); /* if initial count is 0, current count should also be 0 */ - if (apic_get_reg(apic, APIC_TMICT) == 0) + if (kvm_apic_get_reg(apic, APIC_TMICT) == 0) return 0; remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); @@ -724,13 +697,13 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) break; case APIC_PROCPRI: apic_update_ppr(apic); - val = apic_get_reg(apic, offset); + val = kvm_apic_get_reg(apic, offset); break; case APIC_TASKPRI: report_tpr_access(apic, false); /* fall thru */ default: - val = apic_get_reg(apic, offset); + val = kvm_apic_get_reg(apic, offset); break; } @@ -782,7 +755,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { - return apic_hw_enabled(apic) && + return kvm_apic_hw_enabled(apic) && addr >= apic->base_address && addr < apic->base_address + 
LAPIC_MMIO_LENGTH; } @@ -805,7 +778,7 @@ static void update_divide_count(struct kvm_lapic *apic) { u32 tmp1, tmp2, tdcr; - tdcr = apic_get_reg(apic, APIC_TDCR); + tdcr = kvm_apic_get_reg(apic, APIC_TDCR); tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; apic->divide_count = 0x1 << (tmp2 & 0x7); @@ -822,7 +795,7 @@ static void start_apic_timer(struct kvm_lapic *apic) if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { /* lapic timer in oneshot or periodic mode */ now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) + apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; if (!apic->lapic_timer.period) @@ -854,7 +827,7 @@ static void start_apic_timer(struct kvm_lapic *apic) "timer initial count 0x%x, period %lldns, " "expire @ 0x%016" PRIx64 ".\n", __func__, APIC_BUS_CYCLE_NS, ktime_to_ns(now), - apic_get_reg(apic, APIC_TMICT), + kvm_apic_get_reg(apic, APIC_TMICT), apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); @@ -886,7 +859,7 @@ static void start_apic_timer(struct kvm_lapic *apic) static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) { - int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); + int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0)); if (apic_lvt_nmi_mode(lvt0_val)) { if (!nmi_wd_enabled) { @@ -937,7 +910,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_SPIV: { u32 mask = 0x3ff; - if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) + if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) mask |= APIC_SPIV_DIRECTED_EOI; apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { @@ -945,7 +918,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) u32 lvt_val; for (i = 0; i < APIC_LVT_NUM; i++) { - lvt_val = apic_get_reg(apic, + lvt_val = kvm_apic_get_reg(apic, APIC_LVTT + 0x10 * i); apic_set_reg(apic, APIC_LVTT + 0x10 * i, lvt_val | APIC_LVT_MASKED); @@ -974,7 +947,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVT1: case APIC_LVTERR: /* TODO: Check vector */ - if (!apic_sw_enabled(apic)) + if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; @@ -983,12 +956,12 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_LVTT: - if ((apic_get_reg(apic, APIC_LVTT) & + if ((kvm_apic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask) != (val & apic->lapic_timer.timer_mode_mask)) hrtimer_cancel(&apic->lapic_timer.timer); - if (!apic_sw_enabled(apic)) + if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); apic_set_reg(apic, APIC_LVTT, val); @@ -1067,7 +1040,7 @@ static int apic_mmio_write(struct kvm_io_device *this, void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { - if (vcpu_has_lapic(vcpu)) + if (kvm_vcpu_has_lapic(vcpu)) apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); @@ -1084,7 +1057,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) static_key_slow_dec_deferred(&apic_hw_disabled); - if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED)) + if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED)) static_key_slow_dec_deferred(&apic_sw_disabled); if (apic->regs) @@ -1103,7 +1076,7 @@ 
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return 0; @@ -1114,7 +1087,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return; @@ -1127,21 +1100,21 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return; apic_set_tpr(apic, ((cr8 & 0x0f) << 4) - | (apic_get_reg(apic, APIC_TASKPRI) & 4)); + | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4)); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { u64 tpr; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return 0; - tpr = (u64) apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); + tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; } @@ -1238,16 +1211,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) vcpu->arch.apic_base, apic->base_address); } -bool kvm_apic_present(struct kvm_vcpu *vcpu) -{ - return vcpu_has_lapic(vcpu) && apic_hw_enabled(vcpu->arch.apic); -} - -int kvm_lapic_enabled(struct kvm_vcpu *vcpu) -{ - return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); -} - /* *---------------------------------------------------------------------- * timer interface @@ -1263,7 +1226,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (vcpu_has_lapic(vcpu) && apic_enabled(apic) && + if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT)) return atomic_read(&apic->lapic_timer.pending); @@ -1272,10 +1235,10 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { - u32 reg = apic_get_reg(apic, lvt_type); + u32 reg = kvm_apic_get_reg(apic, lvt_type); int vector, mode, trig_mode; - if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { + if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; @@ -1375,23 +1338,23 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; - if (!vcpu_has_lapic(vcpu) || !apic_enabled(apic)) + if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic)) return -1; apic_update_ppr(apic); highest_irr = apic_find_highest_irr(apic); if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) + ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI))) return -1; return highest_irr; } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { - u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); + u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (!apic_hw_enabled(vcpu->arch.apic)) + if (!kvm_apic_hw_enabled(vcpu->arch.apic)) r = 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) @@ -1403,7 +1366,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return; if (atomic_read(&apic->lapic_timer.pending) > 0) { @@ -1432,7 +1395,7 @@ void 
kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); kvm_apic_set_version(vcpu); - apic_set_spiv(apic, apic_get_reg(apic, APIC_SPIV)); + apic_set_spiv(apic, kvm_apic_get_reg(apic, APIC_SPIV)); apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); @@ -1448,7 +1411,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { struct hrtimer *timer; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return; timer = &vcpu->arch.apic->lapic_timer.timer; @@ -1549,7 +1512,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; + tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) max_irr = 0; @@ -1608,7 +1571,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return 1; /* if this is ICR write vector before command */ @@ -1622,7 +1585,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 low, high = 0; - if (!vcpu_has_lapic(vcpu)) + if (!kvm_vcpu_has_lapic(vcpu)) return 1; if (apic_reg_read(apic, reg, 4, &low)) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 73fa299..2ad9caa 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -55,8 +55,6 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu); -bool kvm_apic_present(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); @@ -79,4 +77,47 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); void kvm_lapic_init(void); + +static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off) +{ + return *((u32 *) (apic->regs + reg_off)); +} + +extern struct static_key kvm_no_apic_vcpu; + +static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu) +{ + if (static_key_false(&kvm_no_apic_vcpu)) + return vcpu->arch.apic; + return true; +} + +extern struct static_key_deferred apic_hw_disabled; + +static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic) +{ + if (static_key_false(&apic_hw_disabled.key)) + return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; + return MSR_IA32_APICBASE_ENABLE; +} + +extern struct static_key_deferred apic_sw_disabled; + +static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic) +{ + if (static_key_false(&apic_sw_disabled.key)) + return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; + return APIC_SPIV_APIC_ENABLED; +} + +static inline bool kvm_apic_present(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic); +} + +static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +{ + return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); +} + #endif -- cgit v1.1 From a9ad773e0dd833651f0831020a0ea0265c29f2ea Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 6 Aug 2012 19:00:36 +0200 Subject: x86, cpu: Fixup tlb_flushall_shift formatting The TLB characteristics appeared like this in dmesg: [ 
0.065817] Last level iTLB entries: 4KB 512, 2MB 1024, 4MB 512 [ 0.065817] Last level dTLB entries: 4KB 1024, 2MB 1024, 4MB 512 [ 0.065817] tlb_flushall_shift is 0xffffffff where tlb_flushall_shift is actually -1 but dumped as a hex number. However, the Kconfig option CONFIG_DEBUG_TLBFLUSH and the rest of the code treats this as a signed decimal and states "If you set it to -1, the code flushes the whole TLB unconditionally." So, fix its formatting in accordance with the other references to it. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344272439-29080-2-git-send-email-bp@amd64.org Acked-by: Alex Shi Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 46d8786..d239977 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -474,7 +474,7 @@ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c) printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ - "tlb_flushall_shift is 0x%x\n", + "tlb_flushall_shift: %d\n", tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], -- cgit v1.1 From 5b556332c3ab19e6375836d35ca658776e9ba0f6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 6 Aug 2012 19:00:37 +0200 Subject: x86, cpu: Push TLB detection CPUID check down Push the max CPUID leaf check into the ->detect_tlb function and remove general test case from the generic path. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344272439-29080-3-git-send-email-bp@amd64.org Acked-by: Alex Shi Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 3 +-- arch/x86/kernel/cpu/intel.c | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d239977..080f4a7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -940,8 +940,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - if (boot_cpu_data.cpuid_level >= 2) - cpu_detect_tlb(&boot_cpu_data); + cpu_detect_tlb(&boot_cpu_data); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 0a4ce29..198e019 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -648,6 +648,10 @@ static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) int i, j, n; unsigned int regs[4]; unsigned char *desc = (unsigned char *)regs; + + if (c->cpuid_level < 2) + return; + /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; -- cgit v1.1 From b46882e4c4de4813947fce940fe74af794a1eb72 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 6 Aug 2012 19:00:38 +0200 Subject: x86, cpu: Add AMD TLB size detection Read I- and DTLB entries count from CPUID on AMD. Handle all the different family-specific cases. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344272439-29080-4-git-send-email-bp@amd64.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/amd.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9d92e19..bcd2008 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -737,6 +737,59 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, } #endif +static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c) +{ + u32 ebx, eax, ecx, edx; + u16 mask = 0xfff; + + if (c->x86 < 0xf) + return; + + if (c->extended_cpuid_level < 0x80000006) + return; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; + tlb_lli_4k[ENTRIES] = ebx & mask; + + /* + * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB + * characteristics from the CPUID function 0x80000005 instead. + */ + if (c->x86 == 0xf) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + mask = 0xff; + } + + /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!((eax >> 16) & mask)) { + u32 a, b, c, d; + + cpuid(0x80000005, &a, &b, &c, &d); + tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff; + } else { + tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + } + + /* a 4M entry uses two 2M entries */ + tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + + /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!(eax & mask)) { + /* Erratum 658 */ + if (c->x86 == 0x15 && c->x86_model <= 0x1f) { + tlb_lli_2m[ENTRIES] = 1024; + } else { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + tlb_lli_2m[ENTRIES] = eax & 0xff; + } + } else + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; +} + static const struct cpu_dev __cpuinitconst amd_cpu_dev = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, @@ -756,6 +809,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { .c_size_cache = amd_size_cache, #endif .c_early_init = early_init_amd, + .c_detect_tlb = cpu_detect_tlb_amd, .c_bsp_init = bsp_init_amd, .c_init = init_amd, .c_x86_vendor = X86_VENDOR_AMD, -- cgit v1.1 From 057237bb35a605d795fd787868a1088705f26ee5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 6 Aug 2012 19:00:39 +0200 Subject: x86, cpu: Preset default tlb_flushall_shift on AMD Run the mprotect.c microbenchmark on all our families >= K8 and preset the flushall shift variable accordingly. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344272439-29080-5-git-send-email-bp@amd64.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/amd.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bcd2008..f7e98a2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -737,6 +737,17 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, } #endif +static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) +{ + if (!cpu_has_invlpg) + return; + + tlb_flushall_shift = 5; + + if (c->x86 <= 0x11) + tlb_flushall_shift = 4; +} + static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c) { u32 ebx, eax, ecx, edx; @@ -788,6 +799,8 @@ static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c) tlb_lli_2m[ENTRIES] = eax & mask; tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + + cpu_set_tlb_flushall_shift(c); } static const struct cpu_dev __cpuinitconst amd_cpu_dev = { -- cgit v1.1 From 256f631f1f7e7bedc882510679ad4473a2274708 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 Sep 2012 13:34:43 +0000 Subject: xen/arm: Introduce xen_ulong_t for unsigned long All the original Xen headers have xen_ulong_t as unsigned long type, however when they have been imported in Linux, xen_ulong_t has been replaced with unsigned long. That might work for x86 and ia64 but it does not for arm. Bring back xen_ulong_t and let each architecture define xen_ulong_t as they see fit. Also explicitly size pointers (__DEFINE_GUEST_HANDLE) to 64 bit. Changes in v3: - remove the incorrect changes to multicall_entry; - remove the change to apic_physbase. Signed-off-by: Stefano Stabellini Acked-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/interface.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 555f94d..28fc6211 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -51,6 +51,7 @@ * with Xen so that on ARM we can have one ABI that works for 32 and 64 * bit guests. */ typedef unsigned long xen_pfn_t; +typedef unsigned long xen_ulong_t; /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); -- cgit v1.1 From 0ec53ecf38bcbf95b4b057328a8fbba4d22ef28b Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 Sep 2012 13:37:32 +0000 Subject: xen/arm: receive Xen events on ARM Compile events.c on ARM. Parse, map and enable the IRQ to get event notifications from the device tree (node "/xen"). 
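A rough sketch of the device-tree lookup this describes (the compatible string and the surrounding handling are assumptions for illustration, not taken from the patch):

	/* Illustrative only: locate the Xen node and map its event IRQ. */
	struct device_node *node;
	unsigned int irq = 0;

	node = of_find_compatible_node(NULL, NULL, "xen,xen");	/* assumed compatible string */
	if (node)
		irq = irq_of_parse_and_map(node, 0);		/* 0 means no usable interrupt */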
Signed-off-by: Stefano Stabellini Acked-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 1 + arch/x86/xen/irq.c | 1 + arch/x86/xen/xen-ops.h | 1 - 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 47b3acd..689a4c9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 1573376..01a4dc0 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index bb5a810..a95b417 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -35,7 +35,6 @@ void xen_set_pat(u64); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); -void __init xen_init_IRQ(void); void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); -- cgit v1.1 From c6fd893da927c6cefb2ece22402765379921a834 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 31 Jul 2012 10:29:14 -0700 Subject: x86, avx: don't use avx instructions with "noxsave" boot param Clear AVX, AVX2 features along with clearing XSAVE feature bits, as part of the parsing "noxsave" parameter. Fixes the kernel boot panic with "noxsave" boot parameter. We could have checked cpu_has_osxsave along with cpu_has_avx etc, but Peter mentioned clearing the feature bits will be better for uses like static_cpu_has() etc. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1343755754.2041.2.camel@sbsiddha-desk.sc.intel.com Cc: # v3.5 Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 46d8786..a5fbc3c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -144,6 +144,8 @@ static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); return 1; } __setup("noxsave", x86_xsave_setup); -- cgit v1.1 From 64eb0620296f924d5fded755c5ed95fb73649e06 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 8 Aug 2012 15:24:36 +0300 Subject: KVM: correctly detect APIC SW state in kvm_apic_post_state_restore() For apic_set_spiv() to track APIC SW state correctly it needs to see previous and next values of the spurious vector register, but currently memset() overwrite the old value before apic_set_spiv() get a chance to do tracking. Fix it by calling apic_set_spiv() before overwriting old value. 
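Schematically, the fix is an ordering change in the restore path (simplified; see the diff below for the real context): apply the incoming SPIV value through the tracking helper first, so it can compare the old and new enable bits, and only then bulk-copy the register page.

	/* Simplified schematic of the fixed ordering. */
	apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));	/* tracking sees old vs. new */
	memcpy(apic->regs, s->regs, sizeof(*s));		/* now overwrite the rest */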
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 7 +++++-- arch/x86/kvm/lapic.h | 3 ++- arch/x86/kvm/x86.c | 3 +-- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 333c27f..18d149d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1389,13 +1389,16 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) return vector; } -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) { struct kvm_lapic *apic = vcpu->arch.apic; kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); + /* set SPIV separately to get count of SW disabled APICs right */ + apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); + memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_set_version(vcpu); - apic_set_spiv(apic, kvm_apic_get_reg(apic, APIC_SPIV)); apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 2ad9caa..615a8b0 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -54,7 +54,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8ebf65c..91a5958 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2348,8 +2348,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - kvm_apic_post_state_restore(vcpu); + kvm_apic_post_state_restore(vcpu, s); update_cr8_intercept(vcpu); return 0; -- cgit v1.1 From 4670a300a2169e1e922593c5d35b0cdaee112901 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 9 Aug 2012 10:59:21 -0700 Subject: x86/mce: Make cmci_discover() quiet cmci_discover() works out which machine check banks support CMCI, and which of those are shared by multiple logical processors. It uses this information to ensure that exactly one cpu is designated the owner of each bank so that when interrupts are broadcast to multiple cpus, only one of them will look in a shared bank to log the error and clear the bank. At boot time cmci_discover() performs this task silently. But during certain cpu hotplug operations it prints out a set of summary lines like this: CPU 35 MCA banks CMCI:0 CMCI:1 CMCI:3 CMCI:5 CMCI:6 CMCI:7 CMCI:8 CMCI:9 CMCI:10 CMCI:11 CPU 1 MCA banks CMCI:0 CMCI:1 CMCI:3 CPU 39 MCA banks CMCI:0 CMCI:1 CMCI:3 CPU 38 MCA banks CMCI:0 CMCI:1 CMCI:3 CPU 32 MCA banks CMCI:0 CMCI:1 CMCI:3 CPU 37 MCA banks CMCI:0 CMCI:1 CMCI:3 CPU 36 MCA banks CMCI:0 CMCI:1 CMCI:3 CPU 34 MCA banks CMCI:0 CMCI:1 CMCI:3 The value of these messages seems very low. A user might painstakingly cross-check against the data sheet for a processor to ensure that all CMCI supported banks are correctly reported, but this seems improbable. If users really wanted to do this, we should print the information at boot time too. Remove the messages. 
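For context, the ownership scheme the function implements is a claim-by-writeback probe per bank; a simplified sketch (variable names follow the function, CMCI_THRESHOLD is the interrupt threshold it programs):

	/* Simplified per-bank probe: try to claim the bank and see whether the
	 * enable bit sticks. */
	rdmsrl(MSR_IA32_MCx_CTL2(i), val);

	if (val & MCI_CTL2_CMCI_EN) {
		/* Already owned by another CPU sharing this bank; skip it here. */
	} else {
		val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
		wrmsrl(MSR_IA32_MCx_CTL2(i), val);
		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
		if (val & MCI_CTL2_CMCI_EN)
			set_bit(i, owned);	/* the write stuck: this CPU owns the bank */
	}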
Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 38e49bc9..59648e4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -65,24 +65,15 @@ static void intel_threshold_interrupt(void) mce_notify_irq(); } -static void print_update(char *type, int *hdr, int num) -{ - if (*hdr == 0) - printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); - *hdr = 1; - printk(KERN_CONT " %s:%d", type, num); -} - /* * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks * on this CPU. Use the algorithm recommended in the SDM to discover shared * banks. */ -static void cmci_discover(int banks, int boot) +static void cmci_discover(int banks) { unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); unsigned long flags; - int hdr = 0; int i; raw_spin_lock_irqsave(&cmci_discover_lock, flags); @@ -96,8 +87,7 @@ static void cmci_discover(int banks, int boot) /* Already owned by someone else? */ if (val & MCI_CTL2_CMCI_EN) { - if (test_and_clear_bit(i, owned) && !boot) - print_update("SHD", &hdr, i); + clear_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } @@ -109,16 +99,13 @@ static void cmci_discover(int banks, int boot) /* Did the enable bit stick? -- the bank supports CMCI */ if (val & MCI_CTL2_CMCI_EN) { - if (!test_and_set_bit(i, owned) && !boot) - print_update("CMCI", &hdr, i); + set_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); } else { WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); - if (hdr) - printk(KERN_CONT "\n"); } /* @@ -186,7 +173,7 @@ void cmci_rediscover(int dying) continue; /* Recheck banks in case CPUs don't all have the same */ if (cmci_supported(&banks)) - cmci_discover(banks, 0); + cmci_discover(banks); } set_cpus_allowed_ptr(current, old); @@ -200,7 +187,7 @@ void cmci_reenable(void) { int banks; if (cmci_supported(&banks)) - cmci_discover(banks, 0); + cmci_discover(banks); } static void intel_init_cmci(void) @@ -211,7 +198,7 @@ static void intel_init_cmci(void) return; mce_threshold_vector = intel_threshold_interrupt; - cmci_discover(banks, 1); + cmci_discover(banks); /* * For CPU #0 this runs with still disabled APIC, but that's * ok because only the vector is set up. We still do another -- cgit v1.1 From 55babd8f41f122f5f4c7cebf520c766c983282c6 Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Thu, 9 Aug 2012 11:44:51 -0700 Subject: x86/mce: Add CMCI poll mode On Intel systems corrected machine check interrupts (CMCI) may be sent to multiple logical processors; possibly to all processors on the affected socket (SDM Volume 3B "15.5.1 CMCI Local APIC Interface"). This means that a persistent error (such as a stuck bit in ECC memory) may cause a storm of interrupts that greatly hinders or prevents forward progress (probably on many processors). To solve this we keep track of the rate at which each processor sees CMCI. If we exceed a threshold, we disable CMCI delivery and switch to polling the machine check banks. If the storm subsides (none of the affected processors see any more errors for a complete poll interval) we re-enable CMCI. 
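Stripped of the per-CPU plumbing, the detection described above is a small rate limiter with hysteresis. The sketch below is a simplified, self-contained illustration (plain C, second-granularity time instead of jiffies, invented names); the real version is the cmci_storm_detect()/mce_intel_adjust_timer() code added further down.

    #include <stdbool.h>
    #include <time.h>

    #define STORM_WINDOW_SEC  1             /* CMCI_STORM_INTERVAL in the patch */
    #define STORM_THRESHOLD   15            /* CMCI_STORM_THRESHOLD in the patch */

    struct cmci_state {
        time_t   window_start;
        unsigned count;
        bool     storming;
    };

    /* returns true while the CPU should stay in poll mode */
    static bool storm_detect(struct cmci_state *s, time_t now)
    {
        if (s->storming)
            return true;

        if (now - s->window_start <= STORM_WINDOW_SEC) {
            s->count++;                     /* still inside the rate window */
        } else {
            s->count = 1;                   /* open a new window */
            s->window_start = now;
        }

        if (s->count <= STORM_THRESHOLD)
            return false;

        s->storming = true;                 /* caller masks CMCI and arms the poll timer */
        return true;
    }

Leaving storm mode is the mirror image: once a full poll interval passes with no events on any affected CPU, CMCI is re-enabled, which is what the SUBSIDED state below implements.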
[Tony: Added console messages when storm begins/ends and increased storm threshold from 5 to 15 so we have a few more logged entries before we disable interrupts and start dropping reports] Signed-off-by: Chen Gong Signed-off-by: Thomas Gleixner Tested-by: Chen Gong Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 12 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 47 +++++++++++-- arch/x86/kernel/cpu/mcheck/mce_intel.c | 108 +++++++++++++++++++++++++++++- 3 files changed, 160 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index ed44c8a..6a05c1d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -28,6 +28,18 @@ extern int mce_ser; extern struct mce_bank *mce_banks; +#ifdef CONFIG_X86_MCE_INTEL +unsigned long mce_intel_adjust_timer(unsigned long interval); +void mce_intel_cmci_poll(void); +void mce_intel_hcpu_update(unsigned long cpu); +#else +# define mce_intel_adjust_timer mce_adjust_timer_default +static inline void mce_intel_cmci_poll(void) { } +static inline void mce_intel_hcpu_update(unsigned long cpu) { } +#endif + +void mce_timer_kick(unsigned long interval); + #ifdef CONFIG_ACPI_APEI int apei_write_mce(struct mce *m); ssize_t apei_read_mce(struct mce *m, u64 *record_id); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b4dde15..8c1beea 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1260,6 +1260,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */ static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); +static unsigned long mce_adjust_timer_default(unsigned long interval) +{ + return interval; +} + +static unsigned long (*mce_adjust_timer)(unsigned long interval) = + mce_adjust_timer_default; + static void mce_timer_fn(unsigned long data) { struct timer_list *t = &__get_cpu_var(mce_timer); @@ -1270,6 +1278,7 @@ static void mce_timer_fn(unsigned long data) if (mce_available(__this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); + mce_intel_cmci_poll(); } /* @@ -1277,14 +1286,38 @@ static void mce_timer_fn(unsigned long data) * polling interval, otherwise increase the polling interval. */ iv = __this_cpu_read(mce_next_interval); - if (mce_notify_irq()) + if (mce_notify_irq()) { iv = max(iv / 2, (unsigned long) HZ/100); - else + } else { iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + iv = mce_adjust_timer(iv); + } __this_cpu_write(mce_next_interval, iv); + /* Might have become 0 after CMCI storm subsided */ + if (iv) { + t->expires = jiffies + iv; + add_timer_on(t, smp_processor_id()); + } +} - t->expires = jiffies + iv; - add_timer_on(t, smp_processor_id()); +/* + * Ensure that the timer is firing in @interval from now. 
+ */ +void mce_timer_kick(unsigned long interval) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long when = jiffies + interval; + unsigned long iv = __this_cpu_read(mce_next_interval); + + if (timer_pending(t)) { + if (time_before(when, t->expires)) + mod_timer_pinned(t, when); + } else { + t->expires = round_jiffies(when); + add_timer_on(t, smp_processor_id()); + } + if (interval < iv) + __this_cpu_write(mce_next_interval, interval); } /* Must not be called in IRQ context where del_timer_sync() can deadlock */ @@ -1548,6 +1581,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); + mce_adjust_timer = mce_intel_adjust_timer; break; case X86_VENDOR_AMD: mce_amd_feature_init(c); @@ -1559,7 +1593,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) static void mce_start_timer(unsigned int cpu, struct timer_list *t) { - unsigned long iv = check_interval * HZ; + unsigned long iv = mce_adjust_timer(check_interval * HZ); __this_cpu_write(mce_next_interval, iv); @@ -2272,10 +2306,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); mce_device_remove(cpu); + mce_intel_hcpu_update(cpu); break; case CPU_DOWN_PREPARE: - del_timer_sync(t); smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + del_timer_sync(t); break; case CPU_DOWN_FAILED: smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 59648e4..098386f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -15,6 +15,8 @@ #include #include +#include "mce-internal.h" + /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. @@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); */ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); -#define CMCI_THRESHOLD 1 +#define CMCI_THRESHOLD 1 +#define CMCI_POLL_INTERVAL (30 * HZ) +#define CMCI_STORM_INTERVAL (1 * HZ) +#define CMCI_STORM_THRESHOLD 15 + +static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int, cmci_storm_state); + +enum { + CMCI_STORM_NONE, + CMCI_STORM_ACTIVE, + CMCI_STORM_SUBSIDED, +}; + +static atomic_t cmci_storm_on_cpus; static int cmci_supported(int *banks) { @@ -53,6 +70,93 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +void mce_intel_cmci_poll(void) +{ + if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) + return; + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); +} + +void mce_intel_hcpu_update(unsigned long cpu) +{ + if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) + atomic_dec(&cmci_storm_on_cpus); + + per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +} + +unsigned long mce_intel_adjust_timer(unsigned long interval) +{ + int r; + + if (interval < CMCI_POLL_INTERVAL) + return interval; + + switch (__this_cpu_read(cmci_storm_state)) { + case CMCI_STORM_ACTIVE: + /* + * We switch back to interrupt mode once the poll timer has + * silenced itself. That means no events recorded and the + * timer interval is back to our poll interval. 
+ */ + __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); + r = atomic_sub_return(1, &cmci_storm_on_cpus); + if (r == 0) + pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + /* FALLTHROUGH */ + + case CMCI_STORM_SUBSIDED: + /* + * We wait for all cpus to go back to SUBSIDED + * state. When that happens we switch back to + * interrupt mode. + */ + if (!atomic_read(&cmci_storm_on_cpus)) { + __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); + cmci_reenable(); + cmci_recheck(); + } + return CMCI_POLL_INTERVAL; + default: + /* + * We have shiny weather. Let the poll do whatever it + * thinks. + */ + return interval; + } +} + +static bool cmci_storm_detect(void) +{ + unsigned int cnt = __this_cpu_read(cmci_storm_cnt); + unsigned long ts = __this_cpu_read(cmci_time_stamp); + unsigned long now = jiffies; + int r; + + if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) + return true; + + if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { + cnt++; + } else { + cnt = 1; + __this_cpu_write(cmci_time_stamp, now); + } + __this_cpu_write(cmci_storm_cnt, cnt); + + if (cnt <= CMCI_STORM_THRESHOLD) + return false; + + cmci_clear(); + __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); + r = atomic_add_return(1, &cmci_storm_on_cpus); + mce_timer_kick(CMCI_POLL_INTERVAL); + + if (r == 1) + pr_notice("CMCI storm detected: switching to poll mode\n"); + return true; +} + /* * The interrupt handler. This is called on every event. * Just call the poller directly to log any events. @@ -61,6 +165,8 @@ static int cmci_supported(int *banks) */ static void intel_threshold_interrupt(void) { + if (cmci_storm_detect()) + return; machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); mce_notify_irq(); } -- cgit v1.1 From c5e63197db519bae1c33e41ea0342a50f39e7a93 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Aug 2012 15:20:36 +0200 Subject: perf: Unified API to record selective sets of arch registers This brings a new API to help the selective dump of registers on event sampling, and its implementation for x86 arch. Added HAVE_PERF_REGS config option to determine if the architecture provides perf registers ABI. The information about desired registers will be passed in u64 mask. It's up to the architecture to map the registers into the mask bits. For the x86 arch implementation, both 32 and 64 bit registers bits are defined within single enum to ensure 64 bit system can provide register dump for compat task if needed in the future. Original-patch-by: Frederic Weisbecker [ Added missing linux/errno.h include ] Signed-off-by: Jiri Olsa Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. 
Eigler Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-2-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/Kconfig | 1 + arch/x86/include/asm/perf_regs.h | 33 +++++++++++++++ arch/x86/kernel/Makefile | 2 + arch/x86/kernel/perf_regs.c | 90 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 126 insertions(+) create mode 100644 arch/x86/include/asm/perf_regs.h create mode 100644 arch/x86/kernel/perf_regs.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ec3a1a..3fab6ec 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,7 @@ config X86 select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS select HAVE_PERF_EVENTS_NMI + select HAVE_PERF_REGS select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 diff --git a/arch/x86/include/asm/perf_regs.h b/arch/x86/include/asm/perf_regs.h new file mode 100644 index 0000000..3f2207b --- /dev/null +++ b/arch/x86/include/asm/perf_regs.h @@ -0,0 +1,33 @@ +#ifndef _ASM_X86_PERF_REGS_H +#define _ASM_X86_PERF_REGS_H + +enum perf_event_x86_regs { + PERF_REG_X86_AX, + PERF_REG_X86_BX, + PERF_REG_X86_CX, + PERF_REG_X86_DX, + PERF_REG_X86_SI, + PERF_REG_X86_DI, + PERF_REG_X86_BP, + PERF_REG_X86_SP, + PERF_REG_X86_IP, + PERF_REG_X86_FLAGS, + PERF_REG_X86_CS, + PERF_REG_X86_SS, + PERF_REG_X86_DS, + PERF_REG_X86_ES, + PERF_REG_X86_FS, + PERF_REG_X86_GS, + PERF_REG_X86_R8, + PERF_REG_X86_R9, + PERF_REG_X86_R10, + PERF_REG_X86_R11, + PERF_REG_X86_R12, + PERF_REG_X86_R13, + PERF_REG_X86_R14, + PERF_REG_X86_R15, + + PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1, + PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1, +}; +#endif /* _ASM_X86_PERF_REGS_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8215e56..8d7a619 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,6 +100,8 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o +obj-$(CONFIG_PERF_EVENTS) += perf_regs.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c new file mode 100644 index 0000000..3d692352 --- /dev/null +++ b/arch/x86/kernel/perf_regs.c @@ -0,0 +1,90 @@ +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_32 +#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX +#else +#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX +#endif + +#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) + +static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { + PT_REGS_OFFSET(PERF_REG_X86_AX, ax), + PT_REGS_OFFSET(PERF_REG_X86_BX, bx), + PT_REGS_OFFSET(PERF_REG_X86_CX, cx), + PT_REGS_OFFSET(PERF_REG_X86_DX, dx), + PT_REGS_OFFSET(PERF_REG_X86_SI, si), + PT_REGS_OFFSET(PERF_REG_X86_DI, di), + PT_REGS_OFFSET(PERF_REG_X86_BP, bp), + PT_REGS_OFFSET(PERF_REG_X86_SP, sp), + PT_REGS_OFFSET(PERF_REG_X86_IP, ip), + PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags), + PT_REGS_OFFSET(PERF_REG_X86_CS, cs), + PT_REGS_OFFSET(PERF_REG_X86_SS, ss), +#ifdef CONFIG_X86_32 + PT_REGS_OFFSET(PERF_REG_X86_DS, ds), + PT_REGS_OFFSET(PERF_REG_X86_ES, es), + PT_REGS_OFFSET(PERF_REG_X86_FS, fs), + PT_REGS_OFFSET(PERF_REG_X86_GS, gs), +#else + /* + * The pt_regs struct does not store + * ds, es, fs, gs in 64 bit mode. 
+ */ + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, +#endif +#ifdef CONFIG_X86_64 + PT_REGS_OFFSET(PERF_REG_X86_R8, r8), + PT_REGS_OFFSET(PERF_REG_X86_R9, r9), + PT_REGS_OFFSET(PERF_REG_X86_R10, r10), + PT_REGS_OFFSET(PERF_REG_X86_R11, r11), + PT_REGS_OFFSET(PERF_REG_X86_R12, r12), + PT_REGS_OFFSET(PERF_REG_X86_R13, r13), + PT_REGS_OFFSET(PERF_REG_X86_R14, r14), + PT_REGS_OFFSET(PERF_REG_X86_R15, r15), +#endif +}; + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + if (WARN_ON_ONCE(idx > ARRAY_SIZE(pt_regs_offset))) + return 0; + + return regs_get_register(regs, pt_regs_offset[idx]); +} + +#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL)) + +#ifdef CONFIG_X86_32 +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + return 0; +} +#else /* CONFIG_X86_64 */ +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ + (1ULL << PERF_REG_X86_ES) | \ + (1ULL << PERF_REG_X86_FS) | \ + (1ULL << PERF_REG_X86_GS)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + if (mask & REG_NOSUPPORT) + return -EINVAL; + + return 0; +} +#endif /* CONFIG_X86_32 */ -- cgit v1.1 From 4018994f3d8785275ef0e7391b75c3462c029e56 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Aug 2012 15:20:37 +0200 Subject: perf: Add ability to attach user level registers dump to sample Introducing PERF_SAMPLE_REGS_USER sample type bit to trigger the dump of user level registers on sample. Registers we want to dump are specified by sample_regs_user bitmask. Only user level registers are dumped at the moment. Meaning the register values of the user space context as it was before the user entered the kernel for whatever reason (syscall, irq, exception, or a PMI happening in userspace). The layout of the sample_regs_user bitmap is described in asm/perf_regs.h for archs that support register dump. This is going to be useful to bring Dwarf CFI based stack unwinding on top of samples. Original-patch-by: Frederic Weisbecker [ Dump registers ABI specification. ] Signed-off-by: Jiri Olsa Suggested-by: Stephane Eranian Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. 
Eigler Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-3-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/perf_regs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 3d692352..c5a3e5c 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include #include @@ -71,6 +73,11 @@ int perf_reg_validate(u64 mask) return 0; } + +u64 perf_reg_abi(struct task_struct *task) +{ + return PERF_SAMPLE_REGS_ABI_32; +} #else /* CONFIG_X86_64 */ #define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ (1ULL << PERF_REG_X86_ES) | \ @@ -87,4 +94,12 @@ int perf_reg_validate(u64 mask) return 0; } + +u64 perf_reg_abi(struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_IA32)) + return PERF_SAMPLE_REGS_ABI_32; + else + return PERF_SAMPLE_REGS_ABI_64; +} #endif /* CONFIG_X86_32 */ -- cgit v1.1 From 91d7753a45f8525dc75b6be01e427dc1c378dc16 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Aug 2012 15:20:38 +0200 Subject: perf: Factor __output_copy to be usable with specific copy function Adding a generic way to use __output_copy function with specific copy function via DEFINE_PERF_OUTPUT_COPY macro. Using this to add new __output_copy_user function, that provides output copy from user pointers. For x86 the copy_from_user_nmi function is used and __copy_from_user_inatomic for the rest of the architectures. This new function will be used in user stack dump on sample, coming in next patches. Signed-off-by: Jiri Olsa Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Ingo Molnar Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-4-git-send-email-jolsa@redhat.com Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/include/asm/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index cb4e43b..4fabcdf 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -262,4 +262,6 @@ static inline void perf_check_microcode(void) { } static inline void amd_pmu_disable_virt(void) { } #endif +#define arch_perf_out_copy_user copy_from_user_nmi + #endif /* _ASM_X86_PERF_EVENT_H */ -- cgit v1.1 From c5ebcedb566ef17bda7b02686e0d658a7bb42ee7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Aug 2012 15:20:40 +0200 Subject: perf: Add ability to attach user stack dump to sample Introducing PERF_SAMPLE_STACK_USER sample type bit to trigger the dump of the user level stack on sample. The size of the dump is specified by sample_stack_user value. Being able to dump parts of the user stack, starting from the stack pointer, will be useful to make a post mortem dwarf CFI based stack unwinding. Added HAVE_PERF_USER_STACK_DUMP config option to determine if the architecture provides user stack dump on perf event samples. This needs access to the user stack pointer which is not unified across architectures. 
Enabling this for x86 architecture. Signed-off-by: Jiri Olsa Original-patch-by: Frederic Weisbecker Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-6-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3fab6ec..a2d19ee 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,6 +61,7 @@ config X86 select PERF_EVENTS select HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 -- cgit v1.1 From 484d90eec884d814b005c9736bcf3fd018acba65 Mon Sep 17 00:00:00 2001 From: Andrew Boie Date: Fri, 10 Aug 2012 11:49:06 -0700 Subject: x86, build: Globally set -fno-pic GCC built with nonstandard options can enable -fpic by default. We never want this for 32-bit kernels and it will break the build. [ hpa: Notably the Android toolchain apparently does this. ] Change-Id: Iaab7d66e598b1c65ac4a4f0229eca2cd3d0d2898 Signed-off-by: Andrew Boie Link: http://lkml.kernel.org/r/1344624546-29691-1-git-send-email-andrew.p.boie@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/Makefile | 4 ++++ arch/x86/boot/Makefile | 2 +- arch/x86/realmode/rm/Makefile | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b0c5276..682e9c2 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -27,6 +27,10 @@ ifeq ($(CONFIG_X86_32),y) KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return + # Never want PIC in a 32-bit kernel, prevent breakage with GCC built + # with nonstandard options + KBUILD_CFLAGS += -fno-pic + # prevent gcc from keeping the stack 16 byte aligned KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 5a747dd..f7535be 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -57,7 +57,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ -include $(srctree)/$(src)/code16gcc.h \ - -fno-strict-aliasing -fomit-frame-pointer \ + -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ $(call cc-option, -ffreestanding) \ $(call cc-option, -fno-toplevel-reorder,\ $(call cc-option, -fno-unit-at-a-time)) \ diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index b2d534c..8869287 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -72,7 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ -include $(srctree)/$(src)/../../boot/code16gcc.h \ - -fno-strict-aliasing -fomit-frame-pointer \ + -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ $(call cc-option, -ffreestanding) \ $(call cc-option, -fno-toplevel-reorder,\ $(call cc-option, -fno-unit-at-a-time)) \ -- cgit v1.1 From cffa59baa5f1cf3e3e9e172697db48912471531c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 2 Aug 2012 12:55:27 +0200 Subject: perf, x86: Fix uncore_types_exit section mismatch Fix the 
following section mismatch: WARNING: arch/x86/kernel/cpu/built-in.o(.text+0x7ad9): Section mismatch in reference from the function uncore_types_exit() to the function .init.text:uncore_type_exit() The function uncore_types_exit() references the function __init uncore_type_exit(). This is often because uncore_types_exit lacks a __init annotation or the annotation of uncore_type_exit is wrong. caused by 14371cce03c2 ("perf: Add generic PCI uncore PMU device support"). Cc: Zheng Yan Cc: Ingo Molnar Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-8-git-send-email-zheng.z.yan@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 7563fda..a7ccd68 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2373,7 +2373,7 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) type->attr_groups[1] = NULL; } -static void uncore_types_exit(struct intel_uncore_type **types) +static void __init uncore_types_exit(struct intel_uncore_type **types) { int i; for (i = 0; types[i]; i++) -- cgit v1.1 From ebb6cc03596cc89c89670473282ea46573feb34f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 Aug 2012 13:11:21 +0800 Subject: perf/x86: Fixes for Nehalem-EX uncore driver This patch includes following fixes and update: - Only some events in the Sbox and Mbox can use the match/mask registers, add code to check this. - The format definitions for xbr_mm_cfg and xbr_match registers in the Rbox are wrong, xbr_mm_cfg should use 32 bits, xbr_match should use 64 bits. - Cleanup the Rbox code. Compute the addresses extra registers in the enable_event function instead of the hw_config function. This simplifies the code in nhmex_rbox_alter_er(). 
Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344229882-3907-2-git-send-email-zheng.z.yan@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 195 ++++++++++++-------------- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 1 + 2 files changed, 87 insertions(+), 109 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index a7ccd68..84434e2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -796,7 +796,6 @@ static struct intel_uncore_type *nhm_msr_uncores[] = { DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); -DEFINE_UNCORE_FORMAT_ATTR(mm_cfg, mm_cfg, "config:63"); DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); @@ -1032,24 +1031,22 @@ static struct intel_uncore_type nhmex_uncore_bbox = { static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) { - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - if (event->attr.config & NHMEX_S_PMON_MM_CFG_EN) { - reg1->config = event->attr.config1; - reg2->config = event->attr.config2; - } else { - reg1->config = ~0ULL; - reg2->config = ~0ULL; - } + /* only TO_R_PROG_EV event uses the match/mask register */ + if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != + NHMEX_S_EVENT_TO_R_PROG_EV) + return 0; if (box->pmu->pmu_idx == 0) reg1->reg = NHMEX_S0_MSR_MM_CFG; else reg1->reg = NHMEX_S1_MSR_MM_CFG; - reg1->idx = 0; - + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; return 0; } @@ -1059,8 +1056,8 @@ static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct per struct hw_perf_event_extra *reg1 = &hwc->extra_reg; struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - wrmsrl(reg1->reg, 0); - if (reg1->config != ~0ULL || reg2->config != ~0ULL) { + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, 0); wrmsrl(reg1->reg + 1, reg1->config); wrmsrl(reg1->reg + 2, reg2->config); wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); @@ -1074,7 +1071,6 @@ static struct attribute *nhmex_uncore_sbox_formats_attr[] = { &format_attr_edge.attr, &format_attr_inv.attr, &format_attr_thresh8.attr, - &format_attr_mm_cfg.attr, &format_attr_match.attr, &format_attr_mask.attr, NULL, @@ -1264,7 +1260,8 @@ again: } /* for the match/mask registers */ - if ((uncore_box_is_fake(box) || !reg2->alloc) && + if (reg2->idx != EXTRA_REG_NONE && + (uncore_box_is_fake(box) || !reg2->alloc) && !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) goto fail; @@ -1278,7 +1275,8 @@ again: if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) nhmex_mbox_alter_er(event, idx[0], true); reg1->alloc |= alloc; - reg2->alloc = 1; + if (reg2->idx != EXTRA_REG_NONE) + reg2->alloc = 1; } return NULL; fail: @@ -1342,9 +1340,6 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event struct extra_reg *er; unsigned msr; int reg_idx = 0; - - if (WARN_ON_ONCE(reg1->idx != -1)) - return -EINVAL; /* * The mbox events may require 2 extra MSRs at the most. 
But only * the lower 32 bits in these MSRs are significant, so we can use @@ -1355,11 +1350,6 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; - if (er->idx == __BITS_VALUE(reg1->idx, 0, 8) || - er->idx == __BITS_VALUE(reg1->idx, 1, 8)) - continue; - if (WARN_ON_ONCE(reg_idx >= 2)) - return -EINVAL; msr = er->msr + type->msr_offset * box->pmu->pmu_idx; if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) @@ -1368,6 +1358,8 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event /* always use the 32~63 bits to pass the PLD config */ if (er->idx == EXTRA_REG_NHMEX_M_PLD) reg_idx = 1; + else if (WARN_ON_ONCE(reg_idx > 0)) + return -EINVAL; reg1->idx &= ~(0xff << (reg_idx * 8)); reg1->reg &= ~(0xffff << (reg_idx * 16)); @@ -1376,17 +1368,21 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event reg1->config = event->attr.config1; reg_idx++; } - /* use config2 to pass the filter config */ - reg2->idx = EXTRA_REG_NHMEX_M_FILTER; - if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) - reg2->config = event->attr.config2; - else - reg2->config = ~0ULL; - if (box->pmu->pmu_idx == 0) - reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; - else - reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; - + /* + * The mbox only provides ability to perform address matching + * for the PLD events. + */ + if (reg_idx == 2) { + reg2->idx = EXTRA_REG_NHMEX_M_FILTER; + if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) + reg2->config = event->attr.config2; + else + reg2->config = ~0ULL; + if (box->pmu->pmu_idx == 0) + reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; + else + reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; + } return 0; } @@ -1422,34 +1418,36 @@ static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct per wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), nhmex_mbox_shared_reg_config(box, idx)); - wrmsrl(reg2->reg, 0); - if (reg2->config != ~0ULL) { - wrmsrl(reg2->reg + 1, - reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); - wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & - (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); - wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + if (reg2->idx != EXTRA_REG_NONE) { + wrmsrl(reg2->reg, 0); + if (reg2->config != ~0ULL) { + wrmsrl(reg2->reg + 1, + reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); + wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); + wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + } } wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); } -DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); -DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); -DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); -DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); -DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); -DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); -DEFINE_UNCORE_FORMAT_ATTR(filter_cfg, filter_cfg, "config2:63"); -DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); -DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); -DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pld, pld, 
"config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); +DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); +DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); +DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); +DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); +DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); static struct attribute *nhmex_uncore_mbox_formats_attr[] = { &format_attr_count_mode.attr, @@ -1458,7 +1456,7 @@ static struct attribute *nhmex_uncore_mbox_formats_attr[] = { &format_attr_flag_mode.attr, &format_attr_inc_sel.attr, &format_attr_set_flag_sel.attr, - &format_attr_filter_cfg.attr, + &format_attr_filter_cfg_en.attr, &format_attr_filter_match.attr, &format_attr_filter_mask.attr, &format_attr_dsp.attr, @@ -1513,7 +1511,7 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) struct hw_perf_event_extra *reg1 = &hwc->extra_reg; int port; - /* adjust the main event selector */ + /* adjust the main event selector and extra register index */ if (reg1->idx % 2) { reg1->idx--; hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; @@ -1522,29 +1520,17 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; } - /* adjust address or config of extra register */ + /* adjust extra register config */ port = reg1->idx / 6 + box->pmu->pmu_idx * 4; switch (reg1->idx % 6) { - case 0: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port); - break; - case 1: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); - break; case 2: - /* the 8~15 bits to the 0~7 bits */ + /* shift the 8~15 bits to the 0~7 bits */ reg1->config >>= 8; break; case 3: - /* the 0~7 bits to the 8~15 bits */ + /* shift the 0~7 bits to the 8~15 bits */ reg1->config <<= 8; break; - case 4: - reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); - break; - case 5: - reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); - break; }; } @@ -1671,7 +1657,7 @@ static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - int port, idx; + int idx; idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> NHMEX_R_PMON_CTL_EV_SEL_SHIFT; @@ -1681,27 +1667,11 @@ static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event reg1->idx = idx; reg1->config = event->attr.config1; - port = idx / 6 + box->pmu->pmu_idx * 4; - idx %= 6; - switch (idx) { - case 0: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port); - break; - case 1: - reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); - break; - case 2: - case 3: - reg1->reg = NHMEX_R_MSR_PORTN_QLX_CFG(port); - break; + switch (idx % 6) { case 4: case 5: - if (idx == 4) - reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); - else - 
reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); - reg2->config = event->attr.config2; hwc->config |= event->attr.config & (~0ULL << 32); + reg2->config = event->attr.config2; break; }; return 0; @@ -1727,28 +1697,34 @@ static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct per struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &hwc->extra_reg; struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int idx, er_idx; + int idx, port; - idx = reg1->idx % 6; - er_idx = idx; - if (er_idx > 2) - er_idx--; - er_idx += (reg1->idx / 6) * 5; + idx = reg1->idx; + port = idx / 6 + box->pmu->pmu_idx * 4; - switch (idx) { + switch (idx % 6) { case 0: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); + break; case 1: - wrmsrl(reg1->reg, reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); break; case 2: case 3: - wrmsrl(reg1->reg, nhmex_rbox_shared_reg_config(box, er_idx)); + wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), + nhmex_rbox_shared_reg_config(box, 2 + (idx / 6) * 5)); break; case 4: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); + break; case 5: - wrmsrl(reg1->reg, reg1->config); - wrmsrl(reg1->reg + 1, hwc->config >> 32); - wrmsrl(reg1->reg + 2, reg2->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); break; }; @@ -1756,8 +1732,8 @@ static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct per (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); } -DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config:32-63"); -DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); @@ -2303,6 +2279,7 @@ int uncore_pmu_event_init(struct perf_event *event) event->hw.idx = -1; event->hw.last_tag = ~0ULL; event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; if (event->attr.config == UNCORE_FIXED_EVENT) { /* no fixed counter */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index c9e5dc5..8384e9b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -230,6 +230,7 @@ #define NHMEX_S1_MSR_MASK 0xe5a #define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) +#define NHMEX_S_EVENT_TO_R_PROG_EV 0 /* NHM-EX Mbox */ #define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 -- cgit v1.1 From cb37af77124e8532e6ae3f9ca332593ba423b5f8 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 Aug 2012 13:11:22 +0800 Subject: perf/x86: Add Intel Westmere-EX uncore support The Westmere-EX uncore is similar to the Nehalem-EX uncore. The differences are: - Westmere-EX uncore has 10 instances of Cbox. The MSRs for Cbox8 and Cbox9 in the Westmere-EX aren't contiguous with Cbox 0~7. - The fvid field in the ZDP_CTL_FVC register in the Mbox is different. It's 5 bits in the Nehalem-EX, 6 bits in the Westmere-EX. 
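The non-contiguous Cbox8/Cbox9 registers are what force the move from a single msr_offset stride to a per-instance offset table in this patch. Reduced to its essentials (helper name invented, offsets copied from the patch):

    /* Cbox MSR offsets from the patch; instances 8 and 9 are not contiguous */
    static const unsigned cbox_offsets[] = {
        0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
    };

    static unsigned cbox_msr(unsigned base, unsigned idx)
    {
        return base + cbox_offsets[idx];    /* replaces "base + idx * msr_offset" */
    }

This is why uncore_msr_box_offset() below prefers type->msr_offsets[] when it is set and falls back to the old linear calculation otherwise.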
Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344229882-3907-3-git-send-email-zheng.z.yan@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 56 ++++++++++++++++++++++----- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 45 ++++++++++----------- 2 files changed, 68 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 84434e2..0a55710 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -901,16 +901,21 @@ static struct attribute_group nhmex_uncore_cbox_format_group = { .attrs = nhmex_uncore_cbox_formats_attr, }; +/* msr offset for each instance of cbox */ +static unsigned nhmex_cbox_msr_offsets[] = { + 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, +}; + static struct intel_uncore_type nhmex_uncore_cbox = { .name = "cbox", .num_counters = 6, - .num_boxes = 8, + .num_boxes = 10, .perf_ctr_bits = 48, .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, .event_mask = NHMEX_PMON_RAW_EVENT_MASK, .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, - .msr_offset = NHMEX_C_MSR_OFFSET, + .msr_offsets = nhmex_cbox_msr_offsets, .pair_ctr_ctl = 1, .ops = &nhmex_uncore_ops, .format_group = &nhmex_uncore_cbox_format_group @@ -1138,6 +1143,9 @@ static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { EVENT_EXTRA_END }; +/* Nehalem-EX or Westmere-EX ? */ +bool uncore_nhmex; + static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) { struct intel_uncore_extra_reg *er; @@ -1167,18 +1175,29 @@ static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 return false; /* mask of the shared fields */ - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; raw_spin_lock_irqsave(&er->lock, flags); /* add mask of the non-shared field if it's in use */ - if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) - mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { + if (uncore_nhmex) + mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + } if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { atomic_add(1 << (idx * 8), &er->ref); - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | - NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | + NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | + WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); er->config &= ~mask; er->config |= (config & mask); ret = true; @@ -1212,7 +1231,10 @@ u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) /* get the non-shared control bits and shift them */ idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (uncore_nhmex) + config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); if (new_idx > orig_idx) { idx = new_idx - orig_idx; config <<= 3 * idx; @@ -1222,6 +1244,10 @@ u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) } /* add the shared control bits back */ + if (uncore_nhmex) + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & 
reg1->config; + else + config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; if (modify) { /* adjust the main event selector */ @@ -1480,6 +1506,12 @@ static struct uncore_event_desc nhmex_uncore_mbox_events[] = { { /* end: all zeroes */ }, }; +static struct uncore_event_desc wsmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), + { /* end: all zeroes */ }, +}; + static struct intel_uncore_ops nhmex_uncore_mbox_ops = { NHMEX_UNCORE_OPS_COMMON_INIT(), .enable_event = nhmex_mbox_msr_enable_event, @@ -2791,7 +2823,13 @@ static int __init uncore_cpu_init(void) snbep_uncore_cbox.num_boxes = max_cores; msr_uncores = snbep_msr_uncores; break; - case 46: + case 46: /* Nehalem-EX */ + uncore_nhmex = true; + case 47: /* Westmere-EX aka. Xeon E7 */ + if (!uncore_nhmex) + nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; + if (nhmex_uncore_cbox.num_boxes > max_cores) + nhmex_uncore_cbox.num_boxes = max_cores; msr_uncores = nhmex_msr_uncores; break; default: diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 8384e9b..5b81c18 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -276,18 +276,12 @@ NHMEX_M_PMON_CTL_INC_SEL_MASK | \ NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) - -#define NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK 0x1f -#define NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK (0x7 << 5) -#define NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK (0x7 << 8) -#define NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR (1 << 23) -#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK \ - (NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK | \ - NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK | \ - NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK | \ - NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR) +#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) #define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (11 + 3 * (n))) +#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) +#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (12 + 3 * (n))) + /* * use the 9~13 bits to select event If the 7th bit is not set, * otherwise use the 19~21 bits to select event. @@ -369,6 +363,7 @@ struct intel_uncore_type { unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; + unsigned *msr_offsets; struct event_constraint unconstrainted; struct event_constraint *constraints; struct intel_uncore_pmu *pmus; @@ -486,29 +481,31 @@ unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) return idx * 8 + box->pmu->type->perf_ctr; } -static inline -unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box) +{ + struct intel_uncore_pmu *pmu = box->pmu; + return pmu->type->msr_offsets ? 
+ pmu->type->msr_offsets[pmu->pmu_idx] : + pmu->type->msr_offset * pmu->pmu_idx; +} + +static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) { if (!box->pmu->type->box_ctl) return 0; - return box->pmu->type->box_ctl + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + return box->pmu->type->box_ctl + uncore_msr_box_offset(box); } -static inline -unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) { if (!box->pmu->type->fixed_ctl) return 0; - return box->pmu->type->fixed_ctl + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box); } -static inline -unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) { - return box->pmu->type->fixed_ctr + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); } static inline @@ -516,7 +513,7 @@ unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) { return box->pmu->type->event_ctl + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + uncore_msr_box_offset(box); } static inline @@ -524,7 +521,7 @@ unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) { return box->pmu->type->perf_ctr + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - box->pmu->type->msr_offset * box->pmu->pmu_idx; + uncore_msr_box_offset(box); } static inline -- cgit v1.1 From 26a4f3c08de49c1437a7b7f97693cf22d8c31656 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 9 Aug 2012 11:52:34 +0300 Subject: perf/x86: disable PEBS on a guest entry. If PMU counter has PEBS enabled it is not enough to disable counter on a guest entry since PEBS memory write can overshoot guest entry and corrupt guest memory. Disabling PEBS during guest entry solves the problem. Tested-by: David Ahern Signed-off-by: Gleb Natapov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120809085234.GI3341@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3823669..7f2739e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1522,8 +1522,16 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + /* + * If PMU counter has PEBS enabled it is not enough to disable counter + * on a guest entry since PEBS memory write can overshoot guest entry + * and corrupt guest memory. Disabling PEBS solves the problem. + */ + arr[1].msr = MSR_IA32_PEBS_ENABLE; + arr[1].host = cpuc->pebs_enabled; + arr[1].guest = 0; - *nr = 1; + *nr = 2; return arr; } -- cgit v1.1 From 51d59c6b422f3f95940ae4e5b42f165595906aee Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Aug 2012 15:57:49 -0300 Subject: KVM: x86: fix pvclock guest stopped flag reporting kvm_guest_time_update unconditionally clears hv_clock.flags field, so the notification never reaches the guest. Fix it by allowing PVCLOCK_GUEST_STOPPED to passthrough. 
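The fix replaces a direct write into hv_clock.flags, which the next update wipes, with a request flag that the update path itself consumes. A minimal sketch of that latch-and-consume pattern follows (names invented; the flag value is the pvclock ABI bit, shown here only for illustration):

    #include <stdbool.h>
    #include <stdint.h>

    #define PVCLOCK_GUEST_STOPPED (1 << 1)  /* assumed ABI bit */

    struct vclock {
        uint8_t flags;                      /* rebuilt from scratch on every update */
        bool    guest_stopped_request;      /* latched until the next update */
    };

    static void request_guest_stopped(struct vclock *c)
    {
        c->guest_stopped_request = true;    /* survives any number of rebuilds */
    }

    static void clock_update(struct vclock *c)
    {
        uint8_t flags = 0;

        if (c->guest_stopped_request) {     /* consume the latched request */
            flags |= PVCLOCK_GUEST_STOPPED;
            c->guest_stopped_request = false;
        }
        c->flags = flags;                   /* the notification is no longer lost */
    }

The hunks below add exactly this: a pvclock_set_guest_stopped_request field in kvm_vcpu_arch, set by kvm_set_guest_paused() and consumed by kvm_guest_time_update().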
Reviewed-by: Eric B Munson Reviewed-by: Amit Shah Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1309e69..fc0e752 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -420,6 +420,8 @@ struct kvm_vcpu_arch { unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; + /* set guest stopped flag in pvclock flags field */ + bool pvclock_set_guest_stopped_request; struct { u64 msr_val; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 91a5958..fb0d937 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1134,6 +1134,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) unsigned long this_tsc_khz; s64 kernel_ns, max_kernel_ns; u64 tsc_timestamp; + u8 pvclock_flags; /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -1215,7 +1216,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; - vcpu->hv_clock.flags = 0; + + pvclock_flags = 0; + if (vcpu->pvclock_set_guest_stopped_request) { + pvclock_flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + + vcpu->hv_clock.flags = pvclock_flags; /* * The interface expects us to write an even number signaling that the @@ -2624,10 +2632,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, */ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) { - struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock; if (!vcpu->arch.time_page) return -EINVAL; - src->flags |= PVCLOCK_GUEST_STOPPED; + vcpu->arch.pvclock_set_guest_stopped_request = true; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return 0; } -- cgit v1.1 From e423ca155d3f5f16b46e30de9c818875b1fd617d Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Tue, 7 Aug 2012 13:10:13 +0530 Subject: KVM: Correct vmrun to vmcall typo Signed-off-by: Raghavendra K T Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_para.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 2f7712e..20f5697 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -116,7 +116,7 @@ static inline bool kvm_check_and_clear_guest_paused(void) */ #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" -/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun +/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the * instructions are guaranteed to be supported. * -- cgit v1.1 From 2a7921b7a033d5cfc176690d6297c82846c582b2 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 12 Aug 2012 16:12:29 +0300 Subject: KVM: VMX: restore MSR_IA32_DEBUGCTLMSR after VMEXIT MSR_IA32_DEBUGCTLMSR is zeroed on VMEXIT. Restore it to the correct value. 
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cc8ad98..d0f4bec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6222,6 +6222,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long debugctlmsr; if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -6261,6 +6262,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); atomic_switch_perf_msrs(vmx); + debugctlmsr = get_debugctlmsr(); vmx->__launched = vmx->loaded_vmcs->launched; asm( @@ -6362,6 +6364,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ + if (debugctlmsr) + update_debugctlmsr(debugctlmsr); + #ifndef CONFIG_X86_64 /* * The sysexit path does not restore ds/es, so we must set them to -- cgit v1.1 From dbcb4e798072d114fe68813f39a9efd239ab99c0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 13 Aug 2012 15:38:22 +0300 Subject: KVM: VMX: Advertize RDTSC exiting to nested guests All processors that support VMX have that feature, and guests (Xen) depend on it. As we already implement it, advertize it to the guest. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d0f4bec..13e0296 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1990,7 +1990,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) #endif CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | - CPU_BASED_RDPMC_EXITING | + CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the -- cgit v1.1 From f1c6300183dbf5b9da25988e13f6f25a9e27151b Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 8 Aug 2012 12:16:52 -0700 Subject: x86, apic: fix broken legacy interrupts in the logical apic mode Recent commit 332afa656e76458ee9cf0f0d123016a0658539e4 cleaned up a workaround that updates irq_cfg domain for legacy irq's that are handled by the IO-APIC. This was assuming that the recent changes in assign_irq_vector() were sufficient to remove the workaround. But this broke couple of AMD platforms. One of them seems to be sending interrupts to the offline cpu's, resulting in spurious "No irq handler for vector xx (irq -1)" messages when those cpu's come online. And the other platform seems to always send the interrupt to the last logical CPU (cpu-7). Recent changes had an unintended side effect of using only logical cpu-0 in the IO-APIC RTE (during boot for the legacy interrupts) and this broke the legacy interrupts not getting routed to the cpu-7 on the AMD platform, resulting in a boot hang. For now, reintroduce the removed workaround, (essentially not allowing the vector to change for legacy irq's when io-apic starts to handle the irq. Which also addressed the uninteded sife effect of just specifying cpu-0 in the IO-APIC RTE for those irq's during boot). 
Reported-and-tested-by: Robert Richter Reported-and-tested-by: Borislav Petkov Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1344453412.29170.5.camel@sbsiddha-desk.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a6c64aa..c265593 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1356,6 +1356,16 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (!IO_APIC_IRQ(irq)) return; + /* + * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC + * can handle this irq and the apic driver is finialized at this point, + * update the cfg->domain. + */ + if (irq < legacy_pic->nr_legacy_irqs && + cpumask_equal(cfg->domain, cpumask_of(0))) + apic->vector_allocation_domain(0, cfg->domain, + apic->target_cpus()); + if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; -- cgit v1.1 From f026cfa82f628db24b8cea41b9d6202af104cecb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 14 Aug 2012 09:53:38 -0700 Subject: Revert "x86-64/efi: Use EFI to deal with platform wall clock" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit bacef661acdb634170a8faddbc1cf28e8f8b9eee. This commit has been found to cause serious regressions on a number of ASUS machines at the least. We probably need to provide a 1:1 map in addition to the EFI virtual memory map in order for this to work. Signed-off-by: H. Peter Anvin Reported-and-bisected-by: Jérôme Carretero Cc: Jan Beulich Cc: Matt Fleming Cc: Matthew Garrett Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120805172903.5f8bb24c@zougloub.eu --- arch/x86/mm/pageattr.c | 10 ++++------ arch/x86/platform/efi/efi.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 931930a..a718e0d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -919,13 +919,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* * On success we use clflush, when the CPU supports it to - * avoid the wbindv. If the CPU does not support it, in the - * error case, and during early boot (for EFI) we fall back - * to cpa_flush_all (which uses wbinvd): + * avoid the wbindv. 
If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): */ - if (early_boot_irqs_disabled) - __cpa_flush_all((void *)(long)cache); - else if (!ret && cpu_has_clflush) { + if (!ret && cpu_has_clflush) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 2dc29f5..92660eda 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -234,7 +234,22 @@ static efi_status_t __init phys_efi_set_virtual_address_map( return status; } -static int efi_set_rtc_mmss(unsigned long nowtime) +static efi_status_t __init phys_efi_get_time(efi_time_t *tm, + efi_time_cap_t *tc) +{ + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&rtc_lock, flags); + efi_call_phys_prelog(); + status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm), + virt_to_phys(tc)); + efi_call_phys_epilog(); + spin_unlock_irqrestore(&rtc_lock, flags); + return status; +} + +int efi_set_rtc_mmss(unsigned long nowtime) { int real_seconds, real_minutes; efi_status_t status; @@ -263,7 +278,7 @@ static int efi_set_rtc_mmss(unsigned long nowtime) return 0; } -static unsigned long efi_get_time(void) +unsigned long efi_get_time(void) { efi_status_t status; efi_time_t eft; @@ -606,13 +621,18 @@ static int __init efi_runtime_init(void) } /* * We will only need *early* access to the following - * EFI runtime service before set_virtual_address_map + * two EFI runtime services before set_virtual_address_map * is invoked. */ + efi_phys.get_time = (efi_get_time_t *)runtime->get_time; efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) runtime->set_virtual_address_map; - + /* + * Make efi_get_time can be called before entering + * virtual mode. + */ + efi.get_time = phys_efi_get_time; early_iounmap(runtime, sizeof(efi_runtime_services_t)); return 0; @@ -700,10 +720,12 @@ void __init efi_init(void) efi_enabled = 0; return; } +#ifdef CONFIG_X86_32 if (efi_native) { x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; } +#endif #if EFI_DEBUG print_efi_memmap(); -- cgit v1.1 From 28a6fdabb3ea775d3d707afd9d2728b3ced2c34d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 14 Aug 2012 19:20:28 +0300 Subject: KVM: x86: drop parameter validation in ioapic/pic We validate irq pin number when routing is setup, so code handling illegal irq # in pic and ioapic on each injection is never called. Drop it, replace with BUG_ON to catch out of bounds access bugs. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8259.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index e498b18..90c84f9 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -190,17 +190,17 @@ void kvm_pic_update_irq(struct kvm_pic *s) int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) { - int ret = -1; + int ret, irq_level; + + BUG_ON(irq < 0 || irq >= PIC_NUM_PINS); pic_lock(s); - if (irq >= 0 && irq < PIC_NUM_PINS) { - int irq_level = __kvm_irq_line_state(&s->irq_states[irq], - irq_source_id, level); - ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); - pic_update_irq(s); - trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, - s->pics[irq >> 3].imr, ret == 0); - } + irq_level = __kvm_irq_line_state(&s->irq_states[irq], + irq_source_id, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); + pic_update_irq(s); + trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, + s->pics[irq >> 3].imr, ret == 0); pic_unlock(s); return ret; -- cgit v1.1 From 8fbe6a541f50eeec5e3e49bd92db23ade9496673 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 15 Aug 2012 16:00:40 +0200 Subject: KVM guest: disable stealtime on reboot to avoid mem corruption else, host continues to update stealtime after reboot, which can corrupt e.g. initramfs area. found when tracking down initramfs unpack error on initial reboot (with qemu-kvm -smp 2, no problem with single-core). Signed-off-by: Florian Westphal Signed-off-by: Marcelo Tosatti --- arch/x86/kernel/kvm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c1d61ee..1596cc8 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -354,6 +354,7 @@ static void kvm_pv_guest_cpu_reboot(void *unused) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) wrmsrl(MSR_KVM_PV_EOI_EN, 0); kvm_pv_disable_apf(); + kvm_disable_steal_time(); } static int kvm_pv_reboot_notify(struct notifier_block *nb, -- cgit v1.1 From ca08649eb5dd30f11a5a8fe8659b48899b7ea6a1 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 16 Aug 2012 11:31:27 -0400 Subject: Revert "xen PVonHVM: move shared_info to MMIO before kexec" This reverts commit 00e37bdb0113a98408de42db85be002f21dbffd3. During shutdown of PVHVM guests with more than 2VCPUs on certain machines we can hit the race where the replaced shared_info is not replaced fast enough and the PV time clock retries reading the same area over and over without any any success and is stuck in an infinite loop. Acked-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 118 +++++------------------------------------------ arch/x86/xen/suspend.c | 2 +- arch/x86/xen/xen-ops.h | 2 +- 3 files changed, 13 insertions(+), 109 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a6f8acb..f1814fc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include @@ -1472,130 +1471,38 @@ asmlinkage void __init xen_start_kernel(void) #endif } -#ifdef CONFIG_XEN_PVHVM -/* - * The pfn containing the shared_info is located somewhere in RAM. This - * will cause trouble if the current kernel is doing a kexec boot into a - * new kernel. 
The new kernel (and its startup code) can not know where - * the pfn is, so it can not reserve the page. The hypervisor will - * continue to update the pfn, and as a result memory corruption occours - * in the new kernel. - * - * One way to work around this issue is to allocate a page in the - * xen-platform pci device's BAR memory range. But pci init is done very - * late and the shared_info page is already in use very early to read - * the pvclock. So moving the pfn from RAM to MMIO is racy because some - * code paths on other vcpus could access the pfn during the small - * window when the old pfn is moved to the new pfn. There is even a - * small window were the old pfn is not backed by a mfn, and during that - * time all reads return -1. - * - * Because it is not known upfront where the MMIO region is located it - * can not be used right from the start in xen_hvm_init_shared_info. - * - * To minimise trouble the move of the pfn is done shortly before kexec. - * This does not eliminate the race because all vcpus are still online - * when the syscore_ops will be called. But hopefully there is no work - * pending at this point in time. Also the syscore_op is run last which - * reduces the risk further. - */ - -static struct shared_info *xen_hvm_shared_info; - -static void xen_hvm_connect_shared_info(unsigned long pfn) +void __ref xen_hvm_init_shared_info(void) { + int cpu; struct xen_add_to_physmap xatp; + static struct shared_info *shared_info_page = 0; + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = pfn; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); -} -static void xen_hvm_set_shared_info(struct shared_info *sip) -{ - int cpu; - - HYPERVISOR_shared_info = sip; + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info * page, we use it in the event channel upcall and in some pvclock * related functions. We don't need the vcpu_info placement * optimizations because we don't use any pv_mmu or pv_irq op on * HVM. - * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_set_shared_info is run at resume time too and + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and * in that case multiple vcpus might be online. 
*/ for_each_online_cpu(cpu) { per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; } } -/* Reconnect the shared_info pfn to a mfn */ -void xen_hvm_resume_shared_info(void) -{ - xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); -} - -#ifdef CONFIG_KEXEC -static struct shared_info *xen_hvm_shared_info_kexec; -static unsigned long xen_hvm_shared_info_pfn_kexec; - -/* Remember a pfn in MMIO space for kexec reboot */ -void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn) -{ - xen_hvm_shared_info_kexec = sip; - xen_hvm_shared_info_pfn_kexec = pfn; -} - -static void xen_hvm_syscore_shutdown(void) -{ - struct xen_memory_reservation reservation = { - .domid = DOMID_SELF, - .nr_extents = 1, - }; - unsigned long prev_pfn; - int rc; - - if (!xen_hvm_shared_info_kexec) - return; - - prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT; - set_xen_guest_handle(reservation.extent_start, &prev_pfn); - - /* Move pfn to MMIO, disconnects previous pfn from mfn */ - xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec); - - /* Update pointers, following hypercall is also a memory barrier */ - xen_hvm_set_shared_info(xen_hvm_shared_info_kexec); - - /* Allocate new mfn for previous pfn */ - do { - rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc == 0) - msleep(123); - } while (rc == 0); - - /* Make sure the previous pfn is really connected to a (new) mfn */ - BUG_ON(rc != 1); -} - -static struct syscore_ops xen_hvm_syscore_ops = { - .shutdown = xen_hvm_syscore_shutdown, -}; -#endif - -/* Use a pfn in RAM, may move to MMIO before kexec. */ -static void __init xen_hvm_init_shared_info(void) -{ - /* Remember pointer for resume */ - xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE); - xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); - xen_hvm_set_shared_info(xen_hvm_shared_info); -} - +#ifdef CONFIG_XEN_PVHVM static void __init init_hvm_pv_info(void) { int major, minor; @@ -1646,9 +1553,6 @@ static void __init xen_hvm_guest_init(void) init_hvm_pv_info(); xen_hvm_init_shared_info(); -#ifdef CONFIG_KEXEC - register_syscore_ops(&xen_hvm_syscore_ops); -#endif if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index ae8a00c..45329c8 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_resume_shared_info(); + xen_hvm_init_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1e4329e..202d4c1 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -41,7 +41,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); -void xen_hvm_resume_shared_info(void); +void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); -- cgit v1.1 From 250a41e0ecc433cdd553a364d0fc74c766425209 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 17 Aug 2012 09:27:35 -0400 Subject: xen/p2m: Reuse existing P2M leafs if they are filled with 1:1 PFNs or INVALID. If P2M leaf is completly packed with INVALID_P2M_ENTRY or with 1:1 PFNs (so IDENTITY_FRAME type PFNs), we can swap the P2M leaf with either a p2m_missing or p2m_identity respectively. 
The old page (which was created via extend_brk or was grafted on from the mfn_list) can be re-used for setting new PFNs. This also means we can remove git commit: 5bc6f9888db5739abfa0cae279b4b442e4db8049 xen/p2m: Reserve 8MB of _brk space for P2M leafs when populating back which tried to fix this. and make the amount that is required to be reserved much smaller. CC: stable@vger.kernel.org # for 3.5 only. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b2e91d4..d4b2554 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -196,9 +196,11 @@ RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); /* When we populate back during bootup, the amount of pages can vary. The * max we have is seen is 395979, but that does not mean it can't be more. - * But some machines can have 3GB I/O holes even. So lets reserve enough - * for 4GB of I/O and E820 holes. */ -RESERVE_BRK(p2m_populated, PMD_SIZE * 4); + * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle + * it can re-use Xen provided mfn_list array, so we only need to allocate at + * most three P2M top nodes. */ +RESERVE_BRK(p2m_populated, PAGE_SIZE * 3); + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -575,12 +577,99 @@ static bool __init early_alloc_p2m(unsigned long pfn) } return true; } + +/* + * Skim over the P2M tree looking at pages that are either filled with + * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and + * replace the P2M leaf with a p2m_missing or p2m_identity. + * Stick the old page in the new P2M tree location. + */ +bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn) +{ + unsigned topidx; + unsigned mididx; + unsigned ident_pfns; + unsigned inv_pfns; + unsigned long *p2m; + unsigned long *mid_mfn_p; + unsigned idx; + unsigned long pfn; + + /* We only look when this entails a P2M middle layer */ + if (p2m_index(set_pfn)) + return false; + + for (pfn = 0; pfn <= MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { + topidx = p2m_top_index(pfn); + + if (!p2m_top[topidx]) + continue; + + if (p2m_top[topidx] == p2m_mid_missing) + continue; + + mididx = p2m_mid_index(pfn); + p2m = p2m_top[topidx][mididx]; + if (!p2m) + continue; + + if ((p2m == p2m_missing) || (p2m == p2m_identity)) + continue; + + if ((unsigned long)p2m == INVALID_P2M_ENTRY) + continue; + + ident_pfns = 0; + inv_pfns = 0; + for (idx = 0; idx < P2M_PER_PAGE; idx++) { + /* IDENTITY_PFNs are 1:1 */ + if (p2m[idx] == IDENTITY_FRAME(pfn + idx)) + ident_pfns++; + else if (p2m[idx] == INVALID_P2M_ENTRY) + inv_pfns++; + else + break; + } + if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE)) + goto found; + } + return false; +found: + /* Found one, replace old with p2m_identity or p2m_missing */ + p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); + /* And the other for save/restore.. */ + mid_mfn_p = p2m_top_mfn_p[topidx]; + /* NOTE: Even if it is a p2m_identity it should still be point to + * a page filled with INVALID_P2M_ENTRY entries. */ + mid_mfn_p[mididx] = virt_to_mfn(p2m_missing); + + /* Reset where we want to stick the old page in. 
*/ + topidx = p2m_top_index(set_pfn); + mididx = p2m_mid_index(set_pfn); + + /* This shouldn't happen */ + if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) + early_alloc_p2m(set_pfn); + + if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) + return false; + + p2m_init(p2m); + p2m_top[topidx][mididx] = p2m; + mid_mfn_p = p2m_top_mfn_p[topidx]; + mid_mfn_p[mididx] = virt_to_mfn(p2m); + + return true; +} bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!early_alloc_p2m(pfn)) return false; + if (early_can_reuse_p2m_middle(pfn, mfn)) + return __set_phys_to_machine(pfn, mfn); + if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) return false; -- cgit v1.1 From 515c7af85ed92696c311c53d53cb4898ff32d784 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sat, 18 Aug 2012 16:11:37 -0400 Subject: x32: Use compat shims for {g,s}etsockopt Some of the arguments to {g,s}etsockopt are passed in userland pointers. If we try to use the 64bit entry point, we end up sometimes failing. For example, dhcpcd doesn't run in x32: # dhcpcd eth0 dhcpcd[1979]: version 5.5.6 starting dhcpcd[1979]: eth0: broadcasting for a lease dhcpcd[1979]: eth0: open_socket: Invalid argument dhcpcd[1979]: eth0: send_raw_packet: Bad file descriptor The code in particular is getting back EINVAL when doing: struct sock_fprog pf; setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &pf, sizeof(pf)); Diving into the kernel code, we can see: include/linux/filter.h: struct sock_fprog { unsigned short len; struct sock_filter __user *filter; }; net/core/sock.c: case SO_ATTACH_FILTER: ret = -EINVAL; if (optlen == sizeof(struct sock_fprog)) { struct sock_fprog fprog; ret = -EFAULT; if (copy_from_user(&fprog, optval, sizeof(fprog))) break; ret = sk_attach_filter(&fprog, sk); } break; arch/x86/syscalls/syscall_64.tbl: 54 common setsockopt sys_setsockopt 55 common getsockopt sys_getsockopt So for x64, sizeof(sock_fprog) is 16 bytes. For x86/x32, it's 8 bytes. This comes down to the pointer being 32bit for x32, which means we need to do structure size translation. But since x32 comes in directly to sys_setsockopt, it doesn't get translated like x86. After changing the syscall table and rebuilding glibc with the new kernel headers, dhcp runs fine in an x32 userland. Oddly, it seems like Linus noted the same thing during the initial port, but I guess that was missed/lost along the way: https://lkml.org/lkml/2011/8/26/452 [ hpa: tagging for -stable since this is an ABI fix. ] Bugzilla: https://bugs.gentoo.org/423649 Reported-by: Mads Signed-off-by: Mike Frysinger Link: http://lkml.kernel.org/r/1345320697-15713-1-git-send-email-vapier@gentoo.org Cc: H. J. Lu Cc: v3.4..v3.5 Signed-off-by: H. 
Peter Anvin --- arch/x86/syscalls/syscall_64.tbl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 29aed7a..a582bfe 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -60,8 +60,8 @@ 51 common getsockname sys_getsockname 52 common getpeername sys_getpeername 53 common socketpair sys_socketpair -54 common setsockopt sys_setsockopt -55 common getsockopt sys_getsockopt +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt 56 common clone stub_clone 57 common fork stub_fork 58 common vfork stub_vfork @@ -353,3 +353,5 @@ 538 x32 sendmmsg compat_sys_sendmmsg 539 x32 process_vm_readv compat_sys_process_vm_readv 540 x32 process_vm_writev compat_sys_process_vm_writev +541 x32 setsockopt compat_sys_setsockopt +542 x32 getsockopt compat_sys_getsockopt -- cgit v1.1 From 023af608254add7ba037cd634cc5f2fb21ff6420 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sun, 22 Jul 2012 18:18:37 +0300 Subject: crypto: aesni_intel - improve lrw and xts performance by utilizing parallel AES-NI hardware pipelines Use parallel LRW and XTS encryption facilities to better utilize AES-NI hardware pipelines and gain extra performance. Tcrypt benchmark results (async), old vs new ratios: Intel Core i5-2450M CPU (fam: 6, model: 42, step: 7) aes:128bit lrw:256bit xts:256bit size lrw-enc lrw-dec xts-dec xts-dec 16B 0.99x 1.00x 1.22x 1.19x 64B 1.38x 1.50x 1.58x 1.61x 256B 2.04x 2.02x 2.27x 2.29x 1024B 2.56x 2.54x 2.89x 2.92x 8192B 2.85x 2.99x 3.40x 3.23x aes:192bit lrw:320bit xts:384bit size lrw-enc lrw-dec xts-dec xts-dec 16B 1.08x 1.08x 1.16x 1.17x 64B 1.48x 1.54x 1.59x 1.65x 256B 2.18x 2.17x 2.29x 2.28x 1024B 2.67x 2.67x 2.87x 3.05x 8192B 2.93x 2.84x 3.28x 3.33x aes:256bit lrw:348bit xts:512bit size lrw-enc lrw-dec xts-dec xts-dec 16B 1.07x 1.07x 1.18x 1.19x 64B 1.56x 1.56x 1.70x 1.71x 256B 2.22x 2.24x 2.46x 2.46x 1024B 2.76x 2.77x 3.13x 3.05x 8192B 2.99x 3.05x 3.40x 3.30x Cc: Huang Ying Signed-off-by: Jussi Kivilinna Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 253 ++++++++++++++++++++++++++++++++----- 1 file changed, 218 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 648347a..7c04d0d 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -28,6 +28,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -41,18 +44,10 @@ #define HAS_CTR #endif -#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE) -#define HAS_LRW -#endif - #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) #define HAS_PCBC #endif -#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE) -#define HAS_XTS -#endif - /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. 
@@ -79,6 +74,16 @@ struct aesni_hash_subkey_req_data { #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) #define RFC4106_HASH_SUBKEY_SIZE 16 +struct aesni_lrw_ctx { + struct lrw_table_ctx lrw_table; + u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; +}; + +struct aesni_xts_ctx { + u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; + u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; +}; + asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len); asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out, @@ -398,13 +403,6 @@ static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm) #endif #endif -#ifdef HAS_LRW -static int ablk_lrw_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "fpu(lrw(__driver-aes-aesni))"); -} -#endif - #ifdef HAS_PCBC static int ablk_pcbc_init(struct crypto_tfm *tfm) { @@ -412,12 +410,160 @@ static int ablk_pcbc_init(struct crypto_tfm *tfm) } #endif -#ifdef HAS_XTS -static int ablk_xts_init(struct crypto_tfm *tfm) +static void lrw_xts_encrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) { - return ablk_init_common(tfm, "fpu(xts(__driver-aes-aesni))"); + aesni_ecb_enc(ctx, blks, blks, nbytes); +} + +static void lrw_xts_decrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) +{ + aesni_ecb_dec(ctx, blks, blks, nbytes); +} + +static int lrw_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = aes_set_key_common(tfm, ctx->raw_aes_ctx, key, + keylen - AES_BLOCK_SIZE); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, key + keylen - AES_BLOCK_SIZE); +} + +static void lrw_aesni_exit_tfm(struct crypto_tfm *tfm) +{ + struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), + .crypt_fn = lrw_xts_encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = lrw_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), + .crypt_fn = lrw_xts_decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = lrw_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; +} + +static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2); + if (err) + return err; + + /* second half of xts-key 
is for tweak */ + return aes_set_key_common(tfm, ctx->raw_tweak_ctx, key + keylen / 2, + keylen / 2); +} + + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), + .tweak_fn = XTS_TWEAK_CAST(aesni_enc), + .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), + .crypt_fn = lrw_xts_encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = xts_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), + .tweak_fn = XTS_TWEAK_CAST(aesni_enc), + .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), + .crypt_fn = lrw_xts_decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = xts_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; } -#endif #ifdef CONFIG_X86_64 static int rfc4106_init(struct crypto_tfm *tfm) @@ -1035,10 +1181,10 @@ static struct crypto_alg aesni_algs[] = { { }, #endif #endif -#ifdef HAS_LRW +#ifdef HAS_PCBC }, { - .cra_name = "lrw(aes)", - .cra_driver_name = "lrw-aes-aesni", + .cra_name = "pcbc(aes)", + .cra_driver_name = "pcbc-aes-aesni", .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, @@ -1046,12 +1192,12 @@ static struct crypto_alg aesni_algs[] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_init = ablk_lrw_init, + .cra_init = ablk_pcbc_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, .setkey = ablk_set_key, .encrypt = ablk_encrypt, @@ -1059,10 +1205,50 @@ static struct crypto_alg aesni_algs[] = { { }, }, #endif -#ifdef HAS_PCBC }, { - .cra_name = "pcbc(aes)", - .cra_driver_name = "pcbc-aes-aesni", + .cra_name = "__lrw-aes-aesni", + .cra_driver_name = "__driver-lrw-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct aesni_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_exit = lrw_aesni_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = lrw_aesni_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-aes-aesni", + .cra_driver_name = "__driver-xts-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct aesni_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = 
xts_aesni_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "lrw(aes)", + .cra_driver_name = "lrw-aes-aesni", .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, @@ -1070,20 +1256,18 @@ static struct crypto_alg aesni_algs[] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_init = ablk_pcbc_init, + .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, + .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, .ivsize = AES_BLOCK_SIZE, .setkey = ablk_set_key, .encrypt = ablk_encrypt, .decrypt = ablk_decrypt, }, }, -#endif -#ifdef HAS_XTS }, { .cra_name = "xts(aes)", .cra_driver_name = "xts-aes-aesni", @@ -1094,7 +1278,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_init = ablk_xts_init, + .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { @@ -1106,7 +1290,6 @@ static struct crypto_alg aesni_algs[] = { { .decrypt = ablk_decrypt, }, }, -#endif } }; -- cgit v1.1 From 0570a365a6b8ccbfe7baa459de2b7396ddf2de90 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sun, 19 Aug 2012 19:06:43 +0200 Subject: x86, boot: Remove obsolete and unused constant RAMDISK The named constant RAMDISK is unused. It used to set the (obsolete) kernel boot header field ram_size, but its usage for that purpose got dropped in commit 5e47c478b0b69bc9bc3ba544e4b1ca3268f98fef ("x86: remove zImage support"). Now remove this constant too. Signed-off-by: Paul Bolle Link: http://lkml.kernel.org/r/1345396003.1771.9.camel@x61.thuisdomein Signed-off-by: H. Peter Anvin --- arch/x86/boot/header.S | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b4e15dd..2a01744 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -32,10 +32,6 @@ SYSSEG = 0x1000 /* historical load address >> 4 */ #define SVGA_MODE ASK_VGA #endif -#ifndef RAMDISK -#define RAMDISK 0 -#endif - #ifndef ROOT_RDONLY #define ROOT_RDONLY 1 #endif -- cgit v1.1 From a3118beb6a8cbe77ae3342125d920205871b0717 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 28 Jun 2012 22:12:36 -0400 Subject: xen/p2m: Fix the comment describing the P2M tree. It mixed up the p2m_mid_missing with p2m_missing. Also remove some extra spaces. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 64effdc6..e4adbfb 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -22,7 +22,7 @@ * * P2M_PER_PAGE depends on the architecture, as a mfn is always * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to - * 512 and 1024 entries respectively. + * 512 and 1024 entries respectively. * * In short, these structures contain the Machine Frame Number (MFN) of the PFN. * @@ -139,11 +139,11 @@ * / | ~0, ~0, .... 
| * | \---------------/ * | - * p2m_missing p2m_missing - * /------------------\ /------------\ - * | [p2m_mid_missing]+---->| ~0, ~0, ~0 | - * | [p2m_mid_missing]+---->| ..., ~0 | - * \------------------/ \------------/ + * p2m_mid_missing p2m_missing + * /-----------------\ /------------\ + * | [p2m_missing] +---->| ~0, ~0, ~0 | + * | [p2m_missing] +---->| ..., ~0 | + * \-----------------/ \------------/ * * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ @@ -423,7 +423,7 @@ static void free_p2m_page(void *p) free_page((unsigned long)p); } -/* +/* * Fully allocate the p2m structure for a given pfn. We need to check * that both the top and mid levels are allocated, and make sure the * parallel mfn tree is kept in sync. We may race with other cpus, so -- cgit v1.1 From 59b294403e9814e7c1154043567f0d71bac7a511 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 19 Jul 2012 10:23:47 -0400 Subject: xen/x86: Use memblock_reserve for sensitive areas. instead of a big memblock_reserve. This way we can be more selective in freeing regions (and it also makes it easier to understand where is what). [v1: Move the auto_translate_physmap to proper line] [v2: Per Stefano suggestion add more comments] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/p2m.c | 5 +++++ arch/x86/xen/setup.c | 9 --------- 3 files changed, 53 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ff962d4..e532eb5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -998,7 +998,54 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } +/* + * If the MFN is not in the m2p (provided to us by the hypervisor) this + * function won't do anything. In practice this means that the XenBus + * MFN won't be available for the initial domain. */ +static void __init xen_reserve_mfn(unsigned long mfn) +{ + unsigned long pfn; + + if (!mfn) + return; + pfn = mfn_to_pfn(mfn); + if (phys_to_machine_mapping_valid(pfn)) + memblock_reserve(PFN_PHYS(pfn), PAGE_SIZE); +} +static void __init xen_reserve_internals(void) +{ + unsigned long size; + + if (!xen_pv_domain()) + return; + + /* xen_start_info does not exist in the M2P, hence can't use + * xen_reserve_mfn. */ + memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + + xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info)); + xen_reserve_mfn(xen_start_info->store_mfn); + if (!xen_initial_domain()) + xen_reserve_mfn(xen_start_info->console.domU.mfn); + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + /* + * ALIGN up to compensate for the p2m_page pointing to an array that + * can partially filled (look in xen_build_dynamic_phys_to_machine). + */ + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + /* We could use xen_reserve_mfn here, but would end up looping quite + * a lot (and call memblock_reserve for each PAGE), so lets just use + * the easy way and reserve it wholesale. 
*/ + memblock_reserve(__pa(xen_start_info->mfn_list), size); + + /* The pagetables are reserved in mmu.c */ +} void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1362,6 +1409,7 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + xen_reserve_internals(); /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index e4adbfb..6a2bfa4 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -388,6 +388,11 @@ void __init xen_build_dynamic_phys_to_machine(void) } m2p_override_init(); + + /* NOTE: We cannot call memblock_reserve here for the mfn_list as there + * isn't enough pieces to make it work (for one - we are still using the + * Xen provided pagetable). Do it later in xen_reserve_internals. + */ } unsigned long get_phys_to_machine(unsigned long pfn) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a4790bf..9efca75 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -424,15 +424,6 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); - /* - * Reserve Xen bits: - * - mfn_list - * - xen_start_info - * See comment above "struct start_info" in - */ - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); return "Xen"; -- cgit v1.1 From 806c312e50f122c47913145cf884f53dd09d9199 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 21 Aug 2012 14:31:24 -0400 Subject: xen/x86: Workaround 64-bit hypervisor and 32-bit initial domain. If a 64-bit hypervisor is booted with a 32-bit initial domain, the hypervisor deals with the initial domain as "compat" and does some extra adjustments (like pagetables are 4 bytes instead of 8). It also adjusts the xen_start_info->pt_base incorrectly. When booted with a 32-bit hypervisor (32-bit initial domain): .. (XEN) Start info: cf831000->cf83147c (XEN) Page tables: cf832000->cf8b5000 .. [ 0.000000] PT: cf832000 (f832000) [ 0.000000] Reserving PT: f832000->f8b5000 And with a 64-bit hypervisor: (XEN) Start info: 00000000cf831000->00000000cf8314b4 (XEN) Page tables: 00000000cf832000->00000000cf8b6000 [ 0.000000] PT: cf834000 (f834000) [ 0.000000] Reserving PT: f834000->f8b8000 To deal with this, we keep keep track of the highest physical address we have reserved via memblock_reserve. If that address does not overlap with pt_base, we have a gap which we reserve. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e532eb5..511f92d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1002,19 +1002,24 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) * If the MFN is not in the m2p (provided to us by the hypervisor) this * function won't do anything. In practice this means that the XenBus * MFN won't be available for the initial domain. 
*/ -static void __init xen_reserve_mfn(unsigned long mfn) +static unsigned long __init xen_reserve_mfn(unsigned long mfn) { - unsigned long pfn; + unsigned long pfn, end_pfn = 0; if (!mfn) - return; + return end_pfn; + pfn = mfn_to_pfn(mfn); - if (phys_to_machine_mapping_valid(pfn)) - memblock_reserve(PFN_PHYS(pfn), PAGE_SIZE); + if (phys_to_machine_mapping_valid(pfn)) { + end_pfn = PFN_PHYS(pfn) + PAGE_SIZE; + memblock_reserve(PFN_PHYS(pfn), end_pfn); + } + return end_pfn; } static void __init xen_reserve_internals(void) { unsigned long size; + unsigned long last_phys = 0; if (!xen_pv_domain()) return; @@ -1022,12 +1027,13 @@ static void __init xen_reserve_internals(void) /* xen_start_info does not exist in the M2P, hence can't use * xen_reserve_mfn. */ memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + last_phys = __pa(xen_start_info) + PAGE_SIZE; - xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info)); - xen_reserve_mfn(xen_start_info->store_mfn); + last_phys = max(xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info)), last_phys); + last_phys = max(xen_reserve_mfn(xen_start_info->store_mfn), last_phys); if (!xen_initial_domain()) - xen_reserve_mfn(xen_start_info->console.domU.mfn); + last_phys = max(xen_reserve_mfn(xen_start_info->console.domU.mfn), last_phys); if (xen_feature(XENFEAT_auto_translated_physmap)) return; @@ -1043,8 +1049,14 @@ static void __init xen_reserve_internals(void) * a lot (and call memblock_reserve for each PAGE), so lets just use * the easy way and reserve it wholesale. */ memblock_reserve(__pa(xen_start_info->mfn_list), size); - + last_phys = max(__pa(xen_start_info->mfn_list) + size, last_phys); /* The pagetables are reserved in mmu.c */ + + /* Under 64-bit hypervisor with a 32-bit domain, the hypervisor + * offsets the pt_base by two pages. Hence the reservation that is done + * in mmu.c misses two pages. We correct it here if we detect this. */ + if (last_phys < __pa(xen_start_info->pt_base)) + memblock_reserve(last_phys, __pa(xen_start_info->pt_base) - last_phys); } void xen_setup_shared_info(void) { -- cgit v1.1 From 988f0e24bbcbbf550dff016faf8273a94f4eb1af Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 27 Jul 2012 20:10:58 -0400 Subject: xen/swiotlb: Simplify the logic. Its pretty easy: 1). We only check to see if we need Xen SWIOTLB for PV guests. 2). If swiotlb=force or iommu=soft is set, then Xen SWIOTLB will be enabled. 3). If it is an initial domain, then Xen SWIOTLB will be enabled. 4). Native SWIOTLB must be disabled for PV guests. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/pci-swiotlb-xen.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 967633a..b6a5340 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -34,19 +34,20 @@ static struct dma_map_ops xen_swiotlb_dma_ops = { int __init pci_xen_swiotlb_detect(void) { + if (!xen_pv_domain()) + return 0; + /* If running as PV guest, either iommu=soft, or swiotlb=force will * activate this IOMMU. If running as PV privileged, activate it * irregardless. */ - if ((xen_initial_domain() || swiotlb || swiotlb_force) && - (xen_pv_domain())) + if ((xen_initial_domain() || swiotlb || swiotlb_force)) xen_swiotlb = 1; /* If we are running under Xen, we MUST disable the native SWIOTLB. * Don't worry about swiotlb_force flag activating the native, as * the 'swiotlb' flag is the only one turning it on. 
*/ - if (xen_pv_domain()) - swiotlb = 0; + swiotlb = 0; return xen_swiotlb; } -- cgit v1.1 From fc2341df9e31be8a3940f4e302372d7ef46bab8c Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 27 Jul 2012 20:16:00 -0400 Subject: xen/swiotlb: With more than 4GB on 64-bit, disable the native SWIOTLB. If a PV guest is booted the native SWIOTLB should not be turned on. It does not help us (we don't have any PCI devices) and it eats 64MB of good memory. In the case of PV guests with PCI devices we need the Xen-SWIOTLB one. [v1: Rewrite comment per Stefano's suggestion] Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/pci-swiotlb-xen.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index b6a5340..1c17227 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -8,6 +8,11 @@ #include #include +#ifdef CONFIG_X86_64 +#include +#include +#endif + int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { @@ -49,6 +54,15 @@ int __init pci_xen_swiotlb_detect(void) * the 'swiotlb' flag is the only one turning it on. */ swiotlb = 0; +#ifdef CONFIG_X86_64 + /* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0 + * (so no iommu=X command line over-writes). + * Considering that PV guests do not want the *native SWIOTLB* but + * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here. + */ + if (max_pfn > MAX_DMA32_PFN) + no_iommu = 1; +#endif return xen_swiotlb; } -- cgit v1.1 From 4d9310e39728a87c86eb48492da7546f61189633 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 6 Aug 2012 15:27:09 +0100 Subject: xen: missing includes Changes in v2: - remove pvclock hack; - remove include linux/types.h from xen/interface/xen.h. v3: - Compile under IA64 Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/interface.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index cbf0c9d..a93db16 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -121,6 +121,8 @@ struct arch_shared_info { #include "interface_64.h" #endif +#include + #ifndef __ASSEMBLY__ /* * The following is all CPU context. Note that the fpu_ctxt block is filled -- cgit v1.1 From b8b0f559c7b1dcf5503817e518c81c9a18ee45e0 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 21 Aug 2012 14:49:34 -0400 Subject: xen/apic/xenbus/swiotlb/pcifront/grant/tmem: Make functions or variables static. There is no need for those functions/variables to be visible. Make them static and also fix the compile warnings of this sort: drivers/xen/.c: warning: symbol '' was not declared. Should it be static? Some of them just require including the header file that declares the functions. 
Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/apic.c | 3 ++- arch/x86/xen/enlighten.c | 2 ++ arch/x86/xen/pci-swiotlb-xen.c | 1 + arch/x86/xen/platform-pci-unplug.c | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index ec57bd3..7005ced 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -6,8 +6,9 @@ #include #include +#include "xen-ops.h" -unsigned int xen_io_apic_read(unsigned apic, unsigned reg) +static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) { struct physdev_apic apic_op; int ret; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ff962d4..cb1b191 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -79,6 +79,8 @@ #include "smp.h" #include "multicalls.h" +#include + EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 967633a..2d58b3f 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -8,6 +8,7 @@ #include #include +#include int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index ffcf261..0a78524 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -24,6 +24,7 @@ #include #include +#include "xen-ops.h" #define XEN_PLATFORM_ERR_MAGIC -1 #define XEN_PLATFORM_ERR_PROTOCOL -2 -- cgit v1.1 From ece3234a77ebcd5bbeea6b829c9798328d290cae Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 13 Aug 2012 22:23:33 +0200 Subject: x86: dt: Use linear irq domain for ioapic(s) The former conversion to irq_domain_add_legacy() did not fully work since we miss the irq decs for NR_IRQS_LEGACY+. Ideally we could use irq_domain_add_simple() or the no-map variant (and program the virq <-> line mapping directly into ioapic) but this would require a different irq lookup in "do_IRQ()" and won't work with ACPI without changes. So this is probably easiest for everyone. Tested-by: Thierry Reding Signed-off-by: Sebastian Andrzej Siewior Cc: Grant Likely Link: http://lkml.kernel.org/r/20120813202304.GA3529@breakpoint.cc Signed-off-by: Thomas Gleixner --- arch/x86/kernel/devicetree.c | 51 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 3ae2ced..b158152 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -342,6 +342,47 @@ const struct irq_domain_ops ioapic_irq_domain_ops = { .xlate = ioapic_xlate, }; +static void dt_add_ioapic_domain(unsigned int ioapic_num, + struct device_node *np) +{ + struct irq_domain *id; + struct mp_ioapic_gsi *gsi_cfg; + int ret; + int num; + + gsi_cfg = mp_ioapic_gsi_routing(ioapic_num); + num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; + + id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops, + (void *)ioapic_num); + BUG_ON(!id); + if (gsi_cfg->gsi_base == 0) { + /* + * The first NR_IRQS_LEGACY irq descs are allocated in + * early_irq_init() and need just a mapping. The + * remaining irqs need both. All of them are preallocated + * and assigned so we can keep the 1:1 mapping which the ioapic + * is having. 
+ */ + ret = irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY); + if (ret) + pr_err("Error mapping legacy IRQs: %d\n", ret); + + if (num > NR_IRQS_LEGACY) { + ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY, + NR_IRQS_LEGACY, num - NR_IRQS_LEGACY); + if (ret) + pr_err("Error creating mapping for the " + "remaining IRQs: %d\n", ret); + } + irq_set_default_host(id); + } else { + ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num); + if (ret) + pr_err("Error creating IRQ mapping: %d\n", ret); + } +} + static void __init ioapic_add_ofnode(struct device_node *np) { struct resource r; @@ -356,15 +397,7 @@ static void __init ioapic_add_ofnode(struct device_node *np) for (i = 0; i < nr_ioapics; i++) { if (r.start == mpc_ioapic_addr(i)) { - struct irq_domain *id; - struct mp_ioapic_gsi *gsi_cfg; - - gsi_cfg = mp_ioapic_gsi_routing(i); - - id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0, - &ioapic_irq_domain_ops, - (void*)i); - BUG_ON(!id); + dt_add_ioapic_domain(i, np); return; } } -- cgit v1.1 From eb48c071464757414538c68a6033c8f8c15196f8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Aug 2012 16:15:52 -0700 Subject: mm: hugetlbfs: correctly populate shared pmd Each page mapped in a process's address space must be correctly accounted for in _mapcount. Normally the rules for this are straightforward but hugetlbfs page table sharing is different. The page table pages at the PMD level are reference counted while the mapcount remains the same. If this accounting is wrong, it causes bugs like this one reported by Larry Woodman: kernel BUG at mm/filemap.c:135! invalid opcode: 0000 [#1] SMP CPU 22 Modules linked in: bridge stp llc sunrpc binfmt_misc dcdbas microcode pcspkr acpi_pad acpi] Pid: 18001, comm: mpitest Tainted: G W 3.3.0+ #4 Dell Inc. PowerEdge R620/07NDJ2 RIP: 0010:[] [] __delete_from_page_cache+0x15d/0x170 Process mpitest (pid: 18001, threadinfo ffff880428972000, task ffff880428b5cc20) Call Trace: delete_from_page_cache+0x40/0x80 truncate_hugepages+0x115/0x1f0 hugetlbfs_evict_inode+0x18/0x30 evict+0x9f/0x1b0 iput_final+0xe3/0x1e0 iput+0x3e/0x50 d_kill+0xf8/0x110 dput+0xe2/0x1b0 __fput+0x162/0x240 During fork(), copy_hugetlb_page_range() detects if huge_pte_alloc() shared page tables with the check dst_pte == src_pte. The logic is if the PMD page is the same, they must be shared. This assumes that the sharing is between the parent and child. However, if the sharing is with a different process entirely then this check fails as in this diagram: parent | ------------>pmd src_pte----------> data page ^ other--------->pmd--------------------| ^ child-----------| dst_pte For this situation to occur, it must be possible for Parent and Other to have faulted and failed to share page tables with each other. This is possible due to the following style of race. PROC A PROC B copy_hugetlb_page_range copy_hugetlb_page_range src_pte == huge_pte_offset src_pte == huge_pte_offset !src_pte so no sharing !src_pte so no sharing (time passes) hugetlb_fault hugetlb_fault huge_pte_alloc huge_pte_alloc huge_pmd_share huge_pmd_share LOCK(i_mmap_mutex) find nothing, no sharing UNLOCK(i_mmap_mutex) LOCK(i_mmap_mutex) find nothing, no sharing UNLOCK(i_mmap_mutex) pmd_alloc pmd_alloc LOCK(instantiation_mutex) fault UNLOCK(instantiation_mutex) LOCK(instantiation_mutex) fault UNLOCK(instantiation_mutex) These two processes are not poing to the same data page but are not sharing page tables because the opportunity was missed. 
When either process later forks, the src_pte == dst pte is potentially insufficient. As the check falls through, the wrong PTE information is copied in (harmless but wrong) and the mapcount is bumped for a page mapped by a shared page table leading to the BUG_ON. This patch addresses the issue by moving pmd_alloc into huge_pmd_share which guarantees that the shared pud is populated in the same critical section as pmd. This also means that huge_pte_offset test in huge_pmd_share is serialized correctly now which in turn means that the success of the sharing will be higher as the racing tasks see the pud and pmd populated together. Race identified and changelog written mostly by Mel Gorman. {akpm@linux-foundation.org: attempt to make the huge_pmd_share() comment comprehensible, clean up coding style] Reported-by: Larry Woodman Tested-by: Larry Woodman Reviewed-by: Mel Gorman Signed-off-by: Michal Hocko Reviewed-by: Rik van Riel Cc: David Gibson Cc: Ken Chen Cc: Cong Wang Cc: Hillf Danton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f6679a7..b91e485 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -56,9 +56,16 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) } /* - * search for a shareable pmd page for hugetlb. + * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() + * and returns the corresponding pte. While this is not necessary for the + * !shared pmd case because we can allocate the pmd later as well, it makes the + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_mutex section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. 
*/ -static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +static pte_t * +huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { struct vm_area_struct *vma = find_vma(mm, addr); struct address_space *mapping = vma->vm_file->f_mapping; @@ -68,9 +75,10 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; + pte_t *pte; if (!vma_shareable(vma, addr)) - return; + return (pte_t *)pmd_alloc(mm, pud, addr); mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { @@ -97,7 +105,9 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: + pte = (pte_t *)pmd_alloc(mm, pud, addr); mutex_unlock(&mapping->i_mmap_mutex); + return pte; } /* @@ -142,8 +152,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, } else { BUG_ON(sz != PMD_SIZE); if (pud_none(*pud)) - huge_pmd_share(mm, addr, pud); - pte = (pte_t *) pmd_alloc(mm, pud, addr); + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); } } BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); -- cgit v1.1 From d3a8009b1731abb7026a840d1c2701f877f9429f Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Mon, 6 Aug 2012 22:13:00 +0800 Subject: x86/irq/i8259: Fix incorrect comment Signed-off-by: Yuanhan Liu Signed-off-by: Ingo Molnar --- arch/x86/kernel/i8259.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 36d1853..9a5c460 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -263,7 +263,7 @@ static void i8259A_shutdown(void) * out of. */ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ } static struct syscore_ops i8259_syscore_ops = { -- cgit v1.1 From 83be4ffa1acbcd529b771f4d2e639b15e2b7957e Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 14 Aug 2012 14:47:37 -0700 Subject: x86/spinlocks: Fix comment in spinlock.h This comment is no longer true. We support up to 2^16 CPUs because __ticket_t is an u16 if NR_CPUS is larger than 256. Signed-off-by: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/include/asm/spinlock.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b315a33..33692ea 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -12,8 +12,7 @@ * Simple spin lock operations. There are two variants, one clears IRQ's * on the local processor, one does not. * - * These are fair FIFO ticket locks, which are currently limited to 256 - * CPUs. + * These are fair FIFO ticket locks, which support up to 2^16 CPUs. * * (the type definitions are in asm/spinlock_types.h) */ -- cgit v1.1 From 2530cd4f448935c74eeb49f29559589928e4b2f0 Mon Sep 17 00:00:00 2001 From: "Liu, Chuansheng" Date: Tue, 14 Aug 2012 06:55:01 +0000 Subject: x86/fixup_irq: Use cpu_online_mask instead of cpu_all_mask When one CPU is going down and this CPU is the last one in irq affinity, current code is setting cpu_all_mask as the new affinity for that irq. 
But for some systems (such as in Medfield Android mobile) the firmware sends the interrupt to each CPU in the irq affinity mask, averaged, and cpu_all_mask includes all potential CPUs, i.e. offline ones as well. So replace cpu_all_mask with cpu_online_mask. Signed-off-by: liu chuansheng Acked-by: Yanmin Zhang Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/r/27240C0AC20F114CBF8149A2696CBE4A137286@SHSMSX101.ccr.corp.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7ad683d..d44f782 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -270,7 +270,7 @@ void fixup_irqs(void) if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; - affinity = cpu_all_mask; + affinity = cpu_online_mask; } chip = irq_data_get_irq_chip(data); -- cgit v1.1 From cb09cad44f07044d9810f18f6f9a6a6f3771f979 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 22 Aug 2012 13:03:48 +0300 Subject: x86/alternatives: Fix p6 nops on non-modular kernels Probably a leftover from the early days of self-patching, p6nops are marked __initconst_or_module, which causes them to be discarded in a non-modular kernel. If something later triggers patching, it will overwrite kernel code with garbage. Reported-by: Tomas Racek Signed-off-by: Avi Kivity Cc: Michael Tokarev Cc: Borislav Petkov Cc: Marcelo Tosatti Cc: qemu-devel@nongnu.org Cc: Anthony Liguori Cc: H. Peter Anvin Cc: Alan Cox Cc: Alan Cox Link: http://lkml.kernel.org/r/5034AE84.90708@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index afb7ff7..ced4534 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -165,7 +165,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = #endif #ifdef P6_NOP1 -static const unsigned char __initconst_or_module p6nops[] = +static const unsigned char p6nops[] = { P6_NOP1, P6_NOP2, -- cgit v1.1 From 8e3d9d061b5d132217629e7b5635ff0c02488e65 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 10:57:42 +0800 Subject: KVM: x86: fix possible infinite loop caused by reexecute_instruction Currently, we reexecute all unhandleable instructions as long as they do not access MMIO; however, this does not work if the host maps readonly memory into the guest. If the instruction tries to write this kind of memory, it will fault again when the guest retries it, and we end up in an infinite loop: retry instruction -> write #PF -> emulation fail -> retry instruction -> ...
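The check being replaced is visible in the x86.c hunk below; in simplified form, the old logic was:

        /* old behaviour, simplified: retry whenever the gfn has any host mapping,
         * even one the host maps read-only -- the retried write just faults again */
        if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
                return true;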
Fix it by retrying the instruction only when it faults on the writable memory Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fb0d937..704680d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4473,6 +4473,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa; + pfn_t pfn; if (tdp_enabled) return false; @@ -4490,8 +4491,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) if (gpa == UNMAPPED_GVA) return true; /* let cpu generate fault */ - if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) + /* + * Do not retry the unhandleable instruction if it faults on the + * readonly host memory, otherwise it will goto a infinite loop: + * retry instruction -> write #PF -> emulation fail -> retry + * instruction -> ... + */ + pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); + if (!is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); return true; + } return false; } -- cgit v1.1 From 037d92dc5d4691ae7cf44699c55ca83b1b441992 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 10:59:12 +0800 Subject: KVM: introduce gfn_to_pfn_memslot_atomic It can instead of hva_to_pfn_atomic Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9651c2cd..5548971 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2510,15 +2510,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; - unsigned long hva; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) return KVM_PFN_ERR_FAULT; - hva = gfn_to_hva_memslot(slot, gfn); - - return hva_to_pfn_atomic(hva); + return gfn_to_pfn_memslot_atomic(slot, gfn); } static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, -- cgit v1.1 From 4d8b81abc47b83a1939e59df2fdb0e98dfe0eedd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 11:02:51 +0800 Subject: KVM: introduce readonly memslot In current code, if we map a readonly memory space from host to guest and the page is not currently mapped in the host, we will get a fault pfn and async is not allowed, then the vm will crash We introduce readonly memory region to map ROM/ROMD to the guest, read access is happy for readonly memslot, write access on readonly memslot will cause KVM_EXIT_MMIO exit Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + arch/x86/kvm/mmu.c | 9 +++++++++ arch/x86/kvm/x86.c | 1 + 3 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 246617e..521bf25 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -25,6 +25,7 @@ #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE #define __KVM_HAVE_XCRS +#define __KVM_HAVE_READONLY_MEM /* Architectural interrupt line count. 
*/ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5548971..8e312a2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2647,6 +2647,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) { + /* + * Do not cache the mmio info caused by writing the readonly gfn + * into the spte otherwise read access on readonly gfn also can + * caused mmio page fault and treat it as mmio access. + * Return 1 to tell kvm to emulate it. + */ + if (pfn == KVM_PFN_ERR_RO_FAULT) + return 1; + if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 704680d..42bbf41 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2175,6 +2175,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: + case KVM_CAP_READONLY_MEM: r = 1; break; case KVM_CAP_COALESCED_MMIO: -- cgit v1.1 From 35f2d16bb9ace0fb2671b8232839944ad9057c6f Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 20 Aug 2012 18:35:39 +0900 Subject: KVM: MMU: Fix mmu_shrink() so that it can free mmu pages as intended Although the possible race described in commit 85b7059169e128c57a3a8a3e588fb89cb2031da1 KVM: MMU: fix shrinking page from the empty mmu was correct, the real cause of that issue was a more trivial bug of mmu_shrink() introduced by commit 1952639665e92481c34c34c3e2a71bf3e66ba362 KVM: MMU: do not iterate over all VMs in mmu_shrink() Here is the bug: if (kvm->arch.n_used_mmu_pages > 0) { if (!nr_to_scan--) break; continue; } We skip VMs whose n_used_mmu_pages is not zero and try to shrink others: in other words we try to shrink empty ones by mistake. This patch reverses the logic so that mmu_shrink() can free pages from the first VM whose n_used_mmu_pages is not zero. Note that we also add comments explaining the role of nr_to_scan which is not practically important now, hoping this will be improved in the future. Signed-off-by: Takuya Yoshikawa Cc: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 01ca004..7fbd0d2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4113,16 +4113,21 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) LIST_HEAD(invalid_list); /* + * Never scan more than sc->nr_to_scan VM instances. + * Will not hit this condition practically since we do not try + * to shrink more than one VM and it is very unlikely to see + * !n_used_mmu_pages so many times. + */ + if (!nr_to_scan--) + break; + /* * n_used_mmu_pages is accessed without holding kvm->mmu_lock * here. We may skip a VM instance errorneosly, but we do not * want to shrink a VM that only started to populate its MMU * anyway. 
*/ - if (kvm->arch.n_used_mmu_pages > 0) { - if (!nr_to_scan--) - break; + if (!kvm->arch.n_used_mmu_pages) continue; - } idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); -- cgit v1.1 From 5ad105e569c45dcfad50d724c61d5061248be755 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 19 Aug 2012 14:34:31 +0300 Subject: KVM: x86 emulator: use stack size attribute to mask rsp in stack ops The sub-register used to access the stack (sp, esp, or rsp) is not determined by the address size attribute like other memory references, but by the stack segment's B bit (if not in x86_64 mode). Fix by using the existing stack_mask() to figure out the correct mask. This long-existing bug was exposed by a combination of a27685c33acccce (emulate invalid guest state by default), which causes many more instructions to be emulated, and a seabios change (possibly a bug) which causes the high 16 bits of esp to become polluted across calls to real mode software interrupts. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 97d9a99..a3b57a2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -475,13 +475,26 @@ register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg) return address_mask(ctxt, reg); } +static void masked_increment(ulong *reg, ulong mask, int inc) +{ + assign_masked(reg, *reg + inc, mask); +} + static inline void register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) { + ulong mask; + if (ctxt->ad_bytes == sizeof(unsigned long)) - *reg += inc; + mask = ~0UL; else - *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); + mask = ad_mask(ctxt); + masked_increment(reg, mask, inc); +} + +static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) +{ + masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc); } static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -1522,8 +1535,8 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) { struct segmented_address addr; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); + rsp_increment(ctxt, -bytes); + addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; return segmented_write(ctxt, addr, data, bytes); @@ -1542,13 +1555,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, int rc; struct segmented_address addr; - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); + addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len); + rsp_increment(ctxt, len); return rc; } @@ -1688,8 +1701,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) while (reg >= VCPU_REGS_RAX) { if (reg == VCPU_REGS_RSP) { - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], - ctxt->op_bytes); + rsp_increment(ctxt, ctxt->op_bytes); --reg; } @@ -2825,7 +2837,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); + rsp_increment(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; } 
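To illustrate the masking arithmetic, the following standalone sketch (a minimal example, not part of the patch) uses 0xffff to stand in for stack_mask() of a 16-bit stack segment; note that the upper bits of the register survive the increment:

        #include <stdio.h>

        /* same arithmetic as assign_masked()/masked_increment() above */
        static void masked_increment(unsigned long *reg, unsigned long mask, int inc)
        {
                *reg = (*reg & ~mask) | ((*reg + inc) & mask);
        }

        int main(void)
        {
                unsigned long rsp = 0x12000;            /* bit 16 must stay set */

                masked_increment(&rsp, 0xffff, -2);     /* push 2 bytes on a 16-bit stack */
                printf("rsp = %#lx\n", rsp);            /* prints 0x11ffe */
                return 0;
        }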
-- cgit v1.1 From 36bf50d7697be18c6bfd0401e037df10bff1e573 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 31 Jul 2012 15:41:45 +0200 Subject: x86, microcode, AMD: Fix broken ucode patch size check This issue was recently observed on an AMD C-50 CPU where a patch of maximum size was applied. Commit be62adb49294 ("x86, microcode, AMD: Simplify ucode verification") added current_size in get_matching_microcode(). This is calculated as size of the ucode patch + 8 (ie. size of the header). Later this is compared against the maximum possible ucode patch size for a CPU family. And of course this fails if the patch has already maximum size. Cc: [3.3+] Signed-off-by: Andreas Herrmann Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-1-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/microcode_amd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 8a2ce8f..82746f9 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -143,11 +143,12 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr, unsigned int *current_size) { struct microcode_header_amd *mc_hdr; - unsigned int actual_size; + unsigned int actual_size, patch_size; u16 equiv_cpu_id; /* size of the current patch we're staring at */ - *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; + patch_size = *(u32 *)(ucode_ptr + 4); + *current_size = patch_size + SECTION_HDR_SIZE; equiv_cpu_id = find_equiv_id(); if (!equiv_cpu_id) @@ -174,7 +175,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr, /* * now that the header looks sane, verify its size */ - actual_size = verify_ucode_size(cpu, *current_size, leftover_size); + actual_size = verify_ucode_size(cpu, patch_size, leftover_size); if (!actual_size) return 0; -- cgit v1.1 From 4dbf32c30f7e5379588a692c7bf80b05d9ef8026 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Jul 2012 19:05:53 +0200 Subject: x86, microcode: Save an indentation level in reload_for_cpu Invert the uci->valid check so that the later block can be aligned on the first indentation level of the function. No functional change. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-3-git-send-email-bp@amd64.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/microcode_core.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 4873e62..63a9568 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -276,19 +276,18 @@ static struct platform_device *microcode_pdev; static int reload_for_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; int err = 0; - if (uci->valid) { - enum ucode_state ustate; - - ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); - if (ustate == UCODE_OK) - apply_microcode_on_target(cpu); - else - if (ustate == UCODE_ERROR) - err = -EINVAL; - } + if (!uci->valid) + return err; + ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; return err; } -- cgit v1.1 From bb9d3e473d5b324907e15dff4e54410b28ea50e2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Aug 2012 15:26:50 +0200 Subject: x86, microcode: Drop uci->mc check on resume path Remove the uci->mc check on the cpu resume path because the low-level drivers do that anyway. More importantly, though, this fixes a contrived and obscure but still important case. Imagine the following: * boot machine, no new microcode in /lib/firmware * a subset of the CPUs is offlined * in the meantime, user puts new fresh microcode container into /lib/firmware and reloads it by doing $ echo 1 > /sys/devices/system/cpu/microcode/reload * offlined cores come back online and they don't get the newer microcode applied due to this check. Later patches take care of the issue on AMD. While at it, cleanup code around it. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-4-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/microcode_core.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 63a9568..706a5c9 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -369,13 +369,10 @@ static void microcode_fini_cpu(int cpu) static enum ucode_state microcode_resume_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!uci->mc) - return UCODE_NFOUND; - pr_debug("CPU%d updated upon resume\n", cpu); - apply_microcode_on_target(cpu); + + if (apply_microcode_on_target(cpu)) + return UCODE_ERROR; return UCODE_OK; } @@ -404,14 +401,11 @@ static enum ucode_state microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - enum ucode_state ustate; if (uci->valid) - ustate = microcode_resume_cpu(cpu); - else - ustate = microcode_init_cpu(cpu); + return microcode_resume_cpu(cpu); - return ustate; + return microcode_init_cpu(cpu); } static int mc_device_add(struct device *dev, struct subsys_interface *sif) -- cgit v1.1 From 09c3f0d883300c8fc2bb62e9a70cf89a2ada9a80 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Jul 2012 20:15:10 +0200 Subject: x86, microcode: Cleanup cpu hotplug notifier callback Mask out CPU_TASKS_FROZEN bit so that all _FROZEN cases can be dropped. Also, add some more comments as to why CPU_ONLINE falls through to CPU_DOWN_FAILED (no break), and for the CPU_DEAD case.
Realign debug printks better. Idea blatantly stolen from a tglx patch: http://marc.info/?l=linux-kernel&m=134267779513862 Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-5-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/microcode_core.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 706a5c9..dcde544 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -470,34 +470,41 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) struct device *dev; dev = get_cpu_device(cpu); - switch (action) { + + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: microcode_update_cpu(cpu); - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: pr_debug("CPU%d added\n", cpu); + /* + * "break" is missing on purpose here because we want to fall + * through in order to create the sysfs group. + */ + + case CPU_DOWN_FAILED: if (sysfs_create_group(&dev->kobj, &mc_attr_group)) pr_err("Failed to create group for CPU%d\n", cpu); break; + case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); break; /* + * case CPU_DEAD: + * * When a CPU goes offline, don't free up or invalidate the copy of * the microcode in kernel memory, so that we can reuse it when the * CPU comes back online without unnecessarily requesting the userspace * for it again. */ - case CPU_UP_CANCELED_FROZEN: - /* The CPU refused to come up during a system resume */ - microcode_fini_cpu(cpu); - break; } + + /* The CPU refused to come up during a system resume */ + if (action == CPU_UP_CANCELED_FROZEN) + microcode_fini_cpu(cpu); + return NOTIFY_OK; } -- cgit v1.1 From e43f6e67ec1c142550860bbe0b51166c5ee4cac8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 1 Aug 2012 19:17:01 +0200 Subject: x86, microcode: Straighten out Kconfig text Update and clarify Kconfig help text along with menu names. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-6-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ec3a1a..1ccccc6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -982,25 +982,25 @@ config X86_REBOOTFIXUPS Say N otherwise. config MICROCODE - tristate "/dev/cpu/microcode - microcode support" + tristate "CPU microcode loading support" select FW_LOADER ---help--- + If you say Y here, you will be able to update the microcode on certain Intel and AMD processors. The Intel support is for the - IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, - Pentium 4, Xeon etc. The AMD support is for family 0x10 and - 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra. - You will obviously need the actual microcode binary data itself - which is not shipped with the Linux kernel. + IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, + Xeon etc. The AMD support is for families 0x10 and later. You will + obviously need the actual microcode binary data itself which is not + shipped with the Linux kernel. 
This option selects the general module only, you need to select at least one vendor specific module as well. - To compile this driver as a module, choose M here: the - module will be called microcode. + To compile this driver as a module, choose M here: the module + will be called microcode. config MICROCODE_INTEL - bool "Intel microcode patch loading support" + bool "Intel microcode loading support" depends on MICROCODE default MICROCODE select FW_LOADER @@ -1013,7 +1013,7 @@ config MICROCODE_INTEL . config MICROCODE_AMD - bool "AMD microcode patch loading support" + bool "AMD microcode loading support" depends on MICROCODE select FW_LOADER ---help--- -- cgit v1.1 From e7e632f5ba240fbc313c49ed6559681ea57534e9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jul 2012 14:12:21 +0200 Subject: x86, microcode, AMD: Remove useless get_ucode_data wrapper get_ucode_data was a trivial memcpy wrapper. Remove it so as not to obfuscate code unnecessarily with no obvious gain. No functional change. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-7-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/microcode.h | 6 ------ arch/x86/kernel/microcode_amd.c | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 4ebe157..8813be6 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -49,12 +49,6 @@ static inline struct microcode_ops * __init init_intel_microcode(void) #ifdef CONFIG_MICROCODE_AMD extern struct microcode_ops * __init init_amd_microcode(void); extern void __exit exit_amd_microcode(void); - -static inline void get_ucode_data(void *to, const u8 *from, size_t n) -{ - memcpy(to, from, n); -} - #else static inline struct microcode_ops * __init init_amd_microcode(void) { diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 82746f9..9421338 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -183,7 +183,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr, memset(patch, 0, PAGE_SIZE); /* all looks ok, get the binary patch */ - get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size); + memcpy(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size); return actual_size; } @@ -238,7 +238,7 @@ static int install_equiv_cpu_table(const u8 *buf) return -ENOMEM; } - get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); + memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); /* add header length */ return size + CONTAINER_HDR_SZ; -- cgit v1.1 From 685ca6d797af9d41164dd64dd60145d4946fc152 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 20 Jun 2012 16:17:51 +0200 Subject: x86, microcode, AMD: Check before applying a patch Make sure we're actually applying a microcode patch to a core which really needs it. This brings only a very very very minor slowdown on F10: 0.032218828 sec vs 0.056010626 sec with this patch. And small speedup on F15: 0.487089449 sec vs 0.180551162 sec (from perf output). Also, fixup comments while at it. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-8-git-send-email-bp@amd64.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/microcode_amd.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 9421338..8fdf7d9 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -202,11 +202,18 @@ static int apply_microcode_amd(int cpu) if (mc_amd == NULL) return 0; - wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); - /* get patch id after patching */ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* check current patch id and patch's id for match */ + /* need to apply patch? */ + if (rev >= mc_amd->hdr.patch_id) { + c->microcode = rev; + return 0; + } + + wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + + /* verify patch application was successful */ + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); if (rev != mc_amd->hdr.patch_id) { pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); -- cgit v1.1 From 5f5b747282c6cc57b91baba37f76de27398b9e60 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 25 Jul 2012 20:06:54 +0200 Subject: x86, microcode, AMD: Read CPUID(1).EAX on the correct cpu Read the CPUID(1).EAX leaf at the correct cpu and use it to search the equivalence table for matching microcode patch. No functionality change. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-9-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/microcode_amd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 8fdf7d9..25d34b1 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -82,6 +82,7 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); + csig->sig = cpuid_eax(0x00000001); csig->rev = c->microcode; pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); @@ -118,16 +119,15 @@ static unsigned int verify_ucode_size(int cpu, u32 patch_size, return patch_size; } -static u16 find_equiv_id(void) +static u16 find_equiv_id(unsigned int cpu) { - unsigned int current_cpu_id, i = 0; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int i = 0; BUG_ON(equiv_cpu_table == NULL); - current_cpu_id = cpuid_eax(0x00000001); - while (equiv_cpu_table[i].installed_cpu != 0) { - if (current_cpu_id == equiv_cpu_table[i].installed_cpu) + if (uci->cpu_sig.sig == equiv_cpu_table[i].installed_cpu) return equiv_cpu_table[i].equiv_cpu; i++; @@ -150,7 +150,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr, patch_size = *(u32 *)(ucode_ptr + 4); *current_size = patch_size + SECTION_HDR_SIZE; - equiv_cpu_id = find_equiv_id(); + equiv_cpu_id = find_equiv_id(cpu); if (!equiv_cpu_id) return 0; -- cgit v1.1 From 48e30685caa8bdc4b8d4417d8ac31db59689742c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Jul 2012 15:51:00 +0200 Subject: x86, microcode: Add a refresh firmware flag to ->request_microcode_fw This is done in preparation for teaching the ucode driver to either load a new ucode patches container from userspace or use an already cached version. No functionality change in this patch. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-10-git-send-email-bp@amd64.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/microcode.h | 4 ++-- arch/x86/kernel/microcode_amd.c | 3 ++- arch/x86/kernel/microcode_core.c | 11 ++++++----- arch/x86/kernel/microcode_intel.c | 3 ++- 4 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 8813be6..43d921b 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -15,8 +15,8 @@ struct microcode_ops { enum ucode_state (*request_microcode_user) (int cpu, const void __user *buf, size_t size); - enum ucode_state (*request_microcode_fw) (int cpu, - struct device *device); + enum ucode_state (*request_microcode_fw) (int cpu, struct device *, + bool refresh_fw); void (*microcode_fini_cpu) (int cpu); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 25d34b1..94ecdaa 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -330,7 +330,8 @@ out: * * These might be larger than 2K. */ -static enum ucode_state request_microcode_amd(int cpu, struct device *device) +static enum ucode_state request_microcode_amd(int cpu, struct device *device, + bool refresh_fw) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; const struct firmware *fw; diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index dcde544..9420972 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -282,7 +282,7 @@ static int reload_for_cpu(int cpu) if (!uci->valid) return err; - ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); + ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true); if (ustate == UCODE_OK) apply_microcode_on_target(cpu); else @@ -377,7 +377,7 @@ static enum ucode_state microcode_resume_cpu(int cpu) return UCODE_OK; } -static enum ucode_state microcode_init_cpu(int cpu) +static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) { enum ucode_state ustate; @@ -388,7 +388,8 @@ static enum ucode_state microcode_init_cpu(int cpu) if (system_state != SYSTEM_RUNNING) return UCODE_NFOUND; - ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); + ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, + refresh_fw); if (ustate == UCODE_OK) { pr_debug("CPU%d updated upon init\n", cpu); @@ -405,7 +406,7 @@ static enum ucode_state microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (uci->valid) return microcode_resume_cpu(cpu); - return microcode_init_cpu(cpu); + return microcode_init_cpu(cpu, false); } static int mc_device_add(struct device *dev, struct subsys_interface *sif) @@ -421,7 +422,7 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif) if (err) return err; - if (microcode_init_cpu(cpu) == UCODE_ERROR) + if (microcode_init_cpu(cpu, true) == UCODE_ERROR) return -EINVAL; return err; diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 0327e2b..3544aed 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -405,7 +405,8 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } -static enum ucode_state request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device, + bool refresh_fw) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); -- cgit v1.1 From c96d2c0905cc48e34f2b37b775b59932c416b343 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 1 Aug 2012 14:55:01 +0200
Subject: x86, microcode, AMD: Add reverse equiv table search We search the equivalence table using the CPUID(1) signature of the CPU in order to get the equivalence ID of the patch which we need to apply. Add a function which does the reverse - it will be needed in later patches. While at it, pull the other equiv table function up in the file so that it can be used by other functionality without forward declarations. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-11-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/microcode_amd.c | 46 +++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 94ecdaa..03ed5af 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -78,6 +78,36 @@ static struct equiv_cpu_entry *equiv_cpu_table; /* page-sized ucode patch buffer */ void *patch; +static u16 find_equiv_id(unsigned int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int i = 0; + + BUG_ON(equiv_cpu_table == NULL); + + while (equiv_cpu_table[i].installed_cpu != 0) { + if (uci->cpu_sig.sig == equiv_cpu_table[i].installed_cpu) + return equiv_cpu_table[i].equiv_cpu; + + i++; + } + return 0; +} + +static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu) +{ + int i = 0; + + BUG_ON(!equiv_cpu_table); + + while (equiv_cpu_table[i].equiv_cpu != 0) { + if (equiv_cpu == equiv_cpu_table[i].equiv_cpu) + return equiv_cpu_table[i].installed_cpu; + i++; + } + return 0; +} + static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -119,22 +149,6 @@ static unsigned int verify_ucode_size(int cpu, u32 patch_size, return patch_size; } -static u16 find_equiv_id(unsigned int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - int i = 0; - - BUG_ON(equiv_cpu_table == NULL); - - while (equiv_cpu_table[i].installed_cpu != 0) { - if (uci->cpu_sig.sig == equiv_cpu_table[i].installed_cpu) - return equiv_cpu_table[i].equiv_cpu; - - i++; - } - return 0; -} - /* * we signal a good patch is found by returning its size > 0 */ -- cgit v1.1 From a3eb3b4da106a23b5d41bf915726172e31654a65 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 1 Aug 2012 15:38:18 +0200 Subject: x86, microcode, AMD: Add a small, per-family patches cache This is a trivial cache which collects all ucode patches for the current family of CPUs on the system. If a newer patch appears due to the container file being updated in userspace, we replace our cached version with the new one. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-12-git-send-email-bp@amd64.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/microcode_amd.c | 67 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 03ed5af..cacdc9a 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -78,12 +78,22 @@ static struct equiv_cpu_entry *equiv_cpu_table; /* page-sized ucode patch buffer */ void *patch; +struct ucode_patch { + struct list_head plist; + void *data; + u32 patch_id; + u16 equiv_cpu; +}; + +static LIST_HEAD(pcache); + static u16 find_equiv_id(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int i = 0; - BUG_ON(equiv_cpu_table == NULL); + if (!equiv_cpu_table) + return 0; while (equiv_cpu_table[i].installed_cpu != 0) { if (uci->cpu_sig.sig == equiv_cpu_table[i].installed_cpu) @@ -108,6 +118,61 @@ static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu) return 0; } +/* + * a small, trivial cache of per-family ucode patches + */ +static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +{ + struct ucode_patch *p; + + list_for_each_entry(p, &pcache, plist) + if (p->equiv_cpu == equiv_cpu) + return p; + return NULL; +} + +static void update_cache(struct ucode_patch *new_patch) +{ + struct ucode_patch *p; + + list_for_each_entry(p, &pcache, plist) { + if (p->equiv_cpu == new_patch->equiv_cpu) { + if (p->patch_id >= new_patch->patch_id) + /* we already have the latest patch */ + return; + + list_replace(&p->plist, &new_patch->plist); + kfree(p->data); + kfree(p); + return; + } + } + /* no patch found, add it */ + list_add_tail(&new_patch->plist, &pcache); +} + +static void free_cache(void) +{ + struct ucode_patch *p; + + list_for_each_entry_reverse(p, &pcache, plist) { + __list_del(p->plist.prev, p->plist.next); + kfree(p->data); + kfree(p); + } +} + +static struct ucode_patch *find_patch(unsigned int cpu) +{ + u16 equiv_id; + + equiv_id = find_equiv_id(cpu); + if (!equiv_id) + return NULL; + + return cache_find_patch(equiv_id); +} + static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); -- cgit v1.1 From 2efb05e8e9fa3510044e007b90263c73b6a83f84 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 1 Aug 2012 16:16:13 +0200 Subject: x86, microcode, AMD: Rewrite patch application procedure Limit the access to userspace only on the BSP where we load the container, verify the patches in it and put them in the patch cache. Then, at application time, we lookup the correct patch in the cache and use it. When we need to reload the userspace container, we do that over the reload interface: echo 1 > /sys/devices/system/cpu/microcode/reload which reloads (a possibly newer) container from userspace and applies then the newest patches from there. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1344361461-10076-13-git-send-email-bp@amd64.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/microcode_amd.c | 236 ++++++++++++++++++++-------------------- 1 file changed, 121 insertions(+), 115 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index cacdc9a..5511216 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -75,9 +75,6 @@ struct microcode_amd { static struct equiv_cpu_entry *equiv_cpu_table; -/* page-sized ucode patch buffer */ -void *patch; - struct ucode_patch { struct list_head plist; void *data; @@ -184,7 +181,7 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return 0; } -static unsigned int verify_ucode_size(int cpu, u32 patch_size, +static unsigned int verify_patch_size(int cpu, u32 patch_size, unsigned int size) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -214,73 +211,25 @@ static unsigned int verify_ucode_size(int cpu, u32 patch_size, return patch_size; } -/* - * we signal a good patch is found by returning its size > 0 - */ -static int get_matching_microcode(int cpu, const u8 *ucode_ptr, - unsigned int leftover_size, int rev, - unsigned int *current_size) -{ - struct microcode_header_amd *mc_hdr; - unsigned int actual_size, patch_size; - u16 equiv_cpu_id; - - /* size of the current patch we're staring at */ - patch_size = *(u32 *)(ucode_ptr + 4); - *current_size = patch_size + SECTION_HDR_SIZE; - - equiv_cpu_id = find_equiv_id(cpu); - if (!equiv_cpu_id) - return 0; - - /* - * let's look at the patch header itself now - */ - mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE); - - if (mc_hdr->processor_rev_id != equiv_cpu_id) - return 0; - - /* ucode might be chipset specific -- currently we don't support this */ - if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { - pr_err("CPU%d: chipset specific code not yet supported\n", - cpu); - return 0; - } - - if (mc_hdr->patch_id <= rev) - return 0; - - /* - * now that the header looks sane, verify its size - */ - actual_size = verify_ucode_size(cpu, patch_size, leftover_size); - if (!actual_size) - return 0; - - /* clear the patch buffer */ - memset(patch, 0, PAGE_SIZE); - - /* all looks ok, get the binary patch */ - memcpy(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size); - - return actual_size; -} - static int apply_microcode_amd(int cpu) { - u32 rev, dummy; - int cpu_num = raw_smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - struct microcode_amd *mc_amd = uci->mc; struct cpuinfo_x86 *c = &cpu_data(cpu); + struct microcode_amd *mc_amd; + struct ucode_cpu_info *uci; + struct ucode_patch *p; + u32 rev, dummy; + + BUG_ON(raw_smp_processor_id() != cpu); - /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); + uci = ucode_cpu_info + cpu; - if (mc_amd == NULL) + p = find_patch(cpu); + if (!p) return 0; + mc_amd = p->data; + uci->mc = p->data; + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); /* need to apply patch? */ @@ -336,61 +285,113 @@ static void free_equiv_cpu_table(void) equiv_cpu_table = NULL; } -static enum ucode_state -generic_load_microcode(int cpu, const u8 *data, size_t size) +static void cleanup(void) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct microcode_header_amd *mc_hdr = NULL; - unsigned int mc_size, leftover, current_size = 0; + free_equiv_cpu_table(); + free_cache(); +} + +/* + * We return the current size even if some of the checks failed so that + * we can skip over the next patch. 
If we return a negative value, we + * signal a grave error like a memory allocation has failed and the + * driver cannot continue functioning normally. In such cases, we tear + * down everything we've used up so far and exit. + */ +static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct microcode_header_amd *mc_hdr; + struct ucode_patch *patch; + unsigned int patch_size, crnt_size, ret; + u32 proc_fam; + u16 proc_id; + + patch_size = *(u32 *)(fw + 4); + crnt_size = patch_size + SECTION_HDR_SIZE; + mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); + proc_id = mc_hdr->processor_rev_id; + + proc_fam = find_cpu_family_by_equiv_cpu(proc_id); + if (!proc_fam) { + pr_err("No patch family for equiv ID: 0x%04x\n", proc_id); + return crnt_size; + } + + /* check if patch is for the current family */ + proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); + if (proc_fam != c->x86) + return crnt_size; + + if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { + pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", + mc_hdr->patch_id); + return crnt_size; + } + + ret = verify_patch_size(cpu, patch_size, leftover); + if (!ret) { + pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); + return crnt_size; + } + + patch = kzalloc(sizeof(*patch), GFP_KERNEL); + if (!patch) { + pr_err("Patch allocation failure.\n"); + return -EINVAL; + } + + patch->data = kzalloc(patch_size, GFP_KERNEL); + if (!patch->data) { + pr_err("Patch data allocation failure.\n"); + kfree(patch); + return -EINVAL; + } + + /* All looks ok, copy patch... */ + memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); + INIT_LIST_HEAD(&patch->plist); + patch->patch_id = mc_hdr->patch_id; + patch->equiv_cpu = proc_id; + + /* ... and add to cache. */ + update_cache(patch); + + return crnt_size; +} + +static enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size) +{ + enum ucode_state ret = UCODE_ERROR; + unsigned int leftover; + u8 *fw = (u8 *)data; + int crnt_size = 0; int offset; - const u8 *ucode_ptr = data; - void *new_mc = NULL; - unsigned int new_rev = uci->cpu_sig.rev; - enum ucode_state state = UCODE_ERROR; - offset = install_equiv_cpu_table(ucode_ptr); + offset = install_equiv_cpu_table(data); if (offset < 0) { pr_err("failed to create equivalent cpu table\n"); - goto out; + return ret; } - ucode_ptr += offset; + fw += offset; leftover = size - offset; - if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) { + if (*(u32 *)fw != UCODE_UCODE_TYPE) { pr_err("invalid type field in container file section header\n"); - goto free_table; + free_equiv_cpu_table(); + return ret; } while (leftover) { - mc_size = get_matching_microcode(cpu, ucode_ptr, leftover, - new_rev, ¤t_size); - if (mc_size) { - mc_hdr = patch; - new_mc = patch; - new_rev = mc_hdr->patch_id; - goto out_ok; - } + crnt_size = verify_and_add_patch(cpu, fw, leftover); + if (crnt_size < 0) + return ret; - ucode_ptr += current_size; - leftover -= current_size; + fw += crnt_size; + leftover -= crnt_size; } - if (!new_mc) { - state = UCODE_NFOUND; - goto free_table; - } - -out_ok: - uci->mc = new_mc; - state = UCODE_OK; - pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", - cpu, uci->cpu_sig.rev, new_rev); - -free_table: - free_equiv_cpu_table(); - -out: - return state; + return UCODE_OK; } /* @@ -401,7 +402,7 @@ out: * * This legacy file is always smaller than 2K in size. 
* - * Starting at family 15h they are in family specific firmware files: + * Beginning with family 15h, they are in family-specific firmware files: * * amd-ucode/microcode_amd_fam15h.bin * amd-ucode/microcode_amd_fam16h.bin @@ -413,9 +414,13 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, bool refresh_fw) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; - const struct firmware *fw; - enum ucode_state ret = UCODE_NFOUND; struct cpuinfo_x86 *c = &cpu_data(cpu); + enum ucode_state ret = UCODE_NFOUND; + const struct firmware *fw; + + /* reload ucode container only on the boot cpu */ + if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) + return UCODE_OK; if (c->x86 >= 0x15) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); @@ -431,12 +436,17 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = generic_load_microcode(cpu, fw->data, fw->size); + /* free old equiv table */ + free_equiv_cpu_table(); + + ret = load_microcode_amd(cpu, fw->data, fw->size); + if (ret != UCODE_OK) + cleanup(); -fw_release: + fw_release: release_firmware(fw); -out: + out: return ret; } @@ -470,14 +480,10 @@ struct microcode_ops * __init init_amd_microcode(void) return NULL; } - patch = (void *)get_zeroed_page(GFP_KERNEL); - if (!patch) - return NULL; - return µcode_amd_ops; } void __exit exit_amd_microcode(void) { - free_page((unsigned long)patch); + cleanup(); } -- cgit v1.1 From 90993cdd1800dc6ef9587431a0c625b978584e81 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 16 Aug 2012 17:00:19 -0300 Subject: x86: KVM guest: merge CONFIG_KVM_CLOCK into CONFIG_KVM_GUEST The distinction between CONFIG_KVM_CLOCK and CONFIG_KVM_GUEST is not so clear anymore, as demonstrated by recent bugs caused by poor handling of on/off combinations of these options. Merge CONFIG_KVM_CLOCK into CONFIG_KVM_GUEST. Reported-By: OGAWA Hirofumi Signed-off-by: Marcelo Tosatti --- arch/x86/Kconfig | 21 ++++++++------------- arch/x86/include/asm/kvm_para.h | 4 ++-- arch/x86/kernel/Makefile | 3 +-- arch/x86/kernel/kvm.c | 2 -- arch/x86/kernel/setup.c | 2 +- 5 files changed, 12 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ec3a1a..a42e2e9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -573,23 +573,18 @@ config PARAVIRT_TIME_ACCOUNTING source "arch/x86/xen/Kconfig" -config KVM_CLOCK - bool "KVM paravirtualized clock" - select PARAVIRT - select PARAVIRT_CLOCK - ---help--- - Turning on this option will allow you to run a paravirtualized clock - when running over the KVM hypervisor. Instead of relying on a PIT - (or probably other) emulation by the underlying device model, the host - provides the guest with timing infrastructure such as time of day, and - system time - config KVM_GUEST - bool "KVM Guest support" + bool "KVM Guest support (including kvmclock)" + select PARAVIRT select PARAVIRT + select PARAVIRT_CLOCK + default y if PARAVIRT_GUEST ---help--- This option enables various optimizations for running under the KVM - hypervisor. + hypervisor. 
It includes a paravirtualized clock, so that instead + of relying on a PIT (or probably other) emulation by the + underlying device model, the host provides the guest with + timing infrastructure such as time of day, and system time source "arch/x86/lguest/Kconfig" diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 20f5697..eb3e9d8 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -102,14 +102,14 @@ struct kvm_vcpu_pv_apf_data { extern void kvmclock_init(void); extern int kvm_register_clock(char *txt); -#ifdef CONFIG_KVM_CLOCK +#ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); #else static inline bool kvm_check_and_clear_guest_paused(void) { return false; } -#endif /* CONFIG_KVMCLOCK */ +#endif /* CONFIG_KVM_GUEST */ /* This instruction is vmcall. On non-VT architectures, it will generate a * trap that we will then rewrite to the appropriate instruction. diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8215e56..7203298 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -81,8 +81,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o -obj-$(CONFIG_KVM_GUEST) += kvm.o -obj-$(CONFIG_KVM_CLOCK) += kvmclock.o +obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1596cc8..b3e5e51 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -397,9 +397,7 @@ void kvm_disable_steal_time(void) #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { -#ifdef CONFIG_KVM_CLOCK WARN_ON(kvm_register_clock("primary cpu clock")); -#endif kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f4b9b80..b3386ae 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -957,7 +957,7 @@ void __init setup_arch(char **cmdline_p) initmem_init(); memblock_find_dma_reserve(); -#ifdef CONFIG_KVM_CLOCK +#ifdef CONFIG_KVM_GUEST kvmclock_init(); #endif -- cgit v1.1 From 816afe4ff98ee10b1d30fd66361be132a0a5cee6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 6 Aug 2012 17:29:49 +0930 Subject: x86/smp: Don't ever patch back to UP if we unplug cpus We still patch SMP instructions to UP variants if we boot with a single CPU, but not at any other time. In particular, not if we unplug CPUs to return to a single cpu. Paul McKenney points out: mean offline overhead is 6251/48=130.2 milliseconds. If I remove the alternatives_smp_switch() from the offline path [...] the mean offline overhead is 550/42=13.1 milliseconds Basically, we're never going to get those 120ms back, and the code is pretty messy. We get rid of: 1) The "smp-alt-once" boot option. It's actually "smp-alt-boot", the documentation is wrong. It's now the default. 2) The skip_smp_alternatives flag used by suspend. 3) arch_disable_nonboot_cpus_begin() and arch_disable_nonboot_cpus_end() which were only used to set this one flag. 
Signed-off-by: Rusty Russell Cc: Paul McKenney Cc: Suresh Siddha Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/87vcgwwive.fsf@rustcorp.com.au Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 4 +- arch/x86/kernel/alternative.c | 107 +++++++++---------------------------- arch/x86/kernel/smpboot.c | 20 +------ arch/x86/xen/smp.c | 6 +-- 4 files changed, 32 insertions(+), 105 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 7078068..444704c 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -60,7 +60,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end); extern void alternatives_smp_module_del(struct module *mod); -extern void alternatives_smp_switch(int smp); +extern void alternatives_enable_smp(void); extern int alternatives_text_reserved(void *start, void *end); extern bool skip_smp_alternatives; #else @@ -68,7 +68,7 @@ static inline void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) {} static inline void alternatives_smp_module_del(struct module *mod) {} -static inline void alternatives_smp_switch(int smp) {} +static inline void alternatives_enable_smp(void) {} static inline int alternatives_text_reserved(void *start, void *end) { return 0; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index afb7ff7..af1f326 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -23,19 +23,6 @@ #define MAX_PATCH_LEN (255-1) -#ifdef CONFIG_HOTPLUG_CPU -static int smp_alt_once; - -static int __init bootonly(char *str) -{ - smp_alt_once = 1; - return 1; -} -__setup("smp-alt-boot", bootonly); -#else -#define smp_alt_once 1 -#endif - static int __initdata_or_module debug_alternative; static int __init debug_alt(char *str) @@ -326,9 +313,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, { const s32 *poff; - if (noreplace_smp) - return; - mutex_lock(&text_mutex); for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; @@ -359,7 +343,7 @@ struct smp_alt_module { }; static LIST_HEAD(smp_alt_modules); static DEFINE_MUTEX(smp_alt); -static int smp_mode = 1; /* protected by smp_alt */ +static bool uniproc_patched = false; /* protected by smp_alt */ void __init_or_module alternatives_smp_module_add(struct module *mod, char *name, @@ -368,19 +352,18 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, { struct smp_alt_module *smp; - if (noreplace_smp) - return; + mutex_lock(&smp_alt); + if (!uniproc_patched) + goto unlock; - if (smp_alt_once) { - if (boot_cpu_has(X86_FEATURE_UP)) - alternatives_smp_unlock(locks, locks_end, - text, text_end); - return; - } + if (num_possible_cpus() == 1) + /* Don't bother remembering, we'll never have to undo it. */ + goto smp_unlock; smp = kzalloc(sizeof(*smp), GFP_KERNEL); if (NULL == smp) - return; /* we'll run the (safe but slow) SMP code then ... */ + /* we'll run the (safe but slow) SMP code then ... 
*/ + goto unlock; smp->mod = mod; smp->name = name; @@ -392,11 +375,10 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, __func__, smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); - mutex_lock(&smp_alt); list_add_tail(&smp->next, &smp_alt_modules); - if (boot_cpu_has(X86_FEATURE_UP)) - alternatives_smp_unlock(smp->locks, smp->locks_end, - smp->text, smp->text_end); +smp_unlock: + alternatives_smp_unlock(locks, locks_end, text, text_end); +unlock: mutex_unlock(&smp_alt); } @@ -404,24 +386,18 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; - if (smp_alt_once || noreplace_smp) - return; - mutex_lock(&smp_alt); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; list_del(&item->next); - mutex_unlock(&smp_alt); - DPRINTK("%s: %s\n", __func__, item->name); kfree(item); - return; + break; } mutex_unlock(&smp_alt); } -bool skip_smp_alternatives; -void alternatives_smp_switch(int smp) +void alternatives_enable_smp(void) { struct smp_alt_module *mod; @@ -436,34 +412,21 @@ void alternatives_smp_switch(int smp) pr_info("lockdep: fixing up alternatives\n"); #endif - if (noreplace_smp || smp_alt_once || skip_smp_alternatives) - return; - BUG_ON(!smp && (num_online_cpus() > 1)); + /* Why bother if there are no other CPUs? */ + BUG_ON(num_possible_cpus() == 1); mutex_lock(&smp_alt); - /* - * Avoid unnecessary switches because it forces JIT based VMs to - * throw away all cached translations, which can be quite costly. - */ - if (smp == smp_mode) { - /* nothing */ - } else if (smp) { + if (uniproc_patched) { pr_info("switching to SMP code\n"); + BUG_ON(num_online_cpus() != 1); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); - } else { - pr_info("switching to UP code\n"); - set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); - set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); - list_for_each_entry(mod, &smp_alt_modules, next) - alternatives_smp_unlock(mod->locks, mod->locks_end, - mod->text, mod->text_end); + uniproc_patched = false; } - smp_mode = smp; mutex_unlock(&smp_alt); } @@ -540,40 +503,22 @@ void __init alternative_instructions(void) apply_alternatives(__alt_instructions, __alt_instructions_end); - /* switch to patch-once-at-boottime-only mode and free the - * tables in case we know the number of CPUs will never ever - * change */ -#ifdef CONFIG_HOTPLUG_CPU - if (num_possible_cpus() < 2) - smp_alt_once = 1; -#endif - #ifdef CONFIG_SMP - if (smp_alt_once) { - if (1 == num_possible_cpus()) { - pr_info("switching to UP code\n"); - set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); - set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); - - alternatives_smp_unlock(__smp_locks, __smp_locks_end, - _text, _etext); - } - } else { + /* Patch to UP if other cpus not imminent. 
*/ + if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { + uniproc_patched = true; alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); - - /* Only switch to UP mode if we don't immediately boot others */ - if (num_present_cpus() == 1 || setup_max_cpus <= 1) - alternatives_smp_switch(0); } -#endif - apply_paravirt(__parainstructions, __parainstructions_end); - if (smp_alt_once) + if (!uniproc_patched || num_possible_cpus() == 1) free_init_pages("SMP alternatives", (unsigned long)__smp_locks, (unsigned long)__smp_locks_end); +#endif + + apply_paravirt(__parainstructions, __parainstructions_end); restart_nmi(); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7c5a8c3..c80a33b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -665,7 +665,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long boot_error = 0; int timeout; - alternatives_smp_switch(1); + /* Just in case we booted with a single CPU. */ + alternatives_enable_smp(); idle->thread.sp = (unsigned long) (((struct pt_regs *) (THREAD_SIZE + task_stack_page(idle))) - 1); @@ -1053,20 +1054,6 @@ out: preempt_enable(); } -void arch_disable_nonboot_cpus_begin(void) -{ - /* - * Avoid the smp alternatives switch during the disable_nonboot_cpus(). - * In the suspend path, we will be back in the SMP mode shortly anyways. - */ - skip_smp_alternatives = true; -} - -void arch_disable_nonboot_cpus_end(void) -{ - skip_smp_alternatives = false; -} - void arch_enable_nonboot_cpus_begin(void) { set_mtrr_aps_delayed_init(); @@ -1256,9 +1243,6 @@ void native_cpu_die(unsigned int cpu) if (per_cpu(cpu_state, cpu) == CPU_DEAD) { if (system_state == SYSTEM_RUNNING) pr_info("CPU %u is now offline\n", cpu); - - if (1 == num_online_cpus()) - alternatives_smp_switch(0); return; } msleep(100); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index f58dca7..353c50f 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -377,7 +377,8 @@ static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle) return rc; if (num_online_cpus() == 1) - alternatives_smp_switch(1); + /* Just in case we booted with a single CPU. */ + alternatives_enable_smp(); rc = xen_smp_intr_init(cpu); if (rc) @@ -424,9 +425,6 @@ static void xen_cpu_die(unsigned int cpu) unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); - - if (num_online_cpus() == 1) - alternatives_smp_switch(0); } static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ -- cgit v1.1 From c96aae1f7f393387d160211f60398d58463a7e65 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 17 Aug 2012 16:43:28 -0400 Subject: xen/setup: Fix one-off error when adding for-balloon PFNs to the P2M. When we are finished with return PFNs to the hypervisor, then populate it back, and also mark the E820 MMIO and E820 gaps as IDENTITY_FRAMEs, we then call P2M to set areas that can be used for ballooning. We were off by one, and ended up over-writting a P2M entry that most likely was an IDENTITY_FRAME. For example: 1-1 mapping on 40000->40200 1-1 mapping on bc558->bc5ac 1-1 mapping on bc5b4->bc8c5 1-1 mapping on bc8c6->bcb7c 1-1 mapping on bcd00->100000 Released 614 pages of unused memory Set 277889 page(s) to 1-1 mapping Populating 40200-40466 pfn range: 614 pages added => here we set from 40466 up to bc559 P2M tree to be INVALID_P2M_ENTRY. We should have done it up to bc558. 
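For illustration only (this sketch is not part of the patch), the bug is an inclusive loop bound where a half-open range was intended; set_invalid() below is a hypothetical stand-in for __set_phys_to_machine(pfn, INVALID_P2M_ENTRY):

	/* hypothetical stand-in for __set_phys_to_machine(pfn, INVALID_P2M_ENTRY) */
	static void set_invalid(unsigned long pfn) { (void)pfn; }

	static void mark_range_invalid(unsigned long start, unsigned long limit)
	{
		unsigned long pfn;

		/* buggy: '<=' also clobbers the entry at 'limit' itself */
		for (pfn = start; pfn <= limit; pfn++)
			set_invalid(pfn);

		/* fixed: half-open [start, limit), the entry at 'limit' survives */
		for (pfn = start; pfn < limit; pfn++)
			set_invalid(pfn);
	}

With 'limit' corresponding to bc558 above, the buggy form overwrites the 1-1 entry at bc558, while the fixed form stops at bc557.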
The end result is that if anybody is trying to construct a PTE for PFN bc558 they end up with ~PAGE_PRESENT. CC: stable@vger.kernel.org Reported-by-and-Tested-by: Andre Przywara Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ead8557..d11ca11 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -78,9 +78,16 @@ static void __init xen_add_extra_mem(u64 start, u64 size) memblock_reserve(start, size); xen_max_p2m_pfn = PFN_DOWN(start + size); + for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) + continue; + WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", + pfn, mfn); - for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } } static unsigned long __init xen_do_chunk(unsigned long start, -- cgit v1.1 From bd3f79b71de0410352ab506496a467fcb0620912 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 22 Aug 2012 17:20:14 +0100 Subject: xen: Introduce xen_pfn_t for pfn and mfn types All the original Xen headers have xen_pfn_t as mfn and pfn type, however when they have been imported in Linux, xen_pfn_t has been replaced with unsigned long. That might work for x86 and ia64 but it does not for arm. Bring back xen_pfn_t and let each architecture define xen_pfn_t as they see fit. Signed-off-by: Stefano Stabellini Acked-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/interface.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index a93db16..555f94d 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -47,6 +47,10 @@ #endif #ifndef __ASSEMBLY__ +/* Explicitly size integers that represent pfns in the public interface + * with Xen so that on ARM we can have one ABI that works for 32 and 64 + * bit guests. */ +typedef unsigned long xen_pfn_t; /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); @@ -57,6 +61,7 @@ DEFINE_GUEST_HANDLE(long); DEFINE_GUEST_HANDLE(void); DEFINE_GUEST_HANDLE(uint64_t); DEFINE_GUEST_HANDLE(uint32_t); +DEFINE_GUEST_HANDLE(xen_pfn_t); #endif #ifndef HYPERVISOR_VIRT_START -- cgit v1.1 From 1a1d43318aeb74d679372c0b65029957be274529 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 22 Aug 2012 17:20:16 +0100 Subject: xen: allow privcmd for HVM guests This patch removes the "return -ENOSYS" for auto_translated_physmap guests from privcmd_mmap, thus it allows ARM guests to issue privcmd mmap calls. However privcmd mmap calls are still going to fail for HVM and hybrid guests on x86 because the xen_remap_domain_mfn_range implementation is currently PV only. Changes in v2: - better commit message; - return -EINVAL from xen_remap_domain_mfn_range if auto_translated_physmap. 
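For illustration only (not part of this patch), a caller on an auto-translated guest now simply gets -EINVAL back from the PV-only remap path; the wrapper below is hypothetical and assumes the xen_remap_domain_mfn_range() argument list used elsewhere in this series:

	#include <xen/xen-ops.h>	/* assumed to declare xen_remap_domain_mfn_range() */

	/* sketch only: name, header, and error handling are illustrative */
	static int try_foreign_map(struct vm_area_struct *vma, unsigned long addr,
				   unsigned long mfn, int nr,
				   pgprot_t prot, unsigned int domid)
	{
		int rc = xen_remap_domain_mfn_range(vma, addr, mfn, nr, prot, domid);

		if (rc == -EINVAL)
			pr_debug("auto_translated_physmap: PV-only remap refused\n");
		return rc;
	}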
Signed-off-by: Stefano Stabellini Acked-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3a73785..885a223 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2310,6 +2310,9 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long range; int err = 0; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -EINVAL; + prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == -- cgit v1.1 From 6d7083eee3bc088d1fc30eefabd6263bca40c95a Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 13 Aug 2012 11:00:08 -0400 Subject: xen/swiotlb: Fix compile warnings when using plain integer instead of NULL pointer. arch/x86/xen/pci-swiotlb-xen.c:96:1: warning: Using plain integer as NULL pointer arch/x86/xen/pci-swiotlb-xen.c:96:1: warning: Using plain integer as NULL pointer Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/pci-swiotlb-xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 2d58b3f..1ab4594 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -63,6 +63,6 @@ void __init pci_xen_swiotlb_init(void) } } IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, - 0, + NULL, pci_xen_swiotlb_init, - 0); + NULL); -- cgit v1.1 From d57c5d51a30152f3175d2344cb6395f08bf8ee0c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Feb 2011 13:32:18 -0500 Subject: ftrace/x86: Add support for -mfentry to x86_64 If the kernel is compiled with gcc 4.6.0 which supports -mfentry, then use that instead of mcount. With mcount, frame pointers are forced with the -pg option and we get something like: : 55 push %rbp 48 89 e5 mov %rsp,%rbp 53 push %rbx 41 51 push %r9 e8 fe 6a 39 00 callq ffffffff81483d00 31 c0 xor %eax,%eax 48 89 fb mov %rdi,%rbx 48 89 d7 mov %rdx,%rdi 48 33 73 30 xor 0x30(%rbx),%rsi 48 f7 c6 ff ff ff f7 test $0xfffffffff7ffffff,%rsi With -mfentry, frame pointers are no longer forced and the call looks like this: : e8 33 af 37 00 callq ffffffff81461b40 <__fentry__> 53 push %rbx 48 89 fb mov %rdi,%rbx 31 c0 xor %eax,%eax 48 89 d7 mov %rdx,%rdi 41 51 push %r9 48 33 73 30 xor 0x30(%rbx),%rsi 48 f7 c6 ff ff ff f7 test $0xfffffffff7ffffff,%rsi This adds the ftrace hook at the beginning of the function before a frame is set up, and allows the function callbacks to be able to access parameters. As kprobes now can use function tracing (at least on x86) this speeds up the kprobe hooks that are at the beginning of the function. 
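For illustration only (not part of this patch), the practical effect of hooking before the frame is built is that an entry handler still sees the function's arguments in the argument registers. A minimal kprobes module sketch for x86_64 follows; the target symbol and the use of regs->di are illustrative, and whether the probe is actually ftrace-optimized depends on the kernel configuration:

	#include <linux/module.h>
	#include <linux/kprobes.h>

	static int entry_handler(struct kprobe *p, struct pt_regs *regs)
	{
		/* at function entry on x86_64 the first integer argument is in %rdi */
		pr_info("%s: arg0=%lx\n", p->symbol_name, regs->di);
		return 0;
	}

	static struct kprobe kp = {
		.symbol_name	= "do_sys_open",	/* illustrative target */
		.pre_handler	= entry_handler,
	};

	static int __init kp_init(void)  { return register_kprobe(&kp); }
	static void __exit kp_exit(void) { unregister_kprobe(&kp); }
	module_init(kp_init);
	module_exit(kp_exit);
	MODULE_LICENSE("GPL");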
Link: http://lkml.kernel.org/r/20120807194100.130477900@goodmis.org Acked-by: Ingo Molnar Reviewed-by: Masami Hiramatsu Cc: Andi Kleen Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 7 ++++++- arch/x86/kernel/entry_64.S | 32 +++++++++++++++++++++++++++----- arch/x86/kernel/x8664_ksyms_64.c | 6 +++++- 4 files changed, 39 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a2d19ee..28dd891 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -36,6 +36,7 @@ config X86 select HAVE_KRETPROBES select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FENTRY if X86_64 select HAVE_C_RECORDMCOUNT select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index a6cae0c..9a25b52 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -35,7 +35,11 @@ #endif #ifdef CONFIG_FUNCTION_TRACER -#define MCOUNT_ADDR ((long)(mcount)) +#ifdef CC_USING_FENTRY +# define MCOUNT_ADDR ((long)(__fentry__)) +#else +# define MCOUNT_ADDR ((long)(mcount)) +#endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ #ifdef CONFIG_DYNAMIC_FTRACE @@ -46,6 +50,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); extern atomic_t modifying_ftrace_code; +extern void __fentry__(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b7a81dc..ed767b7 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -68,10 +68,18 @@ .section .entry.text, "ax" #ifdef CONFIG_FUNCTION_TRACER + +#ifdef CC_USING_FENTRY +# define function_hook __fentry__ +#else +# define function_hook mcount +#endif + #ifdef CONFIG_DYNAMIC_FTRACE -ENTRY(mcount) + +ENTRY(function_hook) retq -END(mcount) +END(function_hook) /* skip is set if stack has been adjusted */ .macro ftrace_caller_setup skip=0 @@ -84,7 +92,11 @@ END(mcount) movq RIP(%rsp), %rdi subq $MCOUNT_INSN_SIZE, %rdi /* Load the parent_ip into the second parameter */ +#ifdef CC_USING_FENTRY + movq SS+16(%rsp), %rsi +#else movq 8(%rbp), %rsi +#endif .endm ENTRY(ftrace_caller) @@ -177,7 +189,8 @@ END(ftrace_regs_caller) #else /* ! 
CONFIG_DYNAMIC_FTRACE */ -ENTRY(mcount) + +ENTRY(function_hook) cmpl $0, function_trace_stop jne ftrace_stub @@ -199,7 +212,11 @@ trace: MCOUNT_SAVE_FRAME movq RIP(%rsp), %rdi +#ifdef CC_USING_FENTRY + movq SS+16(%rsp), %rsi +#else movq 8(%rbp), %rsi +#endif subq $MCOUNT_INSN_SIZE, %rdi call *ftrace_trace_function @@ -207,7 +224,7 @@ trace: MCOUNT_RESTORE_FRAME jmp ftrace_stub -END(mcount) +END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ @@ -215,9 +232,14 @@ END(mcount) ENTRY(ftrace_graph_caller) MCOUNT_SAVE_FRAME +#ifdef CC_USING_FENTRY + leaq SS+16(%rsp), %rdi + movq $0, %rdx /* No framepointers needed */ +#else leaq 8(%rbp), %rdi - movq RIP(%rsp), %rsi movq (%rbp), %rdx +#endif + movq RIP(%rsp), %rsi subq $MCOUNT_INSN_SIZE, %rsi call prepare_ftrace_return diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 6020f6f..1330dd1 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -13,9 +13,13 @@ #include #ifdef CONFIG_FUNCTION_TRACER -/* mcount is defined in assembly */ +/* mcount and __fentry__ are defined in assembly */ +#ifdef CC_USING_FENTRY +EXPORT_SYMBOL(__fentry__); +#else EXPORT_SYMBOL(mcount); #endif +#endif EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); -- cgit v1.1 From 51faaf2b0d5c7f44d82964f0c70b1c4e44d4e633 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 22 Aug 2012 13:00:10 -0400 Subject: Revert "xen/x86: Workaround 64-bit hypervisor and 32-bit initial domain." and "xen/x86: Use memblock_reserve for sensitive areas." This reverts commit 806c312e50f122c47913145cf884f53dd09d9199 and commit 59b294403e9814e7c1154043567f0d71bac7a511. And also documents setup.c and why we want to do it that way, which is that we tried to make the the memblock_reserve more selective so that it would be clear what region is reserved. Sadly we ran in the problem wherein on a 64-bit hypervisor with a 32-bit initial domain, the pt_base has the cr3 value which is not neccessarily where the pagetable starts! As Jan put it: " Actually, the adjustment turns out to be correct: The page tables for a 32-on-64 dom0 get allocated in the order "first L1", "first L2", "first L3", so the offset to the page table base is indeed 2. When reading xen/include/public/xen.h's comment very strictly, this is not a violation (since there nothing is said that the first thing in the page table space is pointed to by pt_base; I admit that this seems to be implied though, namely do I think that it is implied that the page table space is the range [pt_base, pt_base + nt_pt_frames), whereas that range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), which - without a priori knowledge - the kernel would have difficulty to figure out)." - so lets just fall back to the easy way and reserve the whole region. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 60 ------------------------------------------------ arch/x86/xen/p2m.c | 5 ---- arch/x86/xen/setup.c | 27 ++++++++++++++++++++++ 3 files changed, 27 insertions(+), 65 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 511f92d..ff962d4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -998,66 +998,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } -/* - * If the MFN is not in the m2p (provided to us by the hypervisor) this - * function won't do anything. 
In practice this means that the XenBus - * MFN won't be available for the initial domain. */ -static unsigned long __init xen_reserve_mfn(unsigned long mfn) -{ - unsigned long pfn, end_pfn = 0; - - if (!mfn) - return end_pfn; - - pfn = mfn_to_pfn(mfn); - if (phys_to_machine_mapping_valid(pfn)) { - end_pfn = PFN_PHYS(pfn) + PAGE_SIZE; - memblock_reserve(PFN_PHYS(pfn), end_pfn); - } - return end_pfn; -} -static void __init xen_reserve_internals(void) -{ - unsigned long size; - unsigned long last_phys = 0; - - if (!xen_pv_domain()) - return; - - /* xen_start_info does not exist in the M2P, hence can't use - * xen_reserve_mfn. */ - memblock_reserve(__pa(xen_start_info), PAGE_SIZE); - last_phys = __pa(xen_start_info) + PAGE_SIZE; - - last_phys = max(xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info)), last_phys); - last_phys = max(xen_reserve_mfn(xen_start_info->store_mfn), last_phys); - if (!xen_initial_domain()) - last_phys = max(xen_reserve_mfn(xen_start_info->console.domU.mfn), last_phys); - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - /* - * ALIGN up to compensate for the p2m_page pointing to an array that - * can partially filled (look in xen_build_dynamic_phys_to_machine). - */ - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - - /* We could use xen_reserve_mfn here, but would end up looping quite - * a lot (and call memblock_reserve for each PAGE), so lets just use - * the easy way and reserve it wholesale. */ - memblock_reserve(__pa(xen_start_info->mfn_list), size); - last_phys = max(__pa(xen_start_info->mfn_list) + size, last_phys); - /* The pagetables are reserved in mmu.c */ - - /* Under 64-bit hypervisor with a 32-bit domain, the hypervisor - * offsets the pt_base by two pages. Hence the reservation that is done - * in mmu.c misses two pages. We correct it here if we detect this. */ - if (last_phys < __pa(xen_start_info->pt_base)) - memblock_reserve(last_phys, __pa(xen_start_info->pt_base) - last_phys); -} void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1421,7 +1362,6 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); - xen_reserve_internals(); /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 6a2bfa4..e4adbfb 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -388,11 +388,6 @@ void __init xen_build_dynamic_phys_to_machine(void) } m2p_override_init(); - - /* NOTE: We cannot call memblock_reserve here for the mfn_list as there - * isn't enough pieces to make it work (for one - we are still using the - * Xen provided pagetable). Do it later in xen_reserve_internals. - */ } unsigned long get_phys_to_machine(unsigned long pfn) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9efca75..740517be 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -424,6 +424,33 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); + /* + * Reserve Xen bits: + * - mfn_list + * - xen_start_info + * See comment above "struct start_info" in + * We tried to make the the memblock_reserve more selective so + * that it would be clear what region is reserved. 
Sadly we ran + * in the problem wherein on a 64-bit hypervisor with a 32-bit + * initial domain, the pt_base has the cr3 value which is not + * neccessarily where the pagetable starts! As Jan put it: " + * Actually, the adjustment turns out to be correct: The page + * tables for a 32-on-64 dom0 get allocated in the order "first L1", + * "first L2", "first L3", so the offset to the page table base is + * indeed 2. When reading xen/include/public/xen.h's comment + * very strictly, this is not a violation (since there nothing is said + * that the first thing in the page table space is pointed to by + * pt_base; I admit that this seems to be implied though, namely + * do I think that it is implied that the page table space is the + * range [pt_base, pt_base + nt_pt_frames), whereas that + * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), + * which - without a priori knowledge - the kernel would have + * difficulty to figure out)." - so lets just fall back to the + * easy way and reserve the whole region. + */ + memblock_reserve(__pa(xen_start_info->mfn_list), + xen_start_info->pt_base - xen_start_info->mfn_list); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); return "Xen"; -- cgit v1.1 From 3699aad047e16a5775b1d051425f422a9384270d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 28 Jun 2012 22:47:35 -0400 Subject: xen/mmu: The xen_setup_kernel_pagetable doesn't need to return anything. We don't need to return the new PGD - as we do not use it. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 5 +---- arch/x86/xen/mmu.c | 10 ++-------- arch/x86/xen/xen-ops.h | 2 +- 3 files changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ff962d4..d87a038 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1258,7 +1258,6 @@ asmlinkage void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; int rc; - pgd_t *pgd; if (!xen_start_info) return; @@ -1350,8 +1349,6 @@ asmlinkage void __init xen_start_kernel(void) acpi_numa = -1; #endif - pgd = (pgd_t *)xen_start_info->pt_base; - /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; @@ -1360,7 +1357,7 @@ asmlinkage void __init xen_start_kernel(void) early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); - pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3a73785..4ac21a4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1719,8 +1719,7 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. 
*/ -pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pud_t *l3; pmd_t *l2; @@ -1781,8 +1780,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, memblock_reserve(__pa(xen_start_info->pt_base), xen_start_info->nr_pt_frames * PAGE_SIZE); - - return pgd; } #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); @@ -1825,8 +1822,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } -pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; @@ -1858,8 +1854,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, memblock_reserve(__pa(xen_start_info->pt_base), xen_start_info->nr_pt_frames * PAGE_SIZE); - - return initial_page_table; } #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 202d4c1..2230f57 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -27,7 +27,7 @@ void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); -pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); +void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; -- cgit v1.1 From 4fac153a7a260e40e10008a0d7a272719684e4cd Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 Jul 2012 13:55:25 -0400 Subject: xen/mmu: Provide comments describing the _ka and _va aliasing issue Which is that the level2_kernel_pgt (__ka virtual addresses) and level2_ident_pgt (__va virtual address) contain the same PMD entries. So if you modify a PTE in __ka, it will be reflected in __va (and vice-versa). Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4ac21a4..6ba6100 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1734,19 +1734,36 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) init_level4_pgt[0] = __pgd(0); /* Pre-constructed entries are in pfn, so convert to mfn */ + /* L4[272] -> level3_ident_pgt + * L4[511] -> level3_kernel_pgt */ convert_pfn_mfn(init_level4_pgt); + + /* L3_i[0] -> level2_ident_pgt */ convert_pfn_mfn(level3_ident_pgt); + /* L3_k[510] -> level2_kernel_pgt + * L3_i[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); + /* We get [511][511] and have Xen's version of level2_kernel_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: + * Both L4[272][0] and L4[511][511] have entries that point to the same + * L2 (PMD) tables. Meaning that if you modify it in __va space + * it will be also modified in the __ka space! 
(But if you just + * modify the PMD table to point to other PTE's or none, then you + * are OK - which is what cleanup_highmap does) */ memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + /* Graft it onto L4[511][511] */ memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + /* Get [511][510] and graft that in level2_fixmap_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + /* Note that we don't do anything with level1_fixmap_pgt which + * we don't need. */ /* Set up identity map */ xen_map_identity_early(level2_ident_pgt, max_pfn); -- cgit v1.1 From ae895ed7839f918bbc8d5425b8973b25a534f4eb Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Jul 2012 11:57:04 -0400 Subject: xen/mmu: use copy_page instead of memcpy. After all, this is what it is there for. Acked-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 6ba6100..7247e5a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1754,14 +1754,14 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * it will be also modified in the __ka space! (But if you just * modify the PMD table to point to other PTE's or none, then you * are OK - which is what cleanup_highmap does) */ - memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(level2_ident_pgt, l2); /* Graft it onto L4[511][511] */ - memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(level2_kernel_pgt, l2); /* Get [511][510] and graft that in level2_fixmap_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); - memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(level2_fixmap_pgt, l2); /* Note that we don't do anything with level1_fixmap_pgt which * we don't need. */ @@ -1821,8 +1821,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) */ swapper_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - memcpy(swapper_kernel_pmd, initial_kernel_pmd, - sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(swapper_kernel_pmd, initial_kernel_pmd); swapper_pg_dir[KERNEL_PGD_BOUNDARY] = __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); @@ -1851,11 +1850,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 512*1024); kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(initial_kernel_pmd, kernel_pmd); xen_map_identity_early(initial_kernel_pmd, max_pfn); - memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + copy_page(initial_page_table, pgd); initial_page_table[KERNEL_PGD_BOUNDARY] = __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); -- cgit v1.1 From caaf9ecf16feffa4f1a5a0d617bc78906a114514 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 Jul 2012 13:59:36 -0400 Subject: xen/mmu: For 64-bit do not call xen_map_identity_early B/c we do not need it. During the startup the Xen provides us with all the initial memory mapped that we need to function. The initial memory mapped is up to the bootstack, which means we can reference using __ka up to 4.f): (from xen/interface/xen.h): 4. 
This the order of bootstrap elements in the initial virtual region: a. relocated kernel image b. initial ram disk [mod_start, mod_len] c. list of allocated page frames [mfn_list, nr_pages] d. start_info_t structure [register ESI (x86)] e. bootstrap page tables [pt_base, CR3 (x86)] f. bootstrap stack [register ESP (x86)] (initial ram disk may be ommitted). [v1: More comments in git commit] Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 7247e5a..a59070b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -84,6 +84,7 @@ */ DEFINE_SPINLOCK(xen_reservation_lock); +#ifdef CONFIG_X86_32 /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -91,7 +92,7 @@ DEFINE_SPINLOCK(xen_reservation_lock); */ #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); - +#endif #ifdef CONFIG_X86_64 /* l3 pud for userspace vsyscall mapping */ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; @@ -1628,7 +1629,7 @@ static void set_page_prot(void *addr, pgprot_t prot) if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } - +#ifdef CONFIG_X86_32 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { unsigned pmdidx, pteidx; @@ -1679,7 +1680,7 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) set_page_prot(pmd, PAGE_KERNEL_RO); } - +#endif void __init xen_setup_machphys_mapping(void) { struct xen_machphys_mapping mapping; @@ -1765,14 +1766,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Note that we don't do anything with level1_fixmap_pgt which * we don't need. */ - /* Set up identity map */ - xen_map_identity_early(level2_ident_pgt, max_pfn); - /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); -- cgit v1.1 From 488f046df922af992c1a718eff276529c0510885 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Jul 2012 12:00:56 -0400 Subject: xen/mmu: Recycle the Xen provided L4, L3, and L2 pages As we are not using them. We end up only using the L1 pagetables and grafting those to our page-tables. 
[v1: Per Stefano's suggestion squashed two commits] [v2: Per Stefano's suggestion simplified loop] [v3: Fix smatch warnings] [v4: Add more comments] Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a59070b..b44e6a8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1708,7 +1708,20 @@ static void convert_pfn_mfn(void *v) for (i = 0; i < PTRS_PER_PTE; i++) pte[i] = xen_make_pte(pte[i].pte); } - +static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, + unsigned long addr) +{ + if (*pt_base == PFN_DOWN(__pa(addr))) { + set_page_prot((void *)addr, PAGE_KERNEL); + clear_page((void *)addr); + (*pt_base)++; + } + if (*pt_end == PFN_DOWN(__pa(addr))) { + set_page_prot((void *)addr, PAGE_KERNEL); + clear_page((void *)addr); + (*pt_end)--; + } +} /* * Set up the initial kernel pagetable. * @@ -1724,6 +1737,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pud_t *l3; pmd_t *l2; + unsigned long addr[3]; + unsigned long pt_base, pt_end; + unsigned i; /* max_pfn_mapped is the last pfn mapped in the initial memory * mappings. Considering that on Xen after the kernel mappings we @@ -1731,6 +1747,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * set max_pfn_mapped to the last real pfn mapped. */ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); + pt_end = pt_base + xen_start_info->nr_pt_frames; + /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); @@ -1749,6 +1768,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + addr[0] = (unsigned long)pgd; + addr[1] = (unsigned long)l3; + addr[2] = (unsigned long)l2; /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: * Both L4[272][0] and L4[511][511] have entries that point to the same * L2 (PMD) tables. Meaning that if you modify it in __va space @@ -1782,20 +1804,26 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - /* Switch over */ - pgd = init_level4_pgt; - /* * At this stage there can be no user pgd, and no page * structure to attach it to, so make sure we just set kernel * pgd. */ xen_mc_batch(); - __xen_write_cr3(true, __pa(pgd)); + __xen_write_cr3(true, __pa(init_level4_pgt)); xen_mc_issue(PARAVIRT_LAZY_CPU); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + /* We can't that easily rip out L3 and L2, as the Xen pagetables are + * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for + * the initial domain. For guests using the toolstack, they are in: + * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only + * rip out the [L4] (pgd), but for guests we shave off three pages. 
+ */ + for (i = 0; i < ARRAY_SIZE(addr); i++) + check_pt_base(&pt_base, &pt_end, addr[i]); + + /* Our (by three pages) smaller Xen pagetable that we are using */ + memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); } #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); -- cgit v1.1 From 357a3cfb147ee8e97c6f9cdc51e9a33aa56f7d99 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 19 Jul 2012 13:52:29 -0400 Subject: xen/p2m: Add logic to revector a P2M tree to use __va leafs. During bootup Xen supplies us with a P2M array. It sticks it right after the ramdisk, as can be seen with a 128GB PV guest: (certain parts removed for clarity): xc_dom_build_image: called xc_dom_alloc_segment: kernel : 0xffffffff81000000 -> 0xffffffff81e43000 (pfn 0x1000 + 0xe43 pages) xc_dom_pfn_to_ptr: domU mapping: pfn 0x1000+0xe43 at 0x7f097d8bf000 xc_dom_alloc_segment: ramdisk : 0xffffffff81e43000 -> 0xffffffff925c7000 (pfn 0x1e43 + 0x10784 pages) xc_dom_pfn_to_ptr: domU mapping: pfn 0x1e43+0x10784 at 0x7f0952dd2000 xc_dom_alloc_segment: phys2mach : 0xffffffff925c7000 -> 0xffffffffa25c7000 (pfn 0x125c7 + 0x10000 pages) xc_dom_pfn_to_ptr: domU mapping: pfn 0x125c7+0x10000 at 0x7f0942dd2000 xc_dom_alloc_page : start info : 0xffffffffa25c7000 (pfn 0x225c7) xc_dom_alloc_page : xenstore : 0xffffffffa25c8000 (pfn 0x225c8) xc_dom_alloc_page : console : 0xffffffffa25c9000 (pfn 0x225c9) nr_page_tables: 0x0000ffffffffffff/48: 0xffff000000000000 -> 0xffffffffffffffff, 1 table(s) nr_page_tables: 0x0000007fffffffff/39: 0xffffff8000000000 -> 0xffffffffffffffff, 1 table(s) nr_page_tables: 0x000000003fffffff/30: 0xffffffff80000000 -> 0xffffffffbfffffff, 1 table(s) nr_page_tables: 0x00000000001fffff/21: 0xffffffff80000000 -> 0xffffffffa27fffff, 276 table(s) xc_dom_alloc_segment: page tables : 0xffffffffa25ca000 -> 0xffffffffa26e1000 (pfn 0x225ca + 0x117 pages) xc_dom_pfn_to_ptr: domU mapping: pfn 0x225ca+0x117 at 0x7f097d7a8000 xc_dom_alloc_page : boot stack : 0xffffffffa26e1000 (pfn 0x226e1) xc_dom_build_image : virt_alloc_end : 0xffffffffa26e2000 xc_dom_build_image : virt_pgtab_end : 0xffffffffa2800000 So the physical memory and virtual (using __START_KERNEL_map addresses) layout looks as so: phys __ka /------------\ /-------------------\ | 0 | empty | 0xffffffff80000000| | .. | | .. | | 16MB | <= kernel starts | 0xffffffff81000000| | .. | | | | 30MB | <= kernel ends => | 0xffffffff81e43000| | .. | & ramdisk starts | .. | | 293MB | <= ramdisk ends=> | 0xffffffff925c7000| | .. | & P2M starts | .. | | .. | | .. | | 549MB | <= P2M ends => | 0xffffffffa25c7000| | .. | start_info | 0xffffffffa25c7000| | .. | xenstore | 0xffffffffa25c8000| | .. | cosole | 0xffffffffa25c9000| | 549MB | <= page tables => | 0xffffffffa25ca000| | .. | | | | 550MB | <= PGT end => | 0xffffffffa26e1000| | .. | boot stack | | \------------/ \-------------------/ As can be seen, the ramdisk, P2M and pagetables are taking a bit of __ka addresses space. Which is a problem since the MODULES_VADDR starts at 0xffffffffa0000000 - and P2M sits right in there! This results during bootup with the inability to load modules, with this error: ------------[ cut here ]------------ WARNING: at /home/konrad/ssd/linux/mm/vmalloc.c:106 vmap_page_range_noflush+0x2d9/0x370() Call Trace: [] warn_slowpath_common+0x7a/0xb0 [] ? __raw_callee_save_xen_pmd_val+0x11/0x1e [] warn_slowpath_null+0x15/0x20 [] vmap_page_range_noflush+0x2d9/0x370 [] map_vm_area+0x2d/0x50 [] __vmalloc_node_range+0x160/0x250 [] ? 
module_alloc_update_bounds+0x19/0x80 [] ? load_module+0x66/0x19c0 [] module_alloc+0x5c/0x60 [] ? module_alloc_update_bounds+0x19/0x80 [] module_alloc_update_bounds+0x19/0x80 [] load_module+0xfa3/0x19c0 [] ? security_file_permission+0x86/0x90 [] sys_init_module+0x5a/0x220 [] system_call_fastpath+0x16/0x1b ---[ end trace fd8f7704fdea0291 ]--- vmalloc: allocation failure, allocated 16384 of 20480 bytes modprobe: page allocation failure: order:0, mode:0xd2 Since the __va and __ka are 1:1 up to MODULES_VADDR and cleanup_highmap rids __ka of the ramdisk mapping, what we want to do is similar - get rid of the P2M in the __ka address space. There are two ways of fixing this: 1) All P2M lookups instead of using the __ka address would use the __va address. This means we can safely erase from __ka space the PMD pointers that point to the PFNs for P2M array and be OK. 2). Allocate a new array, copy the existing P2M into it, revector the P2M tree to use that, and return the old P2M to the memory allocate. This has the advantage that it sets the stage for using XEN_ELF_NOTE_INIT_P2M feature. That feature allows us to set the exact virtual address space we want for the P2M - and allows us to boot as initial domain on large machines. So we pick option 2). This patch only lays the groundwork in the P2M code. The patch that modifies the MMU is called "xen/mmu: Copy and revector the P2M tree." Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 1 + 2 files changed, 71 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index e4adbfb..996ee2b 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -389,7 +389,77 @@ void __init xen_build_dynamic_phys_to_machine(void) m2p_override_init(); } +#ifdef CONFIG_X86_64 +#include +unsigned long __init xen_revector_p2m_tree(void) +{ + unsigned long va_start; + unsigned long va_end; + unsigned long pfn; + unsigned long *mfn_list = NULL; + unsigned long size; + + va_start = xen_start_info->mfn_list; + /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long), + * so make sure it is rounded up to that */ + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + va_end = va_start + size; + + /* If we were revectored already, don't do it again. */ + if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET) + return 0; + + mfn_list = alloc_bootmem_align(size, PAGE_SIZE); + if (!mfn_list) { + pr_warn("Could not allocate space for a new P2M tree!\n"); + return xen_start_info->mfn_list; + } + /* Fill it out with INVALID_P2M_ENTRY value */ + memset(mfn_list, 0xFF, size); + + for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx; + unsigned long *mid_p; + + if (!p2m_top[topidx]) + continue; + + if (p2m_top[topidx] == p2m_mid_missing) + continue; + + mididx = p2m_mid_index(pfn); + mid_p = p2m_top[topidx][mididx]; + if (!mid_p) + continue; + if ((mid_p == p2m_missing) || (mid_p == p2m_identity)) + continue; + + if ((unsigned long)mid_p == INVALID_P2M_ENTRY) + continue; + + /* The old va. Rebase it on mfn_list */ + if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { + unsigned long *new; + + new = &mfn_list[pfn]; + + copy_page(new, mid_p); + p2m_top[topidx][mididx] = &mfn_list[pfn]; + p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn]); + } + /* This should be the leafs allocated for identity from _brk. 
*/ + } + return (unsigned long)mfn_list; + +} +#else +unsigned long __init xen_revector_p2m_tree(void) +{ + return 0; +} +#endif unsigned long get_phys_to_machine(unsigned long pfn) { unsigned topidx, mididx, idx; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2230f57..bb5a810 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -45,6 +45,7 @@ void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); +unsigned long __init xen_revector_p2m_tree(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); -- cgit v1.1 From 7f9140626c757b773624b97865cb53c2a8348a69 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Jul 2012 12:47:40 -0400 Subject: xen/mmu: Copy and revector the P2M tree. Please first read the description in "xen/p2m: Add logic to revector a P2M tree to use __va leafs" patch. The 'xen_revector_p2m_tree()' function allocates a new P2M tree copies the contents of the old one in it, and returns the new one. At this stage, the __ka address space (which is what the old P2M tree was using) is partially disassembled. The cleanup_highmap has removed the PMD entries from 0-16MB and anything past _brk_end up to the max_pfn_mapped (which is the end of the ramdisk). We have revectored the P2M tree (and the one for save/restore as well) to use new shiny __va address to new MFNs. The xen_start_info has been taken care of already in 'xen_setup_kernel_pagetable()' and xen_start_info->shared_info in 'xen_setup_shared_info()', so we are free to roam and delete PMD entries - which is exactly what we are going to do. We rip out the __ka for the old P2M array. [v1: Fix smatch warnings] [v2: memset was doing 0 instead of 0xff] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b44e6a8..a640949 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1183,9 +1183,64 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) static void xen_post_allocator_init(void); +#ifdef CONFIG_X86_64 +static void __init xen_cleanhighmap(unsigned long vaddr, + unsigned long vaddr_end) +{ + unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); + + /* NOTE: The loop is more greedy than the cleanup_highmap variant. + * We include the PMD passed in on _both_ boundaries. */ + for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE)); + pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > kernel_end) + set_pmd(pmd, __pmd(0)); + } + /* In case we did something silly, we should crash in this function + * instead of somewhere later and be confusing. */ + xen_mc_flush(); +} +#endif static void __init xen_pagetable_setup_done(pgd_t *base) { +#ifdef CONFIG_X86_64 + unsigned long size; + unsigned long addr; +#endif + xen_setup_shared_info(); +#ifdef CONFIG_X86_64 + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long new_mfn_list; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + /* On 32-bit, we get zero so this never gets executed. */ + new_mfn_list = xen_revector_p2m_tree(); + if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { + /* using __ka address and sticking INVALID_P2M_ENTRY! 
*/ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + /* We should be in __ka space. */ + BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); + addr = xen_start_info->mfn_list; + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + /* We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or xen_start_info->shared_info + * they are in going to crash. Fortunatly we have already revectored + * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + size = roundup(size, PMD_SIZE); + xen_cleanhighmap(addr, addr + size); + + memblock_free(__pa(xen_start_info->mfn_list), size); + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = new_mfn_list; + } + } +#endif xen_post_allocator_init(); } @@ -1824,6 +1879,8 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Our (by three pages) smaller Xen pagetable that we are using */ memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + /* Revector the xen_start_info */ + xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); } #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); -- cgit v1.1 From 3aca7fbc8ede0dd194317b2e3144815128ffb232 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 14 Aug 2012 14:34:00 -0400 Subject: xen/mmu: Remove from __ka space PMD entries for pagetables. Please first read the description in "xen/mmu: Copy and revector the P2M tree." At this stage, the __ka address space (which is what the old P2M tree was using) is partially disassembled. The cleanup_highmap has removed the PMD entries from 0-16MB and anything past _brk_end up to the max_pfn_mapped (which is the end of the ramdisk). The xen_remove_p2m_tree and code around has ripped out the __ka for the old P2M array. Here we continue on doing it to where the Xen page-tables were. It is safe to do it, as the page-tables are addressed using __va. For good measure we delete anything that is within MODULES_VADDR and up to the end of the PMD. At this point the __ka only contains PMD entries for the start of the kernel up to __brk. [v1: Per Stefano's suggestion wrapped the MODULES_VADDR in debug] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a640949..3f8e963 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1240,6 +1240,25 @@ static void __init xen_pagetable_setup_done(pgd_t *base) xen_start_info->mfn_list = new_mfn_list; } } + /* At this stage, cleanup_highmap has already cleaned __ka space + * from _brk_limit way up to the max_pfn_mapped (which is the end of + * the ramdisk). We continue on, erasing PMD entries that point to page + * tables - do note that they are accessible at this stage via __va. + * For good measure we also round up to the PMD - which means that if + * anybody is using __ka address to the initial boot-stack - and try + * to use it - they are going to crash. The xen_start_info has been + * taken care of already in xen_setup_kernel_pagetable. */ + addr = xen_start_info->pt_base; + size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + + xen_cleanhighmap(addr, addr + size); + xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); +#ifdef DEBUG + /* This is superflous and is not neccessary, but you know what + * lets do it. 
The MODULES_VADDR -> MODULES_END should be clear of + * anything at this stage. */ + xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); +#endif #endif xen_post_allocator_init(); } -- cgit v1.1 From 785f62314984ea3af9dd830b020289ba2509ae69 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 14 Aug 2012 16:37:31 -0400 Subject: xen/mmu: Release just the MFN list, not MFN list and part of pagetables. We call memblock_reserve for [start of mfn list] -> [PMD aligned end of mfn list] instead of -> --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f8e963..5b2cb54 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1227,7 +1227,6 @@ static void __init xen_pagetable_setup_done(pgd_t *base) /* We should be in __ka space. */ BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); addr = xen_start_info->mfn_list; - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); /* We roundup to the PMD, which means that if anybody at this stage is * using the __ka address of xen_start_info or xen_start_info->shared_info * they are in going to crash. Fortunatly we have already revectored @@ -1235,6 +1234,7 @@ static void __init xen_pagetable_setup_done(pgd_t *base) size = roundup(size, PMD_SIZE); xen_cleanhighmap(addr, addr + size); + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); memblock_free(__pa(xen_start_info->mfn_list), size); /* And revector! Bye bye old array */ xen_start_info->mfn_list = new_mfn_list; -- cgit v1.1 From 3fc509fc0c590900568ef516a37101d88f3476f5 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 16 Aug 2012 16:38:55 -0400 Subject: xen/p2m: When revectoring deal with holes in the P2M array. When we free the PFNs and then subsequently populate them back during bootup: Freeing 20000-20200 pfn range: 512 pages freed 1-1 mapping on 20000->20200 Freeing 40000-40200 pfn range: 512 pages freed 1-1 mapping on 40000->40200 Freeing bad80-badf4 pfn range: 116 pages freed 1-1 mapping on bad80->badf4 Freeing badf6-bae7f pfn range: 137 pages freed 1-1 mapping on badf6->bae7f Freeing bb000-100000 pfn range: 282624 pages freed 1-1 mapping on bb000->100000 Released 283999 pages of unused memory Set 283999 page(s) to 1-1 mapping Populating 1acb8a-1f20e9 pfn range: 283999 pages added We end up having the P2M array (that is the one that was grafted on the P2M tree) filled with IDENTITY_FRAME or INVALID_P2M_ENTRY) entries. The patch titled "xen/p2m: Reuse existing P2M leafs if they are filled with 1:1 PFNs or INVALID." recycles said slots and replaces the P2M tree leaf's with &mfn_list[xx] with p2m_identity or p2m_missing. And re-uses the P2M array sections for other P2M tree leaf's. For the above mentioned bootup excerpt, the PFNs at 0x20000->0x20200 are going to be IDENTITY based: P2M[0][256][0] -> P2M[0][257][0] get turned in IDENTITY_FRAME. We can re-use that and replace P2M[0][256] to point to p2m_identity. The "old" page (the grafted P2M array provided by Xen) that was at P2M[0][256] gets put somewhere else. Specifically at P2M[6][358], b/c when we populate back: Populating 1acb8a-1f20e9 pfn range: 283999 pages added we fill P2M[6][358][0] (and P2M[6][358], P2M[6][359], ...) with the new MFNs. That is all OK, except when we revector we assume that the PFN count would be the same in the grafted P2M array and in the newly allocated. 
Since that is no longer the case, as we have holes in the P2M that point to p2m_missing or p2m_identity we have to take that into account. [v2: Check for overflow] [v3: Move within the __va check] [v4: Fix the computation] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 996ee2b..c3e9291 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -396,6 +396,7 @@ unsigned long __init xen_revector_p2m_tree(void) unsigned long va_start; unsigned long va_end; unsigned long pfn; + unsigned long pfn_free = 0; unsigned long *mfn_list = NULL; unsigned long size; @@ -442,11 +443,18 @@ unsigned long __init xen_revector_p2m_tree(void) if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { unsigned long *new; - new = &mfn_list[pfn]; + if (pfn_free > (size / sizeof(unsigned long))) { + WARN(1, "Only allocated for %ld pages, but we want %ld!\n", + size / sizeof(unsigned long), pfn_free); + return 0; + } + new = &mfn_list[pfn_free]; copy_page(new, mid_p); - p2m_top[topidx][mididx] = &mfn_list[pfn]; - p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn]); + p2m_top[topidx][mididx] = &mfn_list[pfn_free]; + p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]); + + pfn_free += P2M_PER_PAGE; } /* This should be the leafs allocated for identity from _brk. */ -- cgit v1.1 From 328731876451a837f56e66ffa11de053ed5daf73 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 17 Aug 2012 09:35:31 -0400 Subject: xen/mmu: If the revector fails, don't attempt to revector anything else. If the P2M revectoring would fail, we would try to continue on by cleaning the PMD for L1 (PTE) page-tables. The xen_cleanhighmap is greedy and erases the PMD on both boundaries. Since the P2M array can share the PMD, we would wipe out part of the __ka that is still used in the P2M tree to point to P2M leafs. This fixes it by bypassing the revectoring and continuing on. If the revector fails, a nice WARN is printed so we can still troubleshoot this. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5b2cb54..cb9db72 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1238,7 +1238,8 @@ static void __init xen_pagetable_setup_done(pgd_t *base) memblock_free(__pa(xen_start_info->mfn_list), size); /* And revector! Bye bye old array */ xen_start_info->mfn_list = new_mfn_list; - } + } else + goto skip; } /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of @@ -1259,6 +1260,7 @@ static void __init xen_pagetable_setup_done(pgd_t *base) * anything at this stage. */ xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); #endif +skip: #endif xen_post_allocator_init(); } -- cgit v1.1 From e3e45c01ae690e65f2650e5288b9af802e95a136 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 24 Aug 2012 15:34:34 +0200 Subject: perf/x86: Fix microcode revision check for SNB-PEBS The following patch makes the microcode update code path actually invoke the perf_check_microcode() function and thus potentially renabling SNB PEBS. By default, CONFIG_MICROCODE_OLD_INTERFACE is forced to Y in arch/x86/Kconfig. There is no way to disable this. 
That means that the code path used in arch/x86/kernel/microcode_core.c did not include the call to perf_check_microcode(). Thus, even though the microcode was updated to a version that fixes the SNB PEBS problem, perf_event would still return EOPNOTSUPP when enabling precise sampling. This patch simply adds a call to perf_check_microcode() in the call path used when OLD_INTERFACE=y. Signed-off-by: Stephane Eranian Acked-by: Borislav Petkov Cc: peterz@infradead.org Cc: andi@firstfloor.org Link: http://lkml.kernel.org/r/20120824133434.GA8014@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 4873e62..9e5bcf1 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -225,6 +225,9 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, if (do_microcode_update(buf, len) == 0) ret = (ssize_t)len; + if (ret > 0) + perf_check_microcode(); + mutex_unlock(µcode_mutex); put_online_cpus(); -- cgit v1.1 From 1d92128fe9e30c2340283361957a840f108e4abf Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 26 Aug 2012 18:00:29 +0300 Subject: KVM: x86: fix KVM_GET_MSR for PV EOI KVM_GET_MSR was missing support for PV EOI, which is needed for migration. Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dce75b76..148ed66 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2000,6 +2000,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_STEAL_TIME: data = vcpu->arch.st.msr_val; break; + case MSR_KVM_PV_EOI_EN: + data = vcpu->arch.pv_eoi.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: -- cgit v1.1 From dd856efafe6097a5c9104725c2bca74430423db8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 27 Aug 2012 23:46:17 +0300 Subject: KVM: x86 emulator: access GPRs on demand Instead of populating the entire register file, read in registers as they are accessed, and write back only the modified ones. This saves a VMREAD and VMWRITE on Intel (for rsp, since it is not usually used during emulation), and a two 128-byte copies for the registers. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 20 ++- arch/x86/kvm/emulate.c | 299 +++++++++++++++++++++++-------------- arch/x86/kvm/x86.c | 45 +++--- 3 files changed, 220 insertions(+), 144 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c764f43..282aee5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -86,6 +86,19 @@ struct x86_instruction_info { struct x86_emulate_ops { /* + * read_gpr: read a general purpose register (rax - r15) + * + * @reg: gpr number. + */ + ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg); + /* + * write_gpr: write a general purpose register (rax - r15) + * + * @reg: gpr number. + * @val: value to write. + */ + void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val); + /* * read_std: Read bytes of standard (non-emulated/special) memory. * Used for descriptor reading. * @addr: [IN ] Linear address from which to read. 
@@ -281,8 +294,10 @@ struct x86_emulate_ctxt { bool rip_relative; unsigned long _eip; struct operand memop; + u32 regs_valid; /* bitmaps of registers in _regs[] that can be read */ + u32 regs_dirty; /* bitmaps of registers in _regs[] that have been written */ /* Fields above regs are cleared together. */ - unsigned long regs[NR_VCPU_REGS]; + unsigned long _regs[NR_VCPU_REGS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; @@ -394,4 +409,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); +void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); +void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); + #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e8fb6c5..5e27ba5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -202,6 +202,42 @@ struct gprefix { #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a #define EFLG_RESERVED_ONE_MASK 2 +static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + if (!(ctxt->regs_valid & (1 << nr))) { + ctxt->regs_valid |= 1 << nr; + ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); + } + return ctxt->_regs[nr]; +} + +static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + ctxt->regs_valid |= 1 << nr; + ctxt->regs_dirty |= 1 << nr; + return &ctxt->_regs[nr]; +} + +static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) +{ + reg_read(ctxt, nr); + return reg_write(ctxt, nr); +} + +static void writeback_registers(struct x86_emulate_ctxt *ctxt) +{ + unsigned reg; + + for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16) + ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); +} + +static void invalidate_registers(struct x86_emulate_ctxt *ctxt) +{ + ctxt->regs_dirty = 0; + ctxt->regs_valid = 0; +} + /* * Instruction emulation: * Most instructions are emulated directly via a fragment of inline assembly @@ -374,8 +410,8 @@ struct gprefix { #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ do { \ unsigned long _tmp; \ - ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \ - ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \ + ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \ + ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \ \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "5", "1") \ @@ -494,7 +530,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, in static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) { - masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc); + masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc); } static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -786,14 +822,15 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, * pointer into the block that addresses the relevant register. * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 
*/ -static void *decode_register(u8 modrm_reg, unsigned long *regs, +static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, int highbyte_regs) { void *p; - p = ®s[modrm_reg]; if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) - p = (unsigned char *)®s[modrm_reg & 3] + 1; + p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; + else + p = reg_rmw(ctxt, modrm_reg); return p; } @@ -982,10 +1019,10 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->type = OP_REG; if (ctxt->d & ByteOp) { - op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); + op->addr.reg = decode_register(ctxt, reg, highbyte_regs); op->bytes = 1; } else { - op->addr.reg = decode_register(reg, ctxt->regs, 0); + op->addr.reg = decode_register(ctxt, reg, 0); op->bytes = ctxt->op_bytes; } fetch_register_operand(op); @@ -1020,8 +1057,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if (ctxt->modrm_mod == 3) { op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - op->addr.reg = decode_register(ctxt->modrm_rm, - ctxt->regs, ctxt->d & ByteOp); + op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, ctxt->d & ByteOp); if (ctxt->d & Sse) { op->type = OP_XMM; op->bytes = 16; @@ -1042,10 +1078,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; if (ctxt->ad_bytes == 2) { - unsigned bx = ctxt->regs[VCPU_REGS_RBX]; - unsigned bp = ctxt->regs[VCPU_REGS_RBP]; - unsigned si = ctxt->regs[VCPU_REGS_RSI]; - unsigned di = ctxt->regs[VCPU_REGS_RDI]; + unsigned bx = reg_read(ctxt, VCPU_REGS_RBX); + unsigned bp = reg_read(ctxt, VCPU_REGS_RBP); + unsigned si = reg_read(ctxt, VCPU_REGS_RSI); + unsigned di = reg_read(ctxt, VCPU_REGS_RDI); /* 16-bit ModR/M decode. */ switch (ctxt->modrm_mod) { @@ -1102,17 +1138,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) modrm_ea += insn_fetch(s32, ctxt); else { - modrm_ea += ctxt->regs[base_reg]; + modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); } if (index_reg != 4) - modrm_ea += ctxt->regs[index_reg] << scale; + modrm_ea += reg_read(ctxt, index_reg) << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; } else { base_reg = ctxt->modrm_rm; - modrm_ea += ctxt->regs[base_reg]; + modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); } switch (ctxt->modrm_mod) { @@ -1250,10 +1286,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (rc->pos == rc->end) { /* refill pio read ahead */ unsigned int in_page, n; unsigned int count = ctxt->rep_prefix ? - address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1; + address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; in_page = (ctxt->eflags & EFLG_DF) ? 
- offset_in_page(ctxt->regs[VCPU_REGS_RDI]) : - PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]); + offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : + PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, count); if (n == 0) @@ -1533,7 +1569,7 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) struct segmented_address addr; rsp_increment(ctxt, -bytes); - addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); + addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; return segmented_write(ctxt, addr, data, bytes); @@ -1552,7 +1588,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, int rc; struct segmented_address addr; - addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); + addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) @@ -1620,26 +1656,28 @@ static int em_enter(struct x86_emulate_ctxt *ctxt) int rc; unsigned frame_size = ctxt->src.val; unsigned nesting_level = ctxt->src2.val & 31; + ulong rbp; if (nesting_level) return X86EMUL_UNHANDLEABLE; - rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); + rbp = reg_read(ctxt, VCPU_REGS_RBP); + rc = push(ctxt, &rbp, stack_size(ctxt)); if (rc != X86EMUL_CONTINUE) return rc; - assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], + assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP), stack_mask(ctxt)); - assign_masked(&ctxt->regs[VCPU_REGS_RSP], - ctxt->regs[VCPU_REGS_RSP] - frame_size, + assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), + reg_read(ctxt, VCPU_REGS_RSP) - frame_size, stack_mask(ctxt)); return X86EMUL_CONTINUE; } static int em_leave(struct x86_emulate_ctxt *ctxt) { - assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], + assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP), stack_mask(ctxt)); - return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); + return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes); } static int em_push_sreg(struct x86_emulate_ctxt *ctxt) @@ -1667,13 +1705,13 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) static int em_pusha(struct x86_emulate_ctxt *ctxt) { - unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP]; + unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP); int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; while (reg <= VCPU_REGS_RDI) { (reg == VCPU_REGS_RSP) ? 
- (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]); + (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg)); rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) @@ -1702,7 +1740,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) --reg; } - rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes); + rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) break; --reg; @@ -1710,7 +1748,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) return rc; } -int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) +static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) { struct x86_emulate_ops *ops = ctxt->ops; int rc; @@ -1759,11 +1797,22 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) return rc; } +int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) +{ + int rc; + + invalidate_registers(ctxt); + rc = __emulate_int_real(ctxt, irq); + if (rc == X86EMUL_CONTINUE) + writeback_registers(ctxt); + return rc; +} + static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) { switch(ctxt->mode) { case X86EMUL_MODE_REAL: - return emulate_int_real(ctxt, irq); + return __emulate_int_real(ctxt, irq); case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: case X86EMUL_MODE_PROT32: @@ -1970,14 +2019,14 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) { u64 old = ctxt->dst.orig_val64; - if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) || - ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) { - ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0); - ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32); + if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) || + ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { + *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); + *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32); ctxt->eflags &= ~EFLG_ZF; } else { - ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) | - (u32) ctxt->regs[VCPU_REGS_RBX]; + ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) | + (u32) reg_read(ctxt, VCPU_REGS_RBX); ctxt->eflags |= EFLG_ZF; } @@ -2013,7 +2062,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) { /* Save real source value, then compare EAX against destination. */ ctxt->src.orig_val = ctxt->src.val; - ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; + ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); emulate_2op_SrcV(ctxt, "cmp"); if (ctxt->eflags & EFLG_ZF) { @@ -2022,7 +2071,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) } else { /* Failure: write the value we saw to EAX. */ ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; + ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); } return X86EMUL_CONTINUE; } @@ -2159,10 +2208,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip; + *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip; if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 - ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; + *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF; ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? 
@@ -2241,7 +2290,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) ctxt->_eip = msr_data; ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); - ctxt->regs[VCPU_REGS_RSP] = msr_data; + *reg_write(ctxt, VCPU_REGS_RSP) = msr_data; return X86EMUL_CONTINUE; } @@ -2291,8 +2340,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ctxt->_eip = ctxt->regs[VCPU_REGS_RDX]; - ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX]; + ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX); + *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX); return X86EMUL_CONTINUE; } @@ -2361,14 +2410,14 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, { tss->ip = ctxt->_eip; tss->flag = ctxt->eflags; - tss->ax = ctxt->regs[VCPU_REGS_RAX]; - tss->cx = ctxt->regs[VCPU_REGS_RCX]; - tss->dx = ctxt->regs[VCPU_REGS_RDX]; - tss->bx = ctxt->regs[VCPU_REGS_RBX]; - tss->sp = ctxt->regs[VCPU_REGS_RSP]; - tss->bp = ctxt->regs[VCPU_REGS_RBP]; - tss->si = ctxt->regs[VCPU_REGS_RSI]; - tss->di = ctxt->regs[VCPU_REGS_RDI]; + tss->ax = reg_read(ctxt, VCPU_REGS_RAX); + tss->cx = reg_read(ctxt, VCPU_REGS_RCX); + tss->dx = reg_read(ctxt, VCPU_REGS_RDX); + tss->bx = reg_read(ctxt, VCPU_REGS_RBX); + tss->sp = reg_read(ctxt, VCPU_REGS_RSP); + tss->bp = reg_read(ctxt, VCPU_REGS_RBP); + tss->si = reg_read(ctxt, VCPU_REGS_RSI); + tss->di = reg_read(ctxt, VCPU_REGS_RDI); tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); @@ -2384,14 +2433,14 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, ctxt->_eip = tss->ip; ctxt->eflags = tss->flag | 2; - ctxt->regs[VCPU_REGS_RAX] = tss->ax; - ctxt->regs[VCPU_REGS_RCX] = tss->cx; - ctxt->regs[VCPU_REGS_RDX] = tss->dx; - ctxt->regs[VCPU_REGS_RBX] = tss->bx; - ctxt->regs[VCPU_REGS_RSP] = tss->sp; - ctxt->regs[VCPU_REGS_RBP] = tss->bp; - ctxt->regs[VCPU_REGS_RSI] = tss->si; - ctxt->regs[VCPU_REGS_RDI] = tss->di; + *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax; + *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx; + *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx; + *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx; + *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp; + *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp; + *reg_write(ctxt, VCPU_REGS_RSI) = tss->si; + *reg_write(ctxt, VCPU_REGS_RDI) = tss->di; /* * SDM says that segment selectors are loaded before segment @@ -2476,14 +2525,14 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, tss->cr3 = ctxt->ops->get_cr(ctxt, 3); tss->eip = ctxt->_eip; tss->eflags = ctxt->eflags; - tss->eax = ctxt->regs[VCPU_REGS_RAX]; - tss->ecx = ctxt->regs[VCPU_REGS_RCX]; - tss->edx = ctxt->regs[VCPU_REGS_RDX]; - tss->ebx = ctxt->regs[VCPU_REGS_RBX]; - tss->esp = ctxt->regs[VCPU_REGS_RSP]; - tss->ebp = ctxt->regs[VCPU_REGS_RBP]; - tss->esi = ctxt->regs[VCPU_REGS_RSI]; - tss->edi = ctxt->regs[VCPU_REGS_RDI]; + tss->eax = reg_read(ctxt, VCPU_REGS_RAX); + tss->ecx = reg_read(ctxt, VCPU_REGS_RCX); + tss->edx = reg_read(ctxt, VCPU_REGS_RDX); + tss->ebx = reg_read(ctxt, VCPU_REGS_RBX); + tss->esp = reg_read(ctxt, VCPU_REGS_RSP); + tss->ebp = reg_read(ctxt, VCPU_REGS_RBP); + tss->esi = reg_read(ctxt, VCPU_REGS_RSI); + tss->edi = reg_read(ctxt, VCPU_REGS_RDI); tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); @@ -2505,14 +2554,14 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, ctxt->eflags = tss->eflags | 2; /* General purpose 
registers */ - ctxt->regs[VCPU_REGS_RAX] = tss->eax; - ctxt->regs[VCPU_REGS_RCX] = tss->ecx; - ctxt->regs[VCPU_REGS_RDX] = tss->edx; - ctxt->regs[VCPU_REGS_RBX] = tss->ebx; - ctxt->regs[VCPU_REGS_RSP] = tss->esp; - ctxt->regs[VCPU_REGS_RBP] = tss->ebp; - ctxt->regs[VCPU_REGS_RSI] = tss->esi; - ctxt->regs[VCPU_REGS_RDI] = tss->edi; + *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax; + *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx; + *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx; + *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx; + *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp; + *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp; + *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi; + *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi; /* * SDM says that segment selectors are loaded before segment @@ -2727,14 +2776,17 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, { int rc; + invalidate_registers(ctxt); ctxt->_eip = ctxt->eip; ctxt->dst.type = OP_NONE; rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); - if (rc == X86EMUL_CONTINUE) + if (rc == X86EMUL_CONTINUE) { ctxt->eip = ctxt->_eip; + writeback_registers(ctxt); + } return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } @@ -2744,8 +2796,8 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, { int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; - register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes); - op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]); + register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes); + op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg)); op->addr.mem.seg = seg; } @@ -2921,7 +2973,7 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt) { ctxt->dst.type = OP_REG; ctxt->dst.bytes = ctxt->src.bytes; - ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); return X86EMUL_CONTINUE; @@ -2932,8 +2984,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) u64 tsc = 0; ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); - ctxt->regs[VCPU_REGS_RAX] = (u32)tsc; - ctxt->regs[VCPU_REGS_RDX] = tsc >> 32; + *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc; + *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32; return X86EMUL_CONTINUE; } @@ -2941,10 +2993,10 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt) { u64 pmc; - if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc)) + if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc)) return emulate_gp(ctxt, 0); - ctxt->regs[VCPU_REGS_RAX] = (u32)pmc; - ctxt->regs[VCPU_REGS_RDX] = pmc >> 32; + *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc; + *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32; return X86EMUL_CONTINUE; } @@ -2986,9 +3038,9 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt) { u64 msr_data; - msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] - | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); - if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) + msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) + | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); + if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -2998,11 +3050,11 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt) { u64 msr_data; - if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) + if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data)) return emulate_gp(ctxt, 0); - ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; - 
ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; + *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data; + *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32; return X86EMUL_CONTINUE; } @@ -3182,8 +3234,8 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt) static int em_loop(struct x86_emulate_ctxt *ctxt) { - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); - if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) && + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); + if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) jmp_rel(ctxt, ctxt->src.val); @@ -3192,7 +3244,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt) static int em_jcxz(struct x86_emulate_ctxt *ctxt) { - if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) jmp_rel(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; @@ -3280,20 +3332,20 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt) { u32 eax, ebx, ecx, edx; - eax = ctxt->regs[VCPU_REGS_RAX]; - ecx = ctxt->regs[VCPU_REGS_RCX]; + eax = reg_read(ctxt, VCPU_REGS_RAX); + ecx = reg_read(ctxt, VCPU_REGS_RCX); ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); - ctxt->regs[VCPU_REGS_RAX] = eax; - ctxt->regs[VCPU_REGS_RBX] = ebx; - ctxt->regs[VCPU_REGS_RCX] = ecx; - ctxt->regs[VCPU_REGS_RDX] = edx; + *reg_write(ctxt, VCPU_REGS_RAX) = eax; + *reg_write(ctxt, VCPU_REGS_RBX) = ebx; + *reg_write(ctxt, VCPU_REGS_RCX) = ecx; + *reg_write(ctxt, VCPU_REGS_RDX) = edx; return X86EMUL_CONTINUE; } static int em_lahf(struct x86_emulate_ctxt *ctxt) { - ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; - ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; + *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL; + *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8; return X86EMUL_CONTINUE; } @@ -3450,7 +3502,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt) static int check_svme_pa(struct x86_emulate_ctxt *ctxt) { - u64 rax = ctxt->regs[VCPU_REGS_RAX]; + u64 rax = reg_read(ctxt, VCPU_REGS_RAX); /* Valid physical address? */ if (rax & 0xffff000000000000ULL) @@ -3472,7 +3524,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt) static int check_rdpmc(struct x86_emulate_ctxt *ctxt) { u64 cr4 = ctxt->ops->get_cr(ctxt, 4); - u64 rcx = ctxt->regs[VCPU_REGS_RCX]; + u64 rcx = reg_read(ctxt, VCPU_REGS_RCX); if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || (rcx > 3)) @@ -3930,7 +3982,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpAcc: op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - op->addr.reg = &ctxt->regs[VCPU_REGS_RAX]; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); fetch_register_operand(op); op->orig_val = op->val; break; @@ -3938,19 +3990,19 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 
1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); + register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI)); op->addr.mem.seg = VCPU_SREG_ES; op->val = 0; break; case OpDX: op->type = OP_REG; op->bytes = 2; - op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); fetch_register_operand(op); break; case OpCL: op->bytes = 1; - op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff; + op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff; break; case OpImmByte: rc = decode_imm(ctxt, op, 1, true); @@ -3981,7 +4033,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); + register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); op->addr.mem.seg = seg_override(ctxt); op->val = 0; break; @@ -4287,6 +4339,7 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); } + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; @@ -4371,7 +4424,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if (ctxt->rep_prefix && (ctxt->d & String)) { /* All REP prefixes have the same first termination condition */ - if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) { + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { ctxt->eip = ctxt->_eip; goto done; } @@ -4444,7 +4497,7 @@ special_insn: ctxt->dst.val = ctxt->src.addr.mem.ea; break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ - if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) + if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX)) break; rc = em_xchg(ctxt); break; @@ -4472,7 +4525,7 @@ special_insn: rc = em_grp2(ctxt); break; case 0xd2 ... 0xd3: /* Grp2 */ - ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; + ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX); rc = em_grp2(ctxt); break; case 0xe9: /* jmp rel */ @@ -4527,14 +4580,14 @@ writeback: if (ctxt->rep_prefix && (ctxt->d & String)) { struct read_cache *r = &ctxt->io_read; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); if (!string_insn_completed(ctxt)) { /* * Re-enter guest when pio read ahead buffer is empty * or, if it is not used, after each 1024 iteration. */ - if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) && + if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) && (r->end == 0 || r->end != r->pos)) { /* * Reset read cache. Usually happens before @@ -4542,6 +4595,7 @@ writeback: * we have to do it here. */ ctxt->mem_read.end = 0; + writeback_registers(ctxt); return EMULATION_RESTART; } goto done; /* skip rip writeback */ @@ -4556,6 +4610,9 @@ done: if (rc == X86EMUL_INTERCEPTED) return EMULATION_INTERCEPTED; + if (rc == X86EMUL_CONTINUE) + writeback_registers(ctxt); + return (rc == X86EMUL_UNHANDLEABLE) ? 
EMULATION_FAILED : EMULATION_OK; twobyte_insn: @@ -4628,3 +4685,13 @@ twobyte_insn: cannot_emulate: return EMULATION_FAILED; } + +void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt) +{ + invalidate_registers(ctxt); +} + +void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) +{ + writeback_registers(ctxt); +} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42bbf41..e00050c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4313,7 +4313,19 @@ static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); } +static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) +{ + return kvm_register_read(emul_to_vcpu(ctxt), reg); +} + +static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val) +{ + kvm_register_write(emul_to_vcpu(ctxt), reg, val); +} + static struct x86_emulate_ops emulate_ops = { + .read_gpr = emulator_read_gpr, + .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, .fetch = kvm_fetch_guest_virt, @@ -4348,14 +4360,6 @@ static struct x86_emulate_ops emulate_ops = { .get_cpuid = emulator_get_cpuid, }; -static void cache_all_regs(struct kvm_vcpu *vcpu) -{ - kvm_register_read(vcpu, VCPU_REGS_RAX); - kvm_register_read(vcpu, VCPU_REGS_RSP); - kvm_register_read(vcpu, VCPU_REGS_RIP); - vcpu->arch.regs_dirty = ~0; -} - static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); @@ -4382,12 +4386,10 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } -static void init_decode_cache(struct x86_emulate_ctxt *ctxt, - const unsigned long *regs) +static void init_decode_cache(struct x86_emulate_ctxt *ctxt) { memset(&ctxt->twobyte, 0, - (void *)&ctxt->regs - (void *)&ctxt->twobyte); - memcpy(ctxt->regs, regs, sizeof(ctxt->regs)); + (void *)&ctxt->_regs - (void *)&ctxt->twobyte); ctxt->fetch.start = 0; ctxt->fetch.end = 0; @@ -4402,14 +4404,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int cs_db, cs_l; - /* - * TODO: fix emulate.c to use guest_read/write_register - * instead of direct ->regs accesses, can save hundred cycles - * on Intel for instructions that don't read/change RSP, for - * for example. 
- */ - cache_all_regs(vcpu); - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); ctxt->eflags = kvm_get_rflags(vcpu); @@ -4421,7 +4415,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) X86EMUL_MODE_PROT16; ctxt->guest_mode = is_guest_mode(vcpu); - init_decode_cache(ctxt, vcpu->arch.regs); + init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } @@ -4441,7 +4435,6 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) return EMULATE_FAIL; ctxt->eip = ctxt->_eip; - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); @@ -4599,7 +4592,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, changes registers values during IO operation */ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { vcpu->arch.emulate_regs_need_sync_from_vcpu = false; - memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); + emulator_invalidate_register_cache(ctxt); } restart: @@ -4637,7 +4630,6 @@ restart: toggle_interruptibility(vcpu, ctxt->interruptibility); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); } else @@ -5591,8 +5583,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); + emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); @@ -5723,6 +5714,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; + unsigned reg; init_emulate_ctxt(vcpu); @@ -5732,7 +5724,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, if (ret) return EMULATE_FAIL; - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); -- cgit v1.1 From baa7e81e325bbb6ddebd7680ac1068859244b61d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:06:58 +0300 Subject: KVM: VMX: Separate saving pre-realmode state from setting segments Commit b246dd5df139 ("KVM: VMX: Fix KVM_SET_SREGS with big real mode segments") moved fix_rmode_seg() to vmx_set_segment(), so that it is applied not just on transitions to real mode, but also on KVM_SET_SREGS (migration). However fix_rmode_seg() not only munges the vmcs segments, it also sets up the save area for us to restore when returning to protected mode or to return in vmx_get_segment(). Move saving the segment into a new function, save_rmode_seg(), and call it just during the transition. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 13e0296..4e49caf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2768,7 +2768,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) return kvm->arch.tss_addr; } -static void fix_rmode_seg(int seg, struct kvm_save_segment *save) +static void save_rmode_seg(int seg, struct kvm_save_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -2776,6 +2776,12 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) save->base = vmcs_readl(sf->base); save->limit = vmcs_read32(sf->limit); save->ar = vmcs_read32(sf->ar_bytes); +} + +static void fix_rmode_seg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + vmcs_write16(sf->selector, save->base >> 4); vmcs_write32(sf->base, save->base & 0xffff0); vmcs_write32(sf->limit, 0xffff); @@ -2798,6 +2804,12 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; + save_rmode_seg(VCPU_SREG_TR, &vmx->rmode.tr); + save_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + save_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + save_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + save_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + /* * Very old userspace does not call KVM_SET_TSS_ADDR before entering * vcpu. Call it here with phys address pointing 16M below 4G. @@ -2812,14 +2824,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx_segment_cache_clear(vmx); - vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); - vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); - - vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - - vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); -- cgit v1.1 From 72fbefec26841699fee9ad0b050624aeb43d5bae Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:06:59 +0300 Subject: KVM: VMX: Fix incorrect lookup of segment S flag in fix_pmode_dataseg() fix_pmode_dataseg() looks up S in ->base instead of ->ar_bytes. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4e49caf..1d93079 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2693,11 +2693,11 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } -static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) +static void fix_pmode_dataseg(int seg, struct kvm_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { + if (vmcs_readl(sf->base) == save->base && (save->ar_bytes & AR_S_MASK)) { vmcs_write16(sf->selector, save->selector); vmcs_writel(sf->base, save->base); vmcs_write32(sf->limit, save->limit); -- cgit v1.1 From f5f7b2fe3bf849b58c8144729aba78b8e29e1e4c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:00 +0300 Subject: KVM: VMX: Use kvm_segment to save protected-mode segments when entering realmode Instead of using struct kvm_save_segment, use struct kvm_segment, which is what the other APIs use. 
This leads to some simplification. We replace save_rmode_seg() with a call to vmx_save_segment(). Since this depends on rmode.vm86_active, we move the call to before setting the flag. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 85 +++++++++++++++--------------------------------------- 1 file changed, 24 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1d93079..7e95ff6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -405,16 +405,16 @@ struct vcpu_vmx { struct { int vm86_active; ulong save_rflags; + struct kvm_segment segs[8]; + } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ struct kvm_save_segment { u16 selector; unsigned long base; u32 limit; u32 ar; - } tr, es, ds, fs, gs; - } rmode; - struct { - u32 bitmask; /* 4 bits per segment (1 bit per field) */ - struct kvm_save_segment seg[8]; + } seg[8]; } segment_cache; int vpid; bool emulation_required; @@ -2693,15 +2693,12 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } -static void fix_pmode_dataseg(int seg, struct kvm_segment *save) +static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - if (vmcs_readl(sf->base) == save->base && (save->ar_bytes & AR_S_MASK)) { - vmcs_write16(sf->selector, save->selector); - vmcs_writel(sf->base, save->base); - vmcs_write32(sf->limit, save->limit); - vmcs_write32(sf->ar_bytes, save->ar); + if (vmcs_readl(sf->base) == save->base && save->s) { + vmx_set_segment(vcpu, save, seg); } else { u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) << AR_DPL_SHIFT; @@ -2719,10 +2716,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx_segment_cache_clear(vmx); - vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); - vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); + vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; @@ -2737,10 +2731,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) if (emulate_invalid_guest_state) return; - fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); + fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); + fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); vmx_segment_cache_clear(vmx); @@ -2768,17 +2762,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) return kvm->arch.tss_addr; } -static void save_rmode_seg(int seg, struct kvm_save_segment *save) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - save->selector = vmcs_read16(sf->selector); - save->base = vmcs_readl(sf->base); - save->limit = vmcs_read32(sf->limit); - save->ar = vmcs_read32(sf->ar_bytes); -} - -static void fix_rmode_seg(int seg, struct kvm_save_segment *save) +static void fix_rmode_seg(int seg, struct kvm_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -2801,14 +2785,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu) if (enable_unrestricted_guest) return; + 
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; - save_rmode_seg(VCPU_SREG_TR, &vmx->rmode.tr); - save_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); - save_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); - save_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); - save_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); /* * Very old userspace does not call KVM_SET_TSS_ADDR before entering @@ -3118,7 +3103,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_save_segment *save; u32 ar; if (vmx->rmode.vm86_active @@ -3126,27 +3110,15 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS || seg == VCPU_SREG_GS) && !emulate_invalid_guest_state) { - switch (seg) { - case VCPU_SREG_TR: save = &vmx->rmode.tr; break; - case VCPU_SREG_ES: save = &vmx->rmode.es; break; - case VCPU_SREG_DS: save = &vmx->rmode.ds; break; - case VCPU_SREG_FS: save = &vmx->rmode.fs; break; - case VCPU_SREG_GS: save = &vmx->rmode.gs; break; - default: BUG(); - } - var->selector = save->selector; - var->base = save->base; - var->limit = save->limit; - ar = save->ar; + *var = vmx->rmode.segs[seg]; if (seg == VCPU_SREG_TR || var->selector == vmx_read_guest_seg_selector(vmx, seg)) - goto use_saved_rmode_seg; + return; } var->base = vmx_read_guest_seg_base(vmx, seg); var->limit = vmx_read_guest_seg_limit(vmx, seg); var->selector = vmx_read_guest_seg_selector(vmx, seg); ar = vmx_read_guest_seg_ar(vmx, seg); -use_saved_rmode_seg: if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; var->type = ar & 15; @@ -3235,10 +3207,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { vmcs_write16(sf->selector, var->selector); - vmx->rmode.tr.selector = var->selector; - vmx->rmode.tr.base = var->base; - vmx->rmode.tr.limit = var->limit; - vmx->rmode.tr.ar = vmx_segment_access_rights(var); + vmx->rmode.segs[VCPU_SREG_TR] = *var; return; } vmcs_writel(sf->base, var->base); @@ -3289,16 +3258,10 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_readl(GUEST_CS_BASE) >> 4); break; case VCPU_SREG_ES: - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); - break; case VCPU_SREG_DS: - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); - break; case VCPU_SREG_GS: - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); - break; case VCPU_SREG_FS: - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + fix_rmode_seg(seg, &vmx->rmode.segs[seg]); break; case VCPU_SREG_SS: vmcs_write16(GUEST_SS_SELECTOR, -- cgit v1.1 From c865c43de66dc973865bda337022f03b6e16c8df Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:01 +0300 Subject: KVM: VMX: Retain limit and attributes when entering protected mode Real processors don't change segment limits and attributes while in real mode. Mimic that behaviour. 
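A minimal user-space sketch of the behaviour being mimicked (illustrative only, not part of this patch; the struct and helper below are hypothetical stand-ins, not KVM code): on real hardware a segment register load in real mode rewrites only the selector and the hidden base, so a limit and attributes installed earlier in protected mode survive, which is exactly what big real mode ("unreal mode") relies on.

    #include <stdint.h>
    #include <stdio.h>

    struct seg_cache {              /* hypothetical model of the hidden descriptor cache */
            uint16_t selector;
            uint32_t base;
            uint32_t limit;
            uint16_t attrs;
    };

    /* What a real CPU does for "mov ds, ax" while CR0.PE = 0. */
    static void realmode_load(struct seg_cache *seg, uint16_t sel)
    {
            seg->selector = sel;
            seg->base = (uint32_t)sel << 4;
            /* limit and attrs are deliberately left untouched */
    }

    int main(void)
    {
            /* State left behind by protected mode: 4GB limit, 32-bit data segment. */
            struct seg_cache ds = { 0x10, 0x0, 0xffffffff, 0xc093 };

            realmode_load(&ds, 0x1000);
            printf("sel=%#x base=%#x limit=%#x attrs=%#x\n",
                   (unsigned)ds.selector, (unsigned)ds.base,
                   (unsigned)ds.limit, (unsigned)ds.attrs);
            /* prints base=0x10000 with limit still 0xffffffff */
            return 0;
    }

The emulator-side patch later in this series ("Leave segment limit and attributs alone in real mode") applies the same rule when loading a segment descriptor in real mode.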
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7e95ff6..88eeb40 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2696,14 +2696,14 @@ static __exit void hardware_unsetup(void) static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct kvm_segment tmp = *save; - if (vmcs_readl(sf->base) == save->base && save->s) { - vmx_set_segment(vcpu, save, seg); - } else { - u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) - << AR_DPL_SHIFT; - vmcs_write32(sf->ar_bytes, 0x93 | dpl); + if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { + tmp.base = vmcs_readl(sf->base); + tmp.selector = vmcs_read16(sf->selector); + tmp.s = 1; } + vmx_set_segment(vcpu, &tmp, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) -- cgit v1.1 From 495e116684cebc5ae625916aba37fc07f345707b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:02 +0300 Subject: KVM: VMX: Allow real mode emulation using vm86 with dpl=0 Real mode is always entered from protected mode with dpl=0. Since the dpl doesn't affect execution, and we already override it to 3 in the vmcs (as vmx requires), we can allow execution in that state. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 88eeb40..4811d91 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3317,7 +3317,7 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) return false; if (var.limit != 0xffff) return false; - if (ar != 0xf3) + if ((ar | (3 << AR_DPL_SHIFT)) != 0xf3) return false; return true; -- cgit v1.1 From e2a610d7fc3e285af8061ff071761752255d95f6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:03 +0300 Subject: KVM: VMX: Allow vm86 virtualization of big real mode Usually, big real mode uses large (4GB) segments. Currently we don't virtualize this; if any segment has a limit other than 0xffff, we emulate. But if we set the vmx-visible limit to 0xffff, we can use vm86 to virtualize real mode; if an access overruns the segment limit, the guest will #GP, which we will trap and forward to the emulator. This results in significantly faster execution, and less risk of hitting an unemulated instruction. If the limit is less than 0xffff, we retain the existing behaviour. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4811d91..fd21eb4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3315,7 +3315,7 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) if (var.base != (var.selector << 4)) return false; - if (var.limit != 0xffff) + if (var.limit < 0xffff) return false; if ((ar | (3 << AR_DPL_SHIFT)) != 0xf3) return false; -- cgit v1.1 From 03ebebeb1ff5d1d6209fd8df4ffc9204df82bd55 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:04 +0300 Subject: KVM: x86 emulator: Leave segment limit and attributs alone in real mode When loading a segment in real mode, only the base and selector must be modified. 
The limit needs to be left alone, otherwise big real mode users will hit a #GP due to limit checking (currently this is suppressed because we don't check limits in real mode). Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5e27ba5..f8b27cd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1388,19 +1388,15 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ ulong desc_addr; int ret; + u16 dummy; memset(&seg_desc, 0, sizeof seg_desc); if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) || ctxt->mode == X86EMUL_MODE_REAL) { /* set real mode segment descriptor */ + ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg); set_desc_base(&seg_desc, selector << 4); - set_desc_limit(&seg_desc, 0xffff); - seg_desc.type = 3; - seg_desc.p = 1; - seg_desc.s = 1; - if (ctxt->mode == X86EMUL_MODE_VM86) - seg_desc.dpl = 3; goto load; } -- cgit v1.1 From a5625189f6810ef79ced53989c794acfa10d3370 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:05 +0300 Subject: KVM: x86 emulator: Check segment limits in real mode too Segment limits are verified in real mode, not just protected mode. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f8b27cd..5b1c701 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -668,8 +668,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, la = seg_base(ctxt, addr.seg) + addr.ea; switch (ctxt->mode) { - case X86EMUL_MODE_REAL: - break; case X86EMUL_MODE_PROT64: if (((signed long)la << 16) >> 16 != la) return emulate_gp(ctxt, 0); @@ -699,7 +697,10 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, goto bad; } cpl = ctxt->ops->cpl(ctxt); - rpl = sel & 3; + if (ctxt->mode == X86EMUL_MODE_REAL) + rpl = 0; + else + rpl = sel & 3; cpl = max(cpl, rpl); if (!(desc.type & 8)) { /* data segment */ -- cgit v1.1 From 0afbe2f8781a812c7e501ec129eff45b21f792af Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:06 +0300 Subject: KVM: x86 emulator: Fix #GP error code during linearization We want the segment selector, nor segment number. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5b1c701..1451cff 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -725,9 +725,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; bad: if (addr.seg == VCPU_SREG_SS) - return emulate_ss(ctxt, addr.seg); + return emulate_ss(ctxt, sel); else - return emulate_gp(ctxt, addr.seg); + return emulate_gp(ctxt, sel); } static int linearize(struct x86_emulate_ctxt *ctxt, -- cgit v1.1 From 726364202853f843a97a8ba4a7c3cd91e3aa84b7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:07 +0300 Subject: KVM: VMX: Return real real-mode segment data even if emulate_invalid_guest_state=1 emulate_invalid_guest_state=1 doesn't mean we don't munge the segments in the vmcs; we do. So we need to return the real ones (maintained by vmx_set_segment). 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fd21eb4..0d68726 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3108,8 +3108,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, if (vmx->rmode.vm86_active && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS - || seg == VCPU_SREG_GS) - && !emulate_invalid_guest_state) { + || seg == VCPU_SREG_GS)) { *var = vmx->rmode.segs[seg]; if (seg == VCPU_SREG_TR || var->selector == vmx_read_guest_seg_selector(vmx, seg)) -- cgit v1.1 From 1390a28b274e2e45f89bac67c435cbcbc5cc0790 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:08 +0300 Subject: KVM: VMX: Preserve segment limit and access rights in real mode While this is undocumented, real processors do not reload the segment limit and access rights when loading a segment register in real mode. Real programs rely on it so we need to comply with this behaviour. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0d68726..6e6421a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3113,6 +3113,9 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, if (seg == VCPU_SREG_TR || var->selector == vmx_read_guest_seg_selector(vmx, seg)) return; + var->base = vmx_read_guest_seg_base(vmx, seg); + var->selector = vmx_read_guest_seg_selector(vmx, seg); + return; } var->base = vmx_read_guest_seg_base(vmx, seg); var->limit = vmx_read_guest_seg_limit(vmx, seg); -- cgit v1.1 From ce5668034752e52e995c8dc625337380099a088a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:09 +0300 Subject: KVM: VMX: Save all segment data in real mode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6e6421a..62b2d0c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3216,6 +3216,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); if (vmx->rmode.vm86_active && var->s) { + vmx->rmode.segs[seg] = *var; /* * Hack real-mode segments into vm86 compatibility. */ -- cgit v1.1 From a81aba14dc0ea499f4c218b5db0303b2ea8151d3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 21 Aug 2012 17:07:10 +0300 Subject: KVM: VMX: Ignore segment G and D bits when considering whether we can virtualize We will enter the guest with G and D cleared; as real hardware ignores D in real mode, and G is taken care of by the limit test, we allow more code to run in vm86 mode. 
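To make the arithmetic concrete, here is a stand-alone sketch of the rmode_segment_valid() access-rights test before and after this change (illustrative only; the mask values are restated locally under the assumption of the standard VMCS access-rights layout, type in bits 0-3, S in bit 4, DPL in bits 5-6, P in bit 7, D/B in bit 14, G in bit 15, and are not copied from the KVM headers):

    #include <stdint.h>
    #include <stdio.h>

    #define AR_DPL_SHIFT    5
    #define AR_DB_MASK      (1u << 14)
    #define AR_G_MASK       (1u << 15)

    /* Old test: accept only ar == 0xf3 (P=1, DPL forced to 3 by the OR, S=1, type 3). */
    static int vm86_ok_old(uint32_t ar)
    {
            return (ar | (3 << AR_DPL_SHIFT)) == 0xf3;
    }

    /* New test: additionally ignore the G and D/B bits. */
    static int vm86_ok_new(uint32_t ar)
    {
            return ((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) == 0xf3;
    }

    int main(void)
    {
            /* A flat 32-bit data segment: G=1, D/B=1, P=1, S=1, type 3. */
            uint32_t ar = AR_G_MASK | AR_DB_MASK | 0x93;

            printf("old: %d  new: %d\n", vm86_ok_old(ar), vm86_ok_new(ar));
            /* old: 0 (forces emulation), new: 1 (can run under vm86) */
            return 0;
    }

Such a flat data segment fails the old test purely because of G and D/B; with those bits masked it is accepted and more guest code stays on the vm86 fast path.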
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 62b2d0c..248c2b4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3320,7 +3320,7 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) return false; if (var.limit < 0xffff) return false; - if ((ar | (3 << AR_DPL_SHIFT)) != 0xf3) + if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3) return false; return true; -- cgit v1.1 From 9a7819774e4236e8736a074b7e85276967911924 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 30 Aug 2012 17:45:54 -0300 Subject: KVM: x86: remove unused variable from kvm_task_switch() Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e00050c..20f2266 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5714,7 +5714,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; - unsigned reg; init_emulate_ctxt(vcpu); -- cgit v1.1 From e09a8417823992700c063f9b8f85119f2d9016ed Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Tue, 10 Jul 2012 15:31:30 -0700 Subject: hpet: Remove unused PCI Vendor ID #define HPET_ID_VENDOR_8086 is defined but never used. It would be a redefine of PCI_VENDOR_ID_INTEL if it was ever used. Signed-off-by: Jon Mason Signed-off-by: Jiri Kosina --- arch/x86/include/asm/hpet.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 2c392d6..434e210 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -35,8 +35,6 @@ #define HPET_ID_NUMBER_SHIFT 8 #define HPET_ID_VENDOR_SHIFT 16 -#define HPET_ID_VENDOR_8086 0x8086 - #define HPET_CFG_ENABLE 0x001 #define HPET_CFG_LEGACY 0x002 #define HPET_LEGACY_8254 2 -- cgit v1.1 From ec798660cf72c981ad8eed272487a0fe2b3222f2 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 3 Sep 2012 14:47:25 +0300 Subject: KVM: cleanup pic reset kvm_pic_reset() is not used anywhere. Move reset logic from pic_ioport_write() there. 
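For reference, the command byte that pic_ioport_write() recognises with (val & 0x10) follows the standard 8259A ICW1 encoding; a tiny stand-alone decoder (illustrative only, the macro names are the datasheet's terms rather than KVM's):

    #include <stdint.h>
    #include <stdio.h>

    /* Standard 8259A ICW1 bits; bit 4 set marks a command-port write as ICW1. */
    #define ICW1_MARK   0x10    /* what pic_ioport_write() tests for          */
    #define ICW1_IC4    0x01    /* an ICW4 will follow (stored as s->init4)   */
    #define ICW1_SNGL   0x02    /* single mode (unsupported, warned about)    */
    #define ICW1_LTIM   0x08    /* level-triggered mode (unsupported)         */

    static void decode_icw1(uint8_t val)
    {
            if (!(val & ICW1_MARK)) {
                    printf("%#x: not an ICW1 (OCW2/OCW3 when bit 4 is clear)\n", val);
                    return;
            }
            printf("%#x: ICW1, ic4=%d single=%d level=%d -> controller reset\n",
                   val, !!(val & ICW1_IC4), !!(val & ICW1_SNGL), !!(val & ICW1_LTIM));
    }

    int main(void)
    {
            decode_icw1(0x11);  /* typical init: ICW1 + IC4, edge, cascade */
            decode_icw1(0x0a);  /* bit 4 clear, so not an ICW1 */
            return 0;
    }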
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 52 +++++++++++----------------------------------------- 1 file changed, 11 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 90c84f9..848206d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -275,23 +275,20 @@ void kvm_pic_reset(struct kvm_kpic_state *s) { int irq, i; struct kvm_vcpu *vcpu; - u8 irr = s->irr, isr = s->imr; + u8 edge_irr = s->irr & ~s->elcr; bool found = false; s->last_irr = 0; - s->irr = 0; + s->irr &= s->elcr; s->imr = 0; - s->isr = 0; s->priority_add = 0; - s->irq_base = 0; - s->read_reg_select = 0; - s->poll = 0; s->special_mask = 0; - s->init_state = 0; - s->auto_eoi = 0; - s->rotate_on_auto_eoi = 0; - s->special_fully_nested_mode = 0; - s->init4 = 0; + s->read_reg_select = 0; + if (!s->init4) { + s->special_fully_nested_mode = 0; + s->auto_eoi = 0; + } + s->init_state = 1; kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) if (kvm_apic_accept_pic_intr(vcpu)) { @@ -304,7 +301,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) return; for (irq = 0; irq < PIC_NUM_PINS/2; irq++) - if (irr & (1 << irq) || isr & (1 << irq)) + if (edge_irr & (1 << irq)) pic_clear_isr(s, irq); } @@ -316,40 +313,13 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) addr &= 1; if (addr == 0) { if (val & 0x10) { - u8 edge_irr = s->irr & ~s->elcr; - int i; - bool found; - struct kvm_vcpu *vcpu; - s->init4 = val & 1; - s->last_irr = 0; - s->irr &= s->elcr; - s->imr = 0; - s->priority_add = 0; - s->special_mask = 0; - s->read_reg_select = 0; - if (!s->init4) { - s->special_fully_nested_mode = 0; - s->auto_eoi = 0; - } - s->init_state = 1; if (val & 0x02) pr_pic_unimpl("single mode not supported"); if (val & 0x08) pr_pic_unimpl( - "level sensitive irq not supported"); - - kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) - if (kvm_apic_accept_pic_intr(vcpu)) { - found = true; - break; - } - - - if (found) - for (irq = 0; irq < PIC_NUM_PINS/2; irq++) - if (edge_irr & (1 << irq)) - pic_clear_isr(s, irq); + "level sensitive irq not supported"); + kvm_pic_reset(s); } else if (val & 0x08) { if (val & 0x04) s->poll = 1; -- cgit v1.1 From 749c59fd15b2c18dd6c15c353a899fb6ac49b865 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Thu, 30 Aug 2012 11:32:13 +0100 Subject: KVM: PIC: fix use of uninitialised variable. Commit aea218f3cbbc (KVM: PIC: call ack notifiers for irqs that are dropped form irr) used an uninitialised variable to track whether an appropriate apic had been found. This could result in calling the ack notifier incorrectly. Cc: Gleb Natapov Cc: Avi Kivity Signed-off-by: Jamie Iles Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index e498b18..9fc9aa7 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -318,7 +318,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) if (val & 0x10) { u8 edge_irr = s->irr & ~s->elcr; int i; - bool found; + bool found = false; struct kvm_vcpu *vcpu; s->init4 = val & 1; -- cgit v1.1 From 3ec18cd8b8f8395d0df604c62ab3bc2cf3a966b4 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 20 Aug 2012 11:24:21 +0200 Subject: perf/x86: Enable Intel Cedarview Atom suppport This patch enables perf_events support for Intel Cedarview Atom (model 54) processors. Support includes PEBS and LBR. Tested on my Atom N2600 netbook. 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120820092421.GA11284@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 1 + arch/x86/kernel/cpu/perf_event_intel_lbr.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7f2739e..0d3d63a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2008,6 +2008,7 @@ __init int intel_pmu_init(void) break; case 28: /* Atom */ + case 54: /* Cedariew */ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 520b426..da02e9c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -686,7 +686,8 @@ void intel_pmu_lbr_init_atom(void) * to have an operational LBR which can freeze * on PMU interrupt */ - if (boot_cpu_data.x86_mask < 10) { + if (boot_cpu_data.x86_model == 28 + && boot_cpu_data.x86_mask < 10) { pr_cont("LBR disabled due to erratum"); return; } -- cgit v1.1 From 406eae5d70c1a849f4a050f708a459c2349e9dda Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 2 Sep 2012 22:20:56 -0700 Subject: x86/Kconfig: Update defconfigs to current results of "make savedefconfig" The x86 defconfigs have become somewhat out of date compared to the current result of "make savedefconfig". Update them to the current output, as a prelude to further defconfig changes, to avoid unrelated noise in those further changes. Signed-off-by: Josh Triplett Cc: Randy Dunlap Cc: Suresh Siddha Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/80c8a5fbeaf6cdb72fb78a016013427efee52668.1346649518.git.josh@joshtriplett.org Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 12 ++++-------- arch/x86/configs/x86_64_defconfig | 12 ++++-------- 2 files changed, 8 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 119db67..1903408 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -8,6 +8,8 @@ CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y CONFIG_CGROUP_FREEZER=y @@ -34,8 +36,6 @@ CONFIG_SGI_PARTITION=y CONFIG_SUN_PARTITION=y CONFIG_KARMA_PARTITION=y CONFIG_EFI_PARTITION=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y CONFIG_SMP=y CONFIG_X86_GENERIC=y CONFIG_HPET_TIMER=y @@ -231,8 +231,6 @@ CONFIG_SND_HRTIMER=y CONFIG_SND_HDA_INTEL=y CONFIG_SND_HDA_HWDEP=y CONFIG_HIDRAW=y -CONFIG_HID_PID=y -CONFIG_USB_HIDDEV=y CONFIG_HID_GYRATION=y CONFIG_LOGITECH_FF=y CONFIG_HID_NTRIG=y @@ -243,11 +241,11 @@ CONFIG_HID_SAMSUNG=y CONFIG_HID_SONY=y CONFIG_HID_SUNPLUS=y CONFIG_HID_TOPSEED=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -CONFIG_USB_DEVICEFS=y -# CONFIG_USB_DEVICE_CLASS is not set CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_TT_NEWSCHED is not set @@ -280,7 +278,6 @@ CONFIG_PROC_KCORE=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_NFS_FS=y -CONFIG_NFS_V3=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y @@ -299,7 +296,6 @@ CONFIG_DEBUG_KERNEL=y CONFIG_SCHEDSTATS=y 
CONFIG_TIMER_STATS=y CONFIG_DEBUG_STACK_USAGE=y -CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 76eb290..c2c0448 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -8,6 +8,8 @@ CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y CONFIG_CGROUP_FREEZER=y @@ -34,8 +36,6 @@ CONFIG_SGI_PARTITION=y CONFIG_SUN_PARTITION=y CONFIG_KARMA_PARTITION=y CONFIG_EFI_PARTITION=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y CONFIG_SMP=y CONFIG_CALGARY_IOMMU=y CONFIG_NR_CPUS=64 @@ -227,8 +227,6 @@ CONFIG_SND_HRTIMER=y CONFIG_SND_HDA_INTEL=y CONFIG_SND_HDA_HWDEP=y CONFIG_HIDRAW=y -CONFIG_HID_PID=y -CONFIG_USB_HIDDEV=y CONFIG_HID_GYRATION=y CONFIG_LOGITECH_FF=y CONFIG_HID_NTRIG=y @@ -239,11 +237,11 @@ CONFIG_HID_SAMSUNG=y CONFIG_HID_SONY=y CONFIG_HID_SUNPLUS=y CONFIG_HID_TOPSEED=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_DEBUG=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -CONFIG_USB_DEVICEFS=y -# CONFIG_USB_DEVICE_CLASS is not set CONFIG_USB_MON=y CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_TT_NEWSCHED is not set @@ -280,7 +278,6 @@ CONFIG_PROC_KCORE=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_NFS_FS=y -CONFIG_NFS_V3=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y @@ -298,7 +295,6 @@ CONFIG_DEBUG_KERNEL=y CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y CONFIG_DEBUG_STACK_USAGE=y -CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y -- cgit v1.1 From 3fe2cb8f9e9486980ff03d7e020781cfdb028ffa Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 2 Sep 2012 22:21:05 -0700 Subject: x86/Kconfig: Switch to ext4 in defconfigs The current x86 and x86-64 defconfigs do not enable ext4, which most current distributions default to. Switch the defconfigs to ext4, so they will boot on current systems without additional configuration. 
Signed-off-by: Josh Triplett Cc: Randy Dunlap Cc: Suresh Siddha Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bd8a359506b7e1287c680823de16d67608ec52fe.1346649518.git.josh@joshtriplett.org Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 7 +++---- arch/x86/configs/x86_64_defconfig | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 1903408..2701b8a 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -260,10 +260,9 @@ CONFIG_RTC_CLASS=y CONFIG_DMADEVICES=y CONFIG_EEEPC_LAPTOP=y CONFIG_EFI_VARS=y -CONFIG_EXT3_FS=y -# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index c2c0448..c17614e 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -260,10 +260,9 @@ CONFIG_AMD_IOMMU_STATS=y CONFIG_INTEL_IOMMU=y # CONFIG_INTEL_IOMMU_DEFAULT_ON is not set CONFIG_EFI_VARS=y -CONFIG_EXT3_FS=y -# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set -- cgit v1.1 From b92f885f31db782608c8b73942111238020b1f4a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 2 Sep 2012 22:21:13 -0700 Subject: x86/Kconfig: Disable CONFIG_CRC_T10DIF in defconfigs CONFIG_CRC_T10DIF explicitly states that it exists only for use by out-of-tree modules; anything in-kernel that needs it selects it. Thus, compile it out by default. Signed-off-by: Josh Triplett Cc: Randy Dunlap Cc: Suresh Siddha Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3aaff7a0af1320427952d411a21b8ded29747a1f.1346649518.git.josh@joshtriplett.org Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 2701b8a..d833bb6 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -311,4 +311,3 @@ CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_CRYPTO_AES_586=y # CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_CRC_T10DIF=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index c17614e..7ddcd99 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -309,4 +309,3 @@ CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_DISABLE=y # CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_CRC_T10DIF=y -- cgit v1.1 From be2f328d8616c81b86e4013974608776c317bbeb Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 2 Sep 2012 22:21:21 -0700 Subject: x86/Kconfig: Turn off CONFIG_BLK_DEV_RAM The vast majority of systems either use initramfs or mount a root filesystem directly from the kernel. Distros have defaulted to initramfs for years. 
Only highly specialized systems would use an actual filesystem-image initrd at this point, and such systems don't rely on defconfig anyway. Drop initrd support (and specifically RAM block device support) from the defconfigs. Signed-off-by: Josh Triplett Cc: Randy Dunlap Cc: Suresh Siddha Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2521e983a63595cd7a331236d929577660f89c72.1346649518.git.josh@joshtriplett.org Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 2 -- arch/x86/configs/x86_64_defconfig | 2 -- 2 files changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index d833bb6..a6533a2 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -144,8 +144,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_BLK_DEV_SD=y CONFIG_BLK_DEV_SR=y CONFIG_BLK_DEV_SR_VENDOR=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 7ddcd99..18f3cc4 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -144,8 +144,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_BLK_DEV_SD=y CONFIG_BLK_DEV_SR=y CONFIG_BLK_DEV_SR_VENDOR=y -- cgit v1.1 From c206b9dcb1fc41b104db43e98f9b83a8c0c559ae Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 2 Sep 2012 22:21:29 -0700 Subject: x86/Kconfig: Turn off DEBUG_NX_TEST module in defconfigs The x86 defconfigs include exactly one module: test_nx.ko, a special-purpose module which just exists to do evil things like executing code off the stack to see if the kernel has enabled NX support. Anyone who actually uses that module can easily enable it themselves, but the vast majority of kernel builds don't need it; disable it by default. 
Signed-off-by: Josh Triplett Cc: Randy Dunlap Cc: Suresh Siddha Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Arjan van de Ven Link: http://lkml.kernel.org/r/e72faf875e1172fb1cbec5e6d3cd4122df508a97.1346649518.git.josh@joshtriplett.org Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index a6533a2..5598547 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -298,7 +298,6 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set -CONFIG_DEBUG_NX_TEST=m CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y CONFIG_KEYS_DEBUG_PROC_KEYS=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 18f3cc4..671524d 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -297,7 +297,6 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set -CONFIG_DEBUG_NX_TEST=m CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y CONFIG_KEYS_DEBUG_PROC_KEYS=y -- cgit v1.1 From f00026276ace77dcad1cdf17f696ae4e56e12ee6 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 2 Sep 2012 23:31:40 +0200 Subject: x86: Fix __user annotations in asm/sys_ia32.h Fix the following sparse warnings: sys_ia32.c:293:38: warning: incorrect type in argument 2 (different address spaces) sys_ia32.c:293:38: expected unsigned int [noderef] [usertype] *stat_addr sys_ia32.c:293:38: got unsigned int *stat_addr Ironically, sys_ia32.h was introduced to fix sparse warnings but missed that one. Signed-off-by: Mathias Krause Link: http://lkml.kernel.org/r/1346621506-30857-2-git-send-email-minipli@googlemail.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/include/asm/sys_ia32.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 4540bec..c5b938d 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -287,7 +287,7 @@ asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act, return ret; } -asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, +asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int __user *stat_addr, int options) { return compat_sys_wait4(pid, stat_addr, options, NULL); diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 3fda9db4..4ca1c61 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -40,7 +40,7 @@ asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *, struct old_sigaction32 __user *); asmlinkage long sys32_alarm(unsigned int); -asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int); +asmlinkage long sys32_waitpid(compat_pid_t, unsigned int __user *, int); asmlinkage long sys32_sysfs(int, u32, u32); asmlinkage long sys32_sched_rr_get_interval(compat_pid_t, -- cgit v1.1 From 3d1334064fb365ea8f299874c2b4c46de2bee74d Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 2 Sep 2012 23:31:41 +0200 Subject: x86/vdso: Add __user annotation to VDSO32_SYMBOL The address calculated by VDSO32_SYMBOL() is a pointer into userland. 
Add the __user annotation to fix related sparse warnings in its users. Signed-off-by: Mathias Krause Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1346621506-30857-3-git-send-email-minipli@googlemail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/vdso.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index bb05228..fddb53d 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -11,7 +11,8 @@ extern const char VDSO32_PRELINK[]; #define VDSO32_SYMBOL(base, name) \ ({ \ extern const char VDSO32_##name[]; \ - (void *)(VDSO32_##name - VDSO32_PRELINK + (unsigned long)(base)); \ + (void __user *)(VDSO32_##name - VDSO32_PRELINK + \ + (unsigned long)(base)); \ }) #endif -- cgit v1.1 From 0ff8fef4eaf252ee13a2d0b175a8c876415bd62a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 2 Sep 2012 23:31:42 +0200 Subject: x86/signals: ia32_signal.c: add __user casts to fix sparse warnings Fix the following sparse warnings by adding appropriate __user casts and annotations: ia32_signal.c:165:38: warning: incorrect type in argument 1 (different address spaces) ia32_signal.c:165:38: expected struct sigaltstack const [noderef] [usertype] * ia32_signal.c:165:38: got struct sigaltstack * [...] Signed-off-by: Mathias Krause Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/1346621506-30857-4-git-send-email-minipli@googlemail.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 673ac9b..452d4dd 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -162,7 +162,8 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, } seg = get_fs(); set_fs(KERNEL_DS); - ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp); + ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), + (stack_t __force __user *) &uoss, regs->sp); set_fs(seg); if (ret >= 0 && uoss_ptr) { if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t))) @@ -361,7 +362,7 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, */ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, - void **fpstate) + void __user **fpstate) { unsigned long sp; @@ -382,7 +383,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, if (used_math()) { sp = sp - sig_xstate_ia32_size; - *fpstate = (struct _fpstate_ia32 *) sp; + *fpstate = (struct _fpstate_ia32 __user *) sp; if (save_i387_xstate_ia32(*fpstate) < 0) return (void __user *) -1L; } @@ -448,7 +449,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, * These are actually not used anymore, but left because some * gdb versions depend on them as a marker. */ - put_user_ex(*((u64 *)&code), (u64 *)frame->retcode); + put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); } put_user_catch(err); if (err) @@ -529,7 +530,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * Not actually used anymore, but left because some gdb * versions need it. 
*/ - put_user_ex(*((u64 *)&code), (u64 *)frame->retcode); + put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); } put_user_catch(err); if (err) -- cgit v1.1 From 04d695a6828bca54d53305246545cd1f8a841ac6 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 2 Sep 2012 23:31:43 +0200 Subject: x86/pci/probe_roms: Add missing __iomem annotation to pci_map_biosrom() Stay in sync with the declaration and fix the corresponding sparse warnings. Signed-off-by: Mathias Krause Cc: Dan Williams Link: http://lkml.kernel.org/r/1346621506-30857-5-git-send-email-minipli@googlemail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/probe_roms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 0bc72e2..d5f15c3 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -150,7 +150,7 @@ static struct resource *find_oprom(struct pci_dev *pdev) return oprom; } -void *pci_map_biosrom(struct pci_dev *pdev) +void __iomem *pci_map_biosrom(struct pci_dev *pdev) { struct resource *oprom = find_oprom(pdev); -- cgit v1.1 From 5c7d03e99cb1ed449328ed9fba0c632944d39e7e Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 2 Sep 2012 23:31:44 +0200 Subject: x86/fpu/xsave: Keep __user annotation in casts Don't remove the __user annotation of the fpstate pointer, but drop the superfluous void * cast instead. This fixes the following sparse warnings: xsave.c:135:15: warning: cast removes address space of expression xsave.c:135:15: warning: incorrect type in argument 1 (different address spaces) xsave.c:135:15: expected void const volatile [noderef] * [...] Signed-off-by: Mathias Krause Cc: Suresh Siddha Link: http://lkml.kernel.org/r/1346621506-30857-6-git-send-email-minipli@googlemail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/xsave.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 3d3e207..9e1a8a7 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -132,9 +132,9 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, fx_sw_user->xstate_size > fx_sw_user->extended_size) return -EINVAL; - err = __get_user(magic2, (__u32 *) (((void *)fpstate) + - fx_sw_user->extended_size - - FP_XSTATE_MAGIC2_SIZE)); + err = __get_user(magic2, (__u32 __user *) (fpstate + + fx_sw_user->extended_size - + FP_XSTATE_MAGIC2_SIZE)); if (err) return err; /* -- cgit v1.1 From 2b11afd1ab502d959ae8d6d5812923151b5bc505 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 2 Sep 2012 23:31:45 +0200 Subject: x86/iommu: Drop duplicate const in __IOMMU_INIT It's redundant and makes sparse complain about it. 
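For readers unfamiliar with the warning: a minimal stand-alone illustration of the duplicated qualifier (not part of the patch; the struct and variable names are made up, checked with "sparse dup-const.c"):

	/* dup-const.c -- illustrative sketch only */
	struct iommu_table_entry { int flags; };

	/* qualifier appears twice; sparse reports "duplicate const" */
	static const struct iommu_table_entry const dup = { 0 };

	/* fixed form keeps a single qualifier, same meaning to the compiler */
	static const struct iommu_table_entry fixed = { 0 };

	int read_flags(void) { return dup.flags + fixed.flags; }
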
Signed-off-by: Mathias Krause Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1346621506-30857-7-git-send-email-minipli@googlemail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/iommu_table.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index f229b13..bbf8fb2e 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -48,7 +48,7 @@ struct iommu_table_entry { #define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\ - static const struct iommu_table_entry const \ + static const struct iommu_table_entry \ __iommu_entry_##_detect __used \ __attribute__ ((unused, __section__(".iommu_table"), \ aligned((sizeof(void *))))) \ -- cgit v1.1 From ae13b7b4e041eccf34fa4dd58581fe1441375578 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 2 Sep 2012 23:31:46 +0200 Subject: x86/iommu: Use NULL instead of plain 0 for __IOMMU_INIT IOMMU_INIT_POST and IOMMU_INIT_POST_FINISH pass the plain value 0 instead of NULL to __IOMMU_INIT. Fix this and make sparse happy by doing so. Signed-off-by: Mathias Krause Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1346621506-30857-8-git-send-email-minipli@googlemail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/iommu_table.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index bbf8fb2e..f42a047 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -63,10 +63,10 @@ struct iommu_table_entry { * to stop detecting the other IOMMUs after yours has been detected. */ #define IOMMU_INIT_POST(_detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 0) + __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 0) #define IOMMU_INIT_POST_FINISH(detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 1) + __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 1) /* * A more sophisticated version of IOMMU_INIT. This variant requires: -- cgit v1.1 From 4454d32749465ffa77d82bc1fdd196d6dedc544b Mon Sep 17 00:00:00 2001 From: Joe Millenbach Date: Sun, 2 Sep 2012 17:38:20 -0700 Subject: x86/kconfig: Remove outdated reference to Intel CPUs in CONFIG_SWIOTLB Deleted the no longer valid example of which x86 CPUs lack a hardware IOMMU, and moved the "If unsure..." statement to a new line to follow the style of surrounding options. Signed-off-by: Joe Millenbach Reviewed-by: Josh Triplett Cc: team-fjord@googlegroups.com Cc: Konrad Rzeszutek Wilk Link: http://lkml.kernel.org/r/1346632700-29113-1-git-send-email-jmillenbach@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ec3a1a..50a1d1f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -746,10 +746,10 @@ config SWIOTLB def_bool y if X86_64 ---help--- Support for software bounce buffers used on x86-64 systems - which don't have a hardware IOMMU (e.g. the current generation - of Intel's x86-64 CPUs). Using this PCI devices which can only - access 32-bits of memory can be used on systems with more than - 3 GB of memory. If unsure, say Y. + which don't have a hardware IOMMU. Using this PCI devices + which can only access 32-bits of memory can be used on systems + with more than 3 GB of memory. + If unsure, say Y. 
config IOMMU_HELPER def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) -- cgit v1.1 From 326d07cb30fed2387efccd4bf3bd8e4f28719e9e Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:13 +0200 Subject: KVM: x86: minor size optimization Some fields can be constified and/or made static to reduce code and data size. Numbers for a 32 bit build: text data bss dec hex filename before: 3351 80 0 3431 d67 cpuid.o after: 3391 0 0 3391 d3f cpuid.o Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/cpuid.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b496da6..ec79e77 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -397,8 +397,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; } case KVM_CPUID_SIGNATURE: { - char signature[12] = "KVMKVMKVM\0\0"; - u32 *sigptr = (u32 *)signature; + static const char signature[12] = "KVMKVMKVM\0\0"; + const u32 *sigptr = (const u32 *)signature; entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; @@ -484,10 +484,10 @@ struct kvm_cpuid_param { u32 func; u32 idx; bool has_leaf_count; - bool (*qualifier)(struct kvm_cpuid_param *param); + bool (*qualifier)(const struct kvm_cpuid_param *param); }; -static bool is_centaur_cpu(struct kvm_cpuid_param *param) +static bool is_centaur_cpu(const struct kvm_cpuid_param *param) { return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; } @@ -498,7 +498,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = -E2BIG, i; u32 func; - static struct kvm_cpuid_param param[] = { + static const struct kvm_cpuid_param param[] = { { .func = 0, .has_leaf_count = true }, { .func = 0x80000000, .has_leaf_count = true }, { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, @@ -517,7 +517,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, r = 0; for (i = 0; i < ARRAY_SIZE(param); i++) { - struct kvm_cpuid_param *ent = ¶m[i]; + const struct kvm_cpuid_param *ent = ¶m[i]; if (ent->qualifier && !ent->qualifier(ent)) continue; -- cgit v1.1 From 89a87c67791b840d815a2028b88cefe6906ed42c Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:14 +0200 Subject: KVM: x86 emulator: use aligned variants of SSE register ops As the the compiler ensures that the memory operand is always aligned to a 16 byte memory location, use the aligned variant of MOVDQ for read_sse_reg() and write_sse_reg(). 
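The property being relied on: MOVDQA requires its memory operand to be 16-byte aligned and faults otherwise, while MOVDQU accepts any alignment at some cost. A stand-alone user-space sketch of the aligned form (illustrative only, not emulator code; the struct name and values are made up, build with "gcc -O2 movdqa-demo.c" on x86-64):

	/* movdqa-demo.c -- illustrative sketch only */
	#include <stdio.h>

	/* like the emulator's sse128_t, the operand is forced to 16-byte alignment */
	struct sse128 { unsigned long long lo, hi; } __attribute__((aligned(16)));

	int main(void)
	{
		struct sse128 in  = { 0x1122334455667788ULL, 0x99aabbccddeeff00ULL };
		struct sse128 out = { 0, 0 };

		/* movdqa is safe only because 'in' and 'out' are 16-byte aligned;
		 * an unaligned operand would fault, which is what movdqu avoided */
		asm("movdqa %1, %%xmm0\n\t"
		    "movdqa %%xmm0, %0"
		    : "=m"(out) : "m"(in) : "xmm0");

		printf("%016llx %016llx\n", out.hi, out.lo);
		return 0;
	}
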
Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 64 +++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1451cff..5a0fee1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -909,23 +909,23 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; - case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; - case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; - case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; - case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; - case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; - case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; - case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; + case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; + case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; + case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break; + case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break; + case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break; + case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break; + case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break; + case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break; #ifdef CONFIG_X86_64 - case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; - case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; - case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; - case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; - case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; - case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; - case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; - case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; + case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break; + case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break; + case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break; + case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break; + case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break; + case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break; + case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break; + case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break; #endif default: BUG(); } @@ -937,23 +937,23 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, { ctxt->ops->get_fpu(ctxt); switch (reg) { - case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; - case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; - case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; - case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; - case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; - case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; - case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; - case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; + case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; + case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; + case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break; + case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break; + case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break; + case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break; + case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break; + case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break; #ifdef CONFIG_X86_64 - case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; - case 
9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; - case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; - case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; - case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; - case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; - case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; - case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; + case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break; + case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break; + case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break; + case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break; + case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break; + case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break; + case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break; + case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break; #endif default: BUG(); } -- cgit v1.1 From fd0a0d82083747301f6c8084b4141bb490625016 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:15 +0200 Subject: KVM: x86 emulator: mark opcode tables const The opcode tables never change at runtime, therefor mark them const. Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5a0fee1..fd06f9d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -161,9 +161,9 @@ struct opcode { u64 intercept : 8; union { int (*execute)(struct x86_emulate_ctxt *ctxt); - struct opcode *group; - struct group_dual *gdual; - struct gprefix *gprefix; + const struct opcode *group; + const struct group_dual *gdual; + const struct gprefix *gprefix; } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); }; @@ -3574,13 +3574,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) -static struct opcode group7_rm1[] = { +static const struct opcode group7_rm1[] = { DI(SrcNone | Priv, monitor), DI(SrcNone | Priv, mwait), N, N, N, N, N, N, }; -static struct opcode group7_rm3[] = { +static const struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), @@ -3591,13 +3591,13 @@ static struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, invlpga, check_svme), }; -static struct opcode group7_rm7[] = { +static const struct opcode group7_rm7[] = { N, DIP(SrcNone, rdtscp, check_rdtsc), N, N, N, N, N, N, }; -static struct opcode group1[] = { +static const struct opcode group1[] = { I(Lock, em_add), I(Lock | PageTable, em_or), I(Lock, em_adc), @@ -3608,11 +3608,11 @@ static struct opcode group1[] = { I(0, em_cmp), }; -static struct opcode group1A[] = { +static const struct opcode group1A[] = { I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, }; -static struct opcode group3[] = { +static const struct opcode group3[] = { I(DstMem | SrcImm, em_test), I(DstMem | SrcImm, em_test), I(DstMem | SrcNone | Lock, em_not), @@ -3623,13 +3623,13 @@ static struct opcode group3[] = { I(SrcMem, em_idiv_ex), }; -static struct opcode group4[] = { +static const struct opcode group4[] = { I(ByteOp | DstMem | SrcNone | Lock, em_grp45), I(ByteOp | DstMem | SrcNone | Lock, em_grp45), N, N, N, N, N, N, }; -static struct opcode group5[] = { 
+static const struct opcode group5[] = { I(DstMem | SrcNone | Lock, em_grp45), I(DstMem | SrcNone | Lock, em_grp45), I(SrcMem | Stack, em_grp45), @@ -3639,7 +3639,7 @@ static struct opcode group5[] = { I(SrcMem | Stack, em_grp45), N, }; -static struct opcode group6[] = { +static const struct opcode group6[] = { DI(Prot, sldt), DI(Prot, str), II(Prot | Priv | SrcMem16, em_lldt, lldt), @@ -3647,7 +3647,7 @@ static struct opcode group6[] = { N, N, N, N, }; -static struct group_dual group7 = { { +static const struct group_dual group7 = { { II(Mov | DstMem | Priv, em_sgdt, sgdt), II(Mov | DstMem | Priv, em_sidt, sidt), II(SrcMem | Priv, em_lgdt, lgdt), @@ -3664,7 +3664,7 @@ static struct group_dual group7 = { { EXT(0, group7_rm7), } }; -static struct opcode group8[] = { +static const struct opcode group8[] = { N, N, N, N, I(DstMem | SrcImmByte, em_bt), I(DstMem | SrcImmByte | Lock | PageTable, em_bts), @@ -3672,26 +3672,26 @@ static struct opcode group8[] = { I(DstMem | SrcImmByte | Lock | PageTable, em_btc), }; -static struct group_dual group9 = { { +static const struct group_dual group9 = { { N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { N, N, N, N, N, N, N, N, } }; -static struct opcode group11[] = { +static const struct opcode group11[] = { I(DstMem | SrcImm | Mov | PageTable, em_mov), X7(D(Undefined)), }; -static struct gprefix pfx_0f_6f_0f_7f = { +static const struct gprefix pfx_0f_6f_0f_7f = { I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; -static struct gprefix pfx_vmovntpx = { +static const struct gprefix pfx_vmovntpx = { I(0, em_mov), N, N, N, }; -static struct opcode opcode_table[256] = { +static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ I6ALU(Lock, em_add), I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), @@ -3808,7 +3808,7 @@ static struct opcode opcode_table[256] = { D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), }; -static struct opcode twobyte_table[256] = { +static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, N, I(ImplicitOps | VendorSpecific, em_syscall), -- cgit v1.1 From 0225fb509d51fcf777eb0aa31c304c582e3248fd Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:16 +0200 Subject: KVM: x86 emulator: constify emulate_ops We never change emulate_ops[] at runtime so it should be r/o. Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/kvm/emulate.c | 22 +++++++++++----------- arch/x86/kvm/x86.c | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 282aee5..b5bb73a 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -250,7 +250,7 @@ struct read_cache { }; struct x86_emulate_ctxt { - struct x86_emulate_ops *ops; + const struct x86_emulate_ops *ops; /* Register state before/after emulation. 
*/ unsigned long eflags; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fd06f9d..663e958 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1325,7 +1325,7 @@ static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; if (selector & 1 << 2) { struct desc_struct desc; @@ -1747,7 +1747,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; int rc; struct desc_ptr dt; gva_t cs_addr; @@ -2129,7 +2129,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt) static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; u32 eax, ebx, ecx, edx; /* @@ -2173,7 +2173,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) static int em_syscall(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; @@ -2231,7 +2231,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) static int em_sysenter(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; @@ -2294,7 +2294,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) static int em_sysexit(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; u64 msr_data; int usermode; @@ -2357,7 +2357,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, u16 port, u16 len) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct tr_seg; u32 base3; int r; @@ -2476,7 +2476,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 tss_selector, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct tss_segment_16 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); @@ -2623,7 +2623,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 tss_selector, u16 old_tss_sel, ulong old_tss_base, struct desc_struct *new_desc) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct tss_segment_32 tss_seg; int ret; u32 new_tss_base = get_desc_base(new_desc); @@ -2667,7 +2667,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct curr_tss_desc, next_tss_desc; int ret; u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); @@ -4339,7 +4339,7 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { - struct x86_emulate_ops *ops = ctxt->ops; + const struct x86_emulate_ops *ops = ctxt->ops; int rc = 
X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 20f2266..0dc066f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4323,7 +4323,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon kvm_register_write(emul_to_vcpu(ctxt), reg, val); } -static struct x86_emulate_ops emulate_ops = { +static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, -- cgit v1.1 From 0fbe9b0b19fb92eee2cf23c23d63d6b3312681e5 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:17 +0200 Subject: KVM: x86: constify read_write_emulator_ops We never change those, make them r/o. Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0dc066f..3172416 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3773,14 +3773,14 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, return X86EMUL_CONTINUE; } -static struct read_write_emulator_ops read_emultor = { +static const struct read_write_emulator_ops read_emultor = { .read_write_prepare = read_prepare, .read_write_emulate = read_emulate, .read_write_mmio = vcpu_mmio_read, .read_write_exit_mmio = read_exit_mmio, }; -static struct read_write_emulator_ops write_emultor = { +static const struct read_write_emulator_ops write_emultor = { .read_write_emulate = write_emulate, .read_write_mmio = write_mmio, .read_write_exit_mmio = write_exit_mmio, @@ -3791,7 +3791,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, struct kvm_vcpu *vcpu, - struct read_write_emulator_ops *ops) + const struct read_write_emulator_ops *ops) { gpa_t gpa; int handled, ret; @@ -3840,7 +3840,7 @@ mmio: int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, - struct read_write_emulator_ops *ops) + const struct read_write_emulator_ops *ops) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; -- cgit v1.1 From f1d248315afc55771c3991b934014daa154d05f1 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:18 +0200 Subject: KVM: x86: more constification Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 18d149d..07ad628 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -198,7 +198,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) return apic->vcpu->arch.apic_base & X2APIC_ENABLE; } -static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { +static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3172416..666da13 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -817,7 +817,7 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; -static u32 emulated_msrs[] = { +static const u32 emulated_msrs[] = { MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, -- cgit v1.1 From 
772e031899fdf3c7636d66aae9b0b57d1aaebb93 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:19 +0200 Subject: KVM: VMX: constify lookup tables We use vmcs_field_to_offset_table[], kvm_vmx_segment_fields[] and kvm_vmx_exit_handlers[] as lookup tables only -- make them r/o. Signed-off-by: Mathias Krause Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 248c2b4..d62b413 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -450,7 +450,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ [number##_HIGH] = VMCS12_OFFSET(name)+4 -static unsigned short vmcs_field_to_offset_table[] = { +static const unsigned short vmcs_field_to_offset_table[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), FIELD(GUEST_ES_SELECTOR, guest_es_selector), FIELD(GUEST_CS_SELECTOR, guest_cs_selector), @@ -666,7 +666,7 @@ static struct vmx_capability { .ar_bytes = GUEST_##seg##_AR_BYTES, \ } -static struct kvm_vmx_segment_field { +static const struct kvm_vmx_segment_field { unsigned selector; unsigned base; unsigned limit; @@ -2695,7 +2695,7 @@ static __exit void hardware_unsetup(void) static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_segment tmp = *save; if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { @@ -2764,7 +2764,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) static void fix_rmode_seg(int seg, struct kvm_segment *save) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; vmcs_write16(sf->selector, save->base >> 4); vmcs_write32(sf->base, save->base & 0xffff0); @@ -3202,7 +3202,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; vmx_segment_cache_clear(vmx); @@ -3572,7 +3572,7 @@ out: static void seg_setup(int seg) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; unsigned int ar; vmcs_write16(sf->selector, 0); @@ -5655,7 +5655,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. */ -static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { +static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, -- cgit v1.1 From 09941fbb712655cde9b350852be7a99a6f61a03f Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 30 Aug 2012 01:30:20 +0200 Subject: KVM: SVM: constify lookup tables We never modify direct_access_msrs[], msrpm_ranges[], svm_exit_handlers[] or x86_intercept_map[] at runtime. Mark them r/o. 
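Beyond quieting sparse, the practical effect of this run of constification patches is that the tables typically move out of .data into a read-only section (.rodata), and accidental writes are rejected at compile time. A minimal stand-alone illustration (names and values are made up, not taken from the patches; build with "gcc -c rodata-demo.c" and inspect placement with "objdump -t rodata-demo.o"):

	/* rodata-demo.c -- illustrative sketch only */
	static const unsigned int lookup[] = { 1, 2, 4, 8 };

	unsigned int lookup_bit(unsigned int i)
	{
		/* lookup[i] = 0; would now fail: assignment of read-only location */
		return lookup[i & 3];
	}
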
Signed-off-by: Mathias Krause Cc: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 31be4a5..611c728 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -163,7 +163,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio); #define MSR_INVALID 0xffffffffU -static struct svm_direct_access_msrs { +static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ } direct_access_msrs[] = { @@ -400,7 +400,7 @@ struct svm_init_data { int r; }; -static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; +static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) #define MSRS_RANGE_SIZE 2048 @@ -3267,7 +3267,7 @@ static int pause_interception(struct vcpu_svm *svm) return 1; } -static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { +static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, [SVM_EXIT_READ_CR4] = cr_interception, @@ -4068,7 +4068,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) #define POST_MEM(exit) { .exit_code = (exit), \ .stage = X86_ICPT_POST_MEMACCESS, } -static struct __x86_intercept { +static const struct __x86_intercept { u32 exit_code; enum x86_intercept_stage stage; } x86_intercept_map[] = { -- cgit v1.1 From 50e900417b8096939d12a46848f965e27a905e36 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 4 Sep 2012 15:45:17 -0400 Subject: xen/p2m: Fix one-off error in checking the P2M tree directory. We would traverse the full P2M top directory (from 0->MAX_DOMAIN_PAGES inclusive) when trying to figure out whether we can re-use some of the P2M middle leafs. Which meant that if the kernel was compiled with MAX_DOMAIN_PAGES=512 we would try to use the 512th entry. 
Fortunately for us the p2m_top_index has a check for this: BUG_ON(pfn >= MAX_P2M_PFN); which we hit and saw this: (XEN) domain_crash_sync called from entry.S (XEN) Domain 0 (vcpu#0) crashed on cpu#0: (XEN) ----[ Xen-4.1.2-OVM x86_64 debug=n Tainted: C ]---- (XEN) CPU: 0 (XEN) RIP: e033:[] (XEN) RFLAGS: 0000000000000212 EM: 1 CONTEXT: pv guest (XEN) rax: ffffffff81db5000 rbx: ffffffff81db4000 rcx: 0000000000000000 (XEN) rdx: 0000000000480211 rsi: 0000000000000000 rdi: ffffffff81db4000 (XEN) rbp: ffffffff81793db8 rsp: ffffffff81793d38 r8: 0000000008000000 (XEN) r9: 4000000000000000 r10: 0000000000000000 r11: ffffffff81db7000 (XEN) r12: 0000000000000ff8 r13: ffffffff81df1ff8 r14: ffffffff81db6000 (XEN) r15: 0000000000000ff8 cr0: 000000008005003b cr4: 00000000000026f0 (XEN) cr3: 0000000661795000 cr2: 0000000000000000 Fixes-Oracle-Bug: 14570662 CC: stable@vger.kernel.org # only for v3.5 Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index d4b2554..76ba0e9 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -599,7 +599,7 @@ bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_ if (p2m_index(set_pfn)) return false; - for (pfn = 0; pfn <= MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { topidx = p2m_top_index(pfn); if (!p2m_top[topidx]) -- cgit v1.1 From ce7184bdbd38d920fb515266fbbdc585ad2e5493 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 24 Aug 2012 08:55:13 +0000 Subject: xen: fix logical error in tlb flushing While TLB_FLUSH_ALL gets passed as 'end' argument to flush_tlb_others(), the Xen code was made to check its 'start' parameter. That may give a incorrect op.cmd to MMUEXT_INVLPG_MULTI instead of MMUEXT_TLB_FLUSH_MULTI. Then it causes some page can not be flushed from TLB. This patch fixed this issue. Reported-by: Jan Beulich Signed-off-by: Alex Shi Acked-by: Jan Beulich Tested-by: Yongjie Ren Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b65a761..5141d80 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1283,7 +1283,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { + if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { args->op.cmd = MMUEXT_INVLPG_MULTI; args->op.arg1.linear_addr = start; } -- cgit v1.1 From 69870a847856a1ba81f655a8633fce5f5b614730 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 30 Aug 2012 13:58:11 +0100 Subject: xen/mm: return more precise error from xen_remap_domain_range() Callers of xen_remap_domain_range() need to know if the remap failed because frame is currently paged out. So they can retry the remap later on. Return -ENOENT in this case. This assumes that the error codes returned by Xen are a subset of those used by the kernel. It is unclear if this is defined as part of the hypercall ABI. 
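For callers the distinction looks roughly like this (illustrative sketch only; the call is shown schematically and defer_and_retry_later() is a hypothetical helper, not code from this series):

	/* hypothetical caller, e.g. a privcmd-style batch mapper */
	static int map_one_frame(struct vm_area_struct *vma, unsigned long addr,
				 unsigned long mfn, pgprot_t prot, unsigned int domid)
	{
		int rc = xen_remap_domain_mfn_range(vma, addr, mfn, 1, prot, domid);

		if (rc == -ENOENT) {
			/* the foreign frame is currently paged out: remember it
			 * and retry this one remap later instead of failing */
			defer_and_retry_later(mfn);
			return 0;
		}
		return rc;	/* 0 on success, other negatives are hard errors */
	}
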
Acked-by: Andres Lagar-Cavilla Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 885a223..2d9e7c9 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2331,8 +2331,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, if (err) goto out; - err = -EFAULT; - if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) + err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); + if (err < 0) goto out; nr -= batch; -- cgit v1.1 From 2df72e9bc4c505d8357012f2924589f3d16f9d44 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 24 Aug 2012 15:54:57 -0300 Subject: KVM: split kvm_arch_flush_shadow Introducing kvm_arch_flush_shadow_memslot, to invalidate the translations of a single memory slot. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 666da13..37797a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6447,12 +6447,18 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); } -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_mmu_zap_all(kvm); kvm_reload_remote_mmus(kvm); } +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_arch_flush_shadow_all(kvm); +} + int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && -- cgit v1.1 From 3b4dc3a031110753b9ba36432dbd21f989fcee56 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 28 Aug 2012 17:43:26 -0300 Subject: KVM: move postcommit flush to x86, as mmio sptes are x86 specific Other arches do not need this. Signed-off-by: Marcelo Tosatti v2: fix incorrect deletion of mmio sptes on gpa move (noticed by Takuya) Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 37797a0..6f6812e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6445,6 +6445,14 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); + /* + * If memory slot is created, or moved, we need to clear all + * mmio sptes. + */ + if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) { + kvm_mmu_zap_all(kvm); + kvm_reload_remote_mmus(kvm); + } } void kvm_arch_flush_shadow_all(struct kvm *kvm) -- cgit v1.1 From 716d51abff06f48425cef15d78ca6f36093f6dbf Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 3 Sep 2012 15:24:26 +0300 Subject: KVM: Provide userspace IO exit completion callback Current code assumes that IO exit was due to instruction emulation and handles execution back to emulator directly. This patch adds new userspace IO exit completion callback that can be set by any other code that caused IO exit to userspace. 
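The extension point this creates, sketched briefly (illustrative; the two my_* functions are made up, the field and return-value convention are the ones introduced by the patch below):

	/* one-shot completion, invoked on the next KVM_RUN once userspace
	 * has handled the exit */
	static int my_complete_io(struct kvm_vcpu *vcpu)
	{
		/* consume whatever userspace filled in, then resume the guest */
		return 1;
	}

	/* any code that bounces to userspace can now arm the callback */
	static int my_exit_to_userspace(struct kvm_vcpu *vcpu)
	{
		vcpu->run->exit_reason = KVM_EXIT_MMIO;	/* whichever exit applies */
		vcpu->arch.complete_userspace_io = my_complete_io;
		return 0;	/* 0 = exit to userspace */
	}
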
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 93 +++++++++++++++++++++++++---------------- 2 files changed, 57 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fc0e752..64adb61 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -414,6 +414,7 @@ struct kvm_vcpu_arch { struct x86_emulate_ctxt emulate_ctxt; bool emulate_regs_need_sync_to_vcpu; bool emulate_regs_need_sync_from_vcpu; + int (*complete_userspace_io)(struct kvm_vcpu *vcpu); gpa_t time; struct pvclock_vcpu_time_info hv_clock; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f6812e..f91e2c9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4544,6 +4544,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, return true; } +static int complete_emulated_mmio(struct kvm_vcpu *vcpu); +static int complete_emulated_pio(struct kvm_vcpu *vcpu); + int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, @@ -4614,13 +4617,16 @@ restart: } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; - else + else { writeback = false; + vcpu->arch.complete_userspace_io = complete_emulated_pio; + } r = EMULATE_DO_MMIO; } else if (vcpu->mmio_needed) { if (!vcpu->mmio_is_write) writeback = false; r = EMULATE_DO_MMIO; + vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; else @@ -5476,6 +5482,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return r; } +static inline int complete_emulated_io(struct kvm_vcpu *vcpu) +{ + int r; + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r != EMULATE_DONE) + return 0; + return 1; +} + +static int complete_emulated_pio(struct kvm_vcpu *vcpu) +{ + BUG_ON(!vcpu->arch.pio.count); + + return complete_emulated_io(vcpu); +} + /* * Implements the following, as a state machine: * @@ -5492,47 +5516,37 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) * copy data * exit */ -static int complete_mmio(struct kvm_vcpu *vcpu) +static int complete_emulated_mmio(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; struct kvm_mmio_fragment *frag; - int r; - if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) - return 1; + BUG_ON(!vcpu->mmio_needed); - if (vcpu->mmio_needed) { - /* Complete previous fragment */ - frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; - if (!vcpu->mmio_is_write) - memcpy(frag->data, run->mmio.data, frag->len); - if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { - vcpu->mmio_needed = 0; - if (vcpu->mmio_is_write) - return 1; - vcpu->mmio_read_completed = 1; - goto done; - } - /* Initiate next fragment */ - ++frag; - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = frag->gpa; + /* Complete previous fragment */ + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; + if (!vcpu->mmio_is_write) + memcpy(frag->data, run->mmio.data, frag->len); + if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { + vcpu->mmio_needed = 0; if (vcpu->mmio_is_write) - memcpy(run->mmio.data, frag->data, frag->len); - run->mmio.len = frag->len; - run->mmio.is_write = vcpu->mmio_is_write; - return 0; - - } -done: - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - 
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) - return 0; - return 1; + return 1; + vcpu->mmio_read_completed = 1; + return complete_emulated_io(vcpu); + } + /* Initiate next fragment */ + ++frag; + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = frag->gpa; + if (vcpu->mmio_is_write) + memcpy(run->mmio.data, frag->data, frag->len); + run->mmio.len = frag->len; + run->mmio.is_write = vcpu->mmio_is_write; + vcpu->arch.complete_userspace_io = complete_emulated_mmio; + return 0; } + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -5559,9 +5573,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } - r = complete_mmio(vcpu); - if (r <= 0) - goto out; + if (unlikely(vcpu->arch.complete_userspace_io)) { + int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; + vcpu->arch.complete_userspace_io = NULL; + r = cui(vcpu); + if (r <= 0) + goto out; + } else + WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); r = __vcpu_run(vcpu); -- cgit v1.1 From 9d1b39a967871b7c69025dba7b7bdaee42871021 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 3 Sep 2012 15:24:27 +0300 Subject: KVM: emulator: make x86 emulation modes enum instead of defines Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 22 ++++++++++------------ arch/x86/kvm/emulate.c | 4 +++- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b5bb73a..e9e5675 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -249,6 +249,15 @@ struct read_cache { unsigned long end; }; +/* Execution mode, passed to the emulator. */ +enum x86emul_mode { + X86EMUL_MODE_REAL, /* Real mode. */ + X86EMUL_MODE_VM86, /* Virtual 8086 mode. */ + X86EMUL_MODE_PROT16, /* 16-bit protected mode. */ + X86EMUL_MODE_PROT32, /* 32-bit protected mode. */ + X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */ +}; + struct x86_emulate_ctxt { const struct x86_emulate_ops *ops; @@ -256,7 +265,7 @@ struct x86_emulate_ctxt { unsigned long eflags; unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ - int mode; + enum x86emul_mode mode; /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; @@ -308,17 +317,6 @@ struct x86_emulate_ctxt { #define REPE_PREFIX 0xf3 #define REPNE_PREFIX 0xf2 -/* Execution mode, passed to the emulator. */ -#define X86EMUL_MODE_REAL 0 /* Real mode. */ -#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ -#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ -#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ -#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. 
*/ - -/* any protected mode */ -#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \ - X86EMUL_MODE_PROT64) - /* CPUID vendors */ #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 663e958..5fe06a8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2268,6 +2268,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if (msr_data == 0x0) return emulate_gp(ctxt, 0); break; + default: + break; } ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -4400,7 +4402,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } /* Instruction can only be executed in protected mode */ - if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { + if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { rc = emulate_ud(ctxt); goto done; } -- cgit v1.1 From f3bd64c68a8f1245e3d037f70c6936cd7bb1196b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 3 Sep 2012 15:24:28 +0300 Subject: KVM: emulator: string_addr_inc() cleanup Remove the unneeded segment argument. The address structure already has the correct segment, which was put there during decode. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5fe06a8..415f903 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2790,14 +2790,13 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } -static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, - int reg, struct operand *op) +static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, + struct operand *op) { int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes); op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg)); - op->addr.mem.seg = seg; } static int em_das(struct x86_emulate_ctxt *ctxt) @@ -4570,12 +4569,10 @@ writeback: ctxt->dst.type = saved_dst_type; if ((ctxt->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override(ctxt), - VCPU_REGS_RSI, &ctxt->src); + string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src); if ((ctxt->d & DstMask) == DstDI) - string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, - &ctxt->dst); + string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst); if (ctxt->rep_prefix && (ctxt->d & String)) { struct read_cache *r = &ctxt->io_read; -- cgit v1.1 From b3356bf0dbb34980620f2f7def7d1b9a0d325225 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 3 Sep 2012 15:24:29 +0300 Subject: KVM: emulator: optimize "rep ins" handling Optimize "rep ins" by allowing the emulator to write back more than one datum at a time. Introduce a new operand type, OP_MEM_STR, which tells writeback() that dst contains a pointer to an array that should be written back, as opposed to just one data element.
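As a rough stand-alone illustration of the idea (a user-space sketch with made-up names, not the emulator code itself): for a string destination, the whole cached "rep ins" buffer can be flushed to guest memory with a single copy of count * size bytes instead of one copy per emulated iteration, which is what the single segmented_write() of ctxt->dst.bytes * ctxt->dst.count bytes in writeback() achieves for OP_MEM_STR in the diff below.

    /*
     * User-space model of the OP_MEM_STR writeback (hypothetical names,
     * not the kernel code): guest_mem stands in for guest memory and
     * io_cache for the emulator's buffered "rep ins" data.
     */
    #include <stdio.h>
    #include <string.h>

    #define BUF_SIZE 64

    static unsigned char guest_mem[BUF_SIZE];
    static unsigned char io_cache[BUF_SIZE];

    /* Old behaviour: one writeback per element, one copy per emulated iteration. */
    static void writeback_per_element(size_t size, size_t count)
    {
            for (size_t i = 0; i < count; i++)
                    memcpy(guest_mem + i * size, io_cache + i * size, size);
    }

    /* OP_MEM_STR-style behaviour: one writeback covering count * size bytes. */
    static void writeback_string(size_t size, size_t count)
    {
            memcpy(guest_mem, io_cache, size * count);
    }

    int main(void)
    {
            for (int i = 0; i < BUF_SIZE; i++)
                    io_cache[i] = (unsigned char)i;

            writeback_per_element(2, 4);    /* four 16-bit elements, one at a time */
            writeback_string(2, 16);        /* sixteen 16-bit elements in one flush */
            printf("guest_mem[31] = %d\n", guest_mem[31]);
            return 0;
    }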
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 4 +++- arch/x86/kvm/emulate.c | 33 ++++++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9e5675..15f960c 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -213,8 +213,9 @@ typedef u32 __attribute__((vector_size(16))) sse128_t; /* Type, address-of, and value of an instruction's operand. */ struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; + enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; unsigned int bytes; + unsigned int count; union { unsigned long orig_val; u64 orig_val64; @@ -234,6 +235,7 @@ struct operand { char valptr[sizeof(unsigned long) + 2]; sse128_t vec_val; u64 mm_val; + void *data; }; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 415f903..39171cb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1301,8 +1301,15 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, rc->end = n * size; } - memcpy(dest, rc->data + rc->pos, size); - rc->pos += size; + if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) { + ctxt->dst.data = rc->data + rc->pos; + ctxt->dst.type = OP_MEM_STR; + ctxt->dst.count = (rc->end - rc->pos) / size; + rc->pos = rc->end; + } else { + memcpy(dest, rc->data + rc->pos, size); + rc->pos += size; + } return 1; } @@ -1546,6 +1553,14 @@ static int writeback(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; break; + case OP_MEM_STR: + rc = segmented_write(ctxt, + ctxt->dst.addr.mem, + ctxt->dst.data, + ctxt->dst.bytes * ctxt->dst.count); + if (rc != X86EMUL_CONTINUE) + return rc; + break; case OP_XMM: write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); break; @@ -2793,7 +2808,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg, struct operand *op) { - int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; + int df = (ctxt->eflags & EFLG_DF) ? 
-op->count : op->count; register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes); op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg)); @@ -3733,7 +3748,7 @@ static const struct opcode opcode_table[256] = { I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */ + I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte)), @@ -3991,6 +4006,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI)); op->addr.mem.seg = VCPU_SREG_ES; op->val = 0; + op->count = 1; break; case OpDX: op->type = OP_REG; @@ -4034,6 +4050,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); op->addr.mem.seg = seg_override(ctxt); op->val = 0; + op->count = 1; break; case OpImmFAddr: op->type = OP_IMM; @@ -4575,8 +4592,14 @@ writeback: string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst); if (ctxt->rep_prefix && (ctxt->d & String)) { + unsigned int count; struct read_cache *r = &ctxt->io_read; - register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); + if ((ctxt->d & SrcMask) == SrcSI) + count = ctxt->src.count; + else + count = ctxt->dst.count; + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), + -count); if (!string_insn_completed(ctxt)) { /* -- cgit v1.1 From a50abc3b2b469ee80bc0f9ef5b6d457ef72659a9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 5 Sep 2012 20:00:52 +0300 Subject: KVM: use symbolic constant for nr interrupts interrupt_bitmap is KVM_NR_INTERRUPTS bits in size, so just use that instead of hard-coded constants and math. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f91e2c9..c4d451e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2366,7 +2366,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - if (irq->irq < 0 || irq->irq >= 256) + if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; if (irqchip_in_kernel(vcpu->kvm)) return -ENXIO; @@ -5793,7 +5793,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - max_bits = (sizeof sregs->interrupt_bitmap) << 3; + max_bits = KVM_NR_INTERRUPTS; pending_vec = find_first_bit( (const unsigned long *)sregs->interrupt_bitmap, max_bits); if (pending_vec < max_bits) { -- cgit v1.1 From f94a73f8dd5644f45f9d2e3139608ca83b932d93 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 28 Aug 2012 14:24:43 +0300 Subject: crypto: twofish-avx - tune assembler code for more performance Patch replaces 'movb' instructions with 'movzbl' to break false register dependencies and interleaves instructions better for out-of-order scheduling. Tested on Intel Core i5-2450M and AMD FX-8100. 
tcrypt ECB results: Intel Core i5-2450M: size old-vs-new new-vs-3way old-vs-3way enc dec enc dec enc dec 256 1.12x 1.13x 1.36x 1.37x 1.21x 1.22x 1k 1.14x 1.14x 1.48x 1.49x 1.29x 1.31x 8k 1.14x 1.14x 1.50x 1.52x 1.32x 1.33x AMD FX-8100: size old-vs-new new-vs-3way old-vs-3way enc dec enc dec enc dec 256 1.10x 1.11x 1.01x 1.01x 0.92x 0.91x 1k 1.11x 1.12x 1.08x 1.07x 0.97x 0.96x 8k 1.11x 1.13x 1.10x 1.08x 0.99x 0.97x [v2] - Do instruction interleaving another way to avoid adding new FPU<=>CPU register moves as these cause performance drop on Bulldozer. - Further interleaving improvements for better out-of-order scheduling. Tested-by: Borislav Petkov Cc: Johannes Goetzfried Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 227 +++++++++++++++++----------- 1 file changed, 142 insertions(+), 85 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 35f4557..1585abb 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -4,6 +4,8 @@ * Copyright (C) 2012 Johannes Goetzfried * * + * Copyright © 2012 Jussi Kivilinna + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -47,16 +49,22 @@ #define RC2 %xmm6 #define RD2 %xmm7 -#define RX %xmm8 -#define RY %xmm9 +#define RX0 %xmm8 +#define RY0 %xmm9 + +#define RX1 %xmm10 +#define RY1 %xmm11 -#define RK1 %xmm10 -#define RK2 %xmm11 +#define RK1 %xmm12 +#define RK2 %xmm13 -#define RID1 %rax -#define RID1b %al -#define RID2 %rbx -#define RID2b %bl +#define RT %xmm14 +#define RR %xmm15 + +#define RID1 %rbp +#define RID1d %ebp +#define RID2 %rsi +#define RID2d %esi #define RGI1 %rdx #define RGI1bl %dl @@ -65,6 +73,13 @@ #define RGI2bl %cl #define RGI2bh %ch +#define RGI3 %rax +#define RGI3bl %al +#define RGI3bh %ah +#define RGI4 %rbx +#define RGI4bl %bl +#define RGI4bh %bh + #define RGS1 %r8 #define RGS1d %r8d #define RGS2 %r9 @@ -73,89 +88,123 @@ #define RGS3d %r10d -#define lookup_32bit(t0, t1, t2, t3, src, dst) \ - movb src ## bl, RID1b; \ - movb src ## bh, RID2b; \ - movl t0(CTX, RID1, 4), dst ## d; \ - xorl t1(CTX, RID2, 4), dst ## d; \ +#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \ + movzbl src ## bl, RID1d; \ + movzbl src ## bh, RID2d; \ shrq $16, src; \ - movb src ## bl, RID1b; \ - movb src ## bh, RID2b; \ + movl t0(CTX, RID1, 4), dst ## d; \ + movl t1(CTX, RID2, 4), RID2d; \ + movzbl src ## bl, RID1d; \ + xorl RID2d, dst ## d; \ + movzbl src ## bh, RID2d; \ + interleave_op(il_reg); \ xorl t2(CTX, RID1, 4), dst ## d; \ xorl t3(CTX, RID2, 4), dst ## d; -#define G(a, x, t0, t1, t2, t3) \ - vmovq a, RGI1; \ - vpsrldq $8, a, x; \ - vmovq x, RGI2; \ +#define dummy(d) /* do nothing */ + +#define shr_next(reg) \ + shrq $16, reg; + +#define G(gi1, gi2, x, t0, t1, t2, t3) \ + lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1); \ + lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2); \ + \ + lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none); \ + shlq $32, RGS2; \ + orq RGS1, RGS2; \ + lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none); \ + shlq $32, RGS1; \ + orq RGS1, RGS3; + +#define round_head_2(a, b, x1, y1, x2, y2) \ + vmovq b ## 1, RGI3; \ + vpextrq $1, b ## 1, RGI4; \ \ - lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \ - shrq $16, RGI1; \ - 
lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \ - shlq $32, RGS2; \ - orq RGS1, RGS2; \ + G(RGI1, RGI2, x1, s0, s1, s2, s3); \ + vmovq a ## 2, RGI1; \ + vpextrq $1, a ## 2, RGI2; \ + vmovq RGS2, x1; \ + vpinsrq $1, RGS3, x1, x1; \ \ - lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \ - shrq $16, RGI2; \ - lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \ - shlq $32, RGS3; \ - orq RGS1, RGS3; \ + G(RGI3, RGI4, y1, s1, s2, s3, s0); \ + vmovq b ## 2, RGI3; \ + vpextrq $1, b ## 2, RGI4; \ + vmovq RGS2, y1; \ + vpinsrq $1, RGS3, y1, y1; \ \ - vmovq RGS2, x; \ - vpinsrq $1, RGS3, x, x; + G(RGI1, RGI2, x2, s0, s1, s2, s3); \ + vmovq RGS2, x2; \ + vpinsrq $1, RGS3, x2, x2; \ + \ + G(RGI3, RGI4, y2, s1, s2, s3, s0); \ + vmovq RGS2, y2; \ + vpinsrq $1, RGS3, y2, y2; -#define encround(a, b, c, d, x, y) \ - G(a, x, s0, s1, s2, s3); \ - G(b, y, s1, s2, s3, s0); \ +#define encround_tail(a, b, c, d, x, y, prerotate) \ vpaddd x, y, x; \ + vpaddd x, RK1, RT;\ + prerotate(b); \ + vpxor RT, c, c; \ vpaddd y, x, y; \ - vpaddd x, RK1, x; \ vpaddd y, RK2, y; \ - vpxor x, c, c; \ - vpsrld $1, c, x; \ + vpsrld $1, c, RT; \ vpslld $(32 - 1), c, c; \ - vpor c, x, c; \ - vpslld $1, d, x; \ - vpsrld $(32 - 1), d, d; \ - vpor d, x, d; \ - vpxor d, y, d; - -#define decround(a, b, c, d, x, y) \ - G(a, x, s0, s1, s2, s3); \ - G(b, y, s1, s2, s3, s0); \ + vpor c, RT, c; \ + vpxor d, y, d; \ + +#define decround_tail(a, b, c, d, x, y, prerotate) \ vpaddd x, y, x; \ + vpaddd x, RK1, RT;\ + prerotate(a); \ + vpxor RT, c, c; \ vpaddd y, x, y; \ vpaddd y, RK2, y; \ vpxor d, y, d; \ vpsrld $1, d, y; \ vpslld $(32 - 1), d, d; \ vpor d, y, d; \ - vpslld $1, c, y; \ - vpsrld $(32 - 1), c, c; \ - vpor c, y, c; \ - vpaddd x, RK1, x; \ - vpxor x, c, c; - -#define encrypt_round(n, a, b, c, d) \ - vbroadcastss (k+4*(2*(n)))(CTX), RK1; \ - vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \ - encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \ - encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY); - -#define decrypt_round(n, a, b, c, d) \ - vbroadcastss (k+4*(2*(n)))(CTX), RK1; \ - vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \ - decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \ - decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY); + +#define rotate_1l(x) \ + vpslld $1, x, RR; \ + vpsrld $(32 - 1), x, x; \ + vpor x, RR, x; + +#define preload_rgi(c) \ + vmovq c, RGI1; \ + vpextrq $1, c, RGI2; + +#define encrypt_round(n, a, b, c, d, preload, prerotate) \ + vbroadcastss (k+4*(2*(n)))(CTX), RK1; \ + vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \ + round_head_2(a, b, RX0, RY0, RX1, RY1); \ + encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \ + preload(c ## 1); \ + encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate); + +#define decrypt_round(n, a, b, c, d, preload, prerotate) \ + vbroadcastss (k+4*(2*(n)))(CTX), RK1; \ + vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \ + round_head_2(a, b, RX0, RY0, RX1, RY1); \ + decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \ + preload(c ## 1); \ + decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate); #define encrypt_cycle(n) \ - encrypt_round((2*n), RA, RB, RC, RD); \ - encrypt_round(((2*n) + 1), RC, RD, RA, RB); + encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \ + encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); + +#define encrypt_cycle_last(n) \ + encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \ + encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy); #define decrypt_cycle(n) \ - decrypt_round(((2*n) + 1), RC, RD, RA, RB); \ - 
decrypt_round((2*n), RA, RB, RC, RD); + decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \ + decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); +#define decrypt_cycle_last(n) \ + decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \ + decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy); #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ vpunpckldq x1, x0, t0; \ @@ -216,17 +265,20 @@ __twofish_enc_blk_8way: * %rcx: bool, if true: xor output */ + pushq %rbp; pushq %rbx; pushq %rcx; vmovdqu w(CTX), RK1; leaq (4*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2); - inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2); + inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); + preload_rgi(RA1); + rotate_1l(RD1); + inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); + rotate_1l(RD2); - xorq RID1, RID1; - xorq RID2, RID2; + movq %rsi, %r11; encrypt_cycle(0); encrypt_cycle(1); @@ -235,26 +287,27 @@ __twofish_enc_blk_8way: encrypt_cycle(4); encrypt_cycle(5); encrypt_cycle(6); - encrypt_cycle(7); + encrypt_cycle_last(7); vmovdqu (w+4*4)(CTX), RK1; popq %rcx; popq %rbx; + popq %rbp; - leaq (4*4*4)(%rsi), %rax; + leaq (4*4*4)(%r11), %rax; testb %cl, %cl; jnz __enc_xor8; - outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); - outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); + outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); ret; __enc_xor8: - outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); - outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); + outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); ret; @@ -269,16 +322,19 @@ twofish_dec_blk_8way: * %rdx: src */ + pushq %rbp; pushq %rbx; vmovdqu (w+4*4)(CTX), RK1; leaq (4*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); - inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); + preload_rgi(RC1); + rotate_1l(RA1); + inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); + rotate_1l(RA2); - xorq RID1, RID1; - xorq RID2, RID2; + movq %rsi, %r11; decrypt_cycle(7); decrypt_cycle(6); @@ -287,14 +343,15 @@ twofish_dec_blk_8way: decrypt_cycle(3); decrypt_cycle(2); decrypt_cycle(1); - decrypt_cycle(0); + decrypt_cycle_last(0); vmovdqu (w)(CTX), RK1; popq %rbx; + popq %rbp; - leaq (4*4*4)(%rsi), %rax; - outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2); - outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2); + leaq (4*4*4)(%r11), %rax; + outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); + outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); ret; -- cgit v1.1 From ddaea7869d29beb9e0042c96ea52c9cca2afd68a Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 28 Aug 2012 14:24:49 +0300 Subject: crypto: cast5-avx - tune assembler code for more performance Patch replaces 'movb' instructions with 'movzbl' to break false register dependencies, interleaves instructions better for out-of-order scheduling and merges constant 16-bit rotation with round-key variable rotation. 
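The merge is possible because the per-round rotation amounts only matter mod 32, so a fixed extra rotate by 16 can be folded into the variable key rotation by flipping a single bit: for any 5-bit amount r, (r + 16) mod 32 equals r ^ 16, which is what the single vpxor with the .L16_mask bytes in the new preload macros relies on. A tiny stand-alone C check of that identity (illustration only, not kernel code):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            /* 5-bit rotation amounts: adding 16 mod 32 == XOR with 16 */
            for (unsigned r = 0; r < 32; r++)
                    assert(((r + 16) & 31) == (r ^ 16));
            printf("(r + 16) %% 32 == r ^ 16 holds for all r in [0, 32)\n");
            return 0;
    }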
tcrypt ECB results (128bit key): Intel Core i5-2450M: size old-vs-new new-vs-generic old-vs-generic enc dec enc dec enc dec 256 1.18x 1.18x 2.45x 2.47x 2.08x 2.10x 1k 1.20x 1.20x 2.73x 2.73x 2.28x 2.28x 8k 1.20x 1.19x 2.73x 2.73x 2.28x 2.29x [v2] - Do instruction interleaving another way to avoid adding new FPU<=>CPU register moves as these cause performance drop on Bulldozer. - Improvements to round-key variable rotation handling. - Further interleaving improvements for better out-of-order scheduling. Cc: Johannes Goetzfried Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 266 ++++++++++++++++++------------ 1 file changed, 160 insertions(+), 106 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index 94693c8..a41a3aa 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -4,6 +4,8 @@ * Copyright (C) 2012 Johannes Goetzfried * * + * Copyright © 2012 Jussi Kivilinna + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -22,7 +24,6 @@ */ .file "cast5-avx-x86_64-asm_64.S" -.text .extern cast5_s1 .extern cast5_s2 @@ -57,17 +58,19 @@ #define RX %xmm8 #define RKM %xmm9 -#define RKRF %xmm10 -#define RKRR %xmm11 +#define RKR %xmm10 +#define RKRF %xmm11 +#define RKRR %xmm12 + +#define R32 %xmm13 +#define R1ST %xmm14 -#define RTMP %xmm12 -#define RMASK %xmm13 -#define R32 %xmm14 +#define RTMP %xmm15 -#define RID1 %rax -#define RID1b %al -#define RID2 %rbx -#define RID2b %bl +#define RID1 %rbp +#define RID1d %ebp +#define RID2 %rsi +#define RID2d %esi #define RGI1 %rdx #define RGI1bl %dl @@ -76,6 +79,13 @@ #define RGI2bl %cl #define RGI2bh %ch +#define RGI3 %rax +#define RGI3bl %al +#define RGI3bh %ah +#define RGI4 %rbx +#define RGI4bl %bl +#define RGI4bh %bh + #define RFS1 %r8 #define RFS1d %r8d #define RFS2 %r9 @@ -84,60 +94,84 @@ #define RFS3d %r10d -#define lookup_32bit(src, dst, op1, op2, op3) \ - movb src ## bl, RID1b; \ - movb src ## bh, RID2b; \ +#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \ + movzbl src ## bh, RID1d; \ + movzbl src ## bl, RID2d; \ + shrq $16, src; \ movl s1(, RID1, 4), dst ## d; \ op1 s2(, RID2, 4), dst ## d; \ - shrq $16, src; \ - movb src ## bl, RID1b; \ - movb src ## bh, RID2b; \ + movzbl src ## bh, RID1d; \ + movzbl src ## bl, RID2d; \ + interleave_op(il_reg); \ op2 s3(, RID1, 4), dst ## d; \ op3 s4(, RID2, 4), dst ## d; -#define F(a, x, op0, op1, op2, op3) \ +#define dummy(d) /* do nothing */ + +#define shr_next(reg) \ + shrq $16, reg; + +#define F_head(a, x, gi1, gi2, op0) \ op0 a, RKM, x; \ - vpslld RKRF, x, RTMP; \ - vpsrld RKRR, x, x; \ + vpslld RKRF, x, RTMP; \ + vpsrld RKRR, x, x; \ vpor RTMP, x, x; \ \ - vpshufb RMASK, x, x; \ - vmovq x, RGI1; \ - vpsrldq $8, x, x; \ - vmovq x, RGI2; \ - \ - lookup_32bit(RGI1, RFS1, op1, op2, op3); \ - shrq $16, RGI1; \ - lookup_32bit(RGI1, RFS2, op1, op2, op3); \ - shlq $32, RFS2; \ - orq RFS1, RFS2; \ + vmovq x, gi1; \ + vpextrq $1, x, gi2; + +#define F_tail(a, x, gi1, gi2, op1, op2, op3) \ + lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \ + lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \ \ - lookup_32bit(RGI2, RFS1, op1, op2, op3); \ - shrq $16, RGI2; \ - lookup_32bit(RGI2, RFS3, op1, op2, op3); \ - shlq $32, RFS3; \ - orq 
RFS1, RFS3; \ + lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \ + shlq $32, RFS2; \ + orq RFS1, RFS2; \ + lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \ + shlq $32, RFS1; \ + orq RFS1, RFS3; \ \ - vmovq RFS2, x; \ + vmovq RFS2, x; \ vpinsrq $1, RFS3, x, x; -#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl) -#define F2(b, x) F(b, x, vpxor, subl, addl, xorl) -#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl) +#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \ + F_head(b1, RX, RGI1, RGI2, op0); \ + F_head(b2, RX, RGI3, RGI4, op0); \ + \ + F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \ + F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \ + \ + vpxor a1, RX, a1; \ + vpxor a2, RTMP, a2; + +#define F1_2(a1, b1, a2, b2) \ + F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl) +#define F2_2(a1, b1, a2, b2) \ + F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl) +#define F3_2(a1, b1, a2, b2) \ + F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl) -#define subround(a, b, x, n, f) \ - F ## f(b, x); \ - vpxor a, x, a; +#define subround(a1, b1, a2, b2, f) \ + F ## f ## _2(a1, b1, a2, b2); #define round(l, r, n, f) \ vbroadcastss (km+(4*n))(CTX), RKM; \ - vpinsrb $0, (kr+n)(CTX), RKRF, RKRF; \ + vpand R1ST, RKR, RKRF; \ vpsubq RKRF, R32, RKRR; \ - subround(l ## 1, r ## 1, RX, n, f); \ - subround(l ## 2, r ## 2, RX, n, f); \ - subround(l ## 3, r ## 3, RX, n, f); \ - subround(l ## 4, r ## 4, RX, n, f); + vpsrldq $1, RKR, RKR; \ + subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \ + subround(l ## 3, r ## 3, l ## 4, r ## 4, f); + +#define enc_preload_rkr() \ + vbroadcastss .L16_mask, RKR; \ + /* add 16-bit rotation to key rotations (mod 32) */ \ + vpxor kr(CTX), RKR, RKR; +#define dec_preload_rkr() \ + vbroadcastss .L16_mask, RKR; \ + /* add 16-bit rotation to key rotations (mod 32) */ \ + vpxor kr(CTX), RKR, RKR; \ + vpshufb .Lbswap128_mask, RKR, RKR; #define transpose_2x4(x0, x1, t0, t1) \ vpunpckldq x1, x0, t0; \ @@ -146,37 +180,47 @@ vpunpcklqdq t1, t0, x0; \ vpunpckhqdq t1, t0, x1; -#define inpack_blocks(in, x0, x1, t0, t1) \ +#define inpack_blocks(in, x0, x1, t0, t1, rmask) \ vmovdqu (0*4*4)(in), x0; \ vmovdqu (1*4*4)(in), x1; \ - vpshufb RMASK, x0, x0; \ - vpshufb RMASK, x1, x1; \ + vpshufb rmask, x0, x0; \ + vpshufb rmask, x1, x1; \ \ transpose_2x4(x0, x1, t0, t1) -#define outunpack_blocks(out, x0, x1, t0, t1) \ +#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \ transpose_2x4(x0, x1, t0, t1) \ \ - vpshufb RMASK, x0, x0; \ - vpshufb RMASK, x1, x1; \ + vpshufb rmask, x0, x0; \ + vpshufb rmask, x1, x1; \ vmovdqu x0, (0*4*4)(out); \ vmovdqu x1, (1*4*4)(out); -#define outunpack_xor_blocks(out, x0, x1, t0, t1) \ +#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \ transpose_2x4(x0, x1, t0, t1) \ \ - vpshufb RMASK, x0, x0; \ - vpshufb RMASK, x1, x1; \ + vpshufb rmask, x0, x0; \ + vpshufb rmask, x1, x1; \ vpxor (0*4*4)(out), x0, x0; \ vmovdqu x0, (0*4*4)(out); \ vpxor (1*4*4)(out), x1, x1; \ vmovdqu x1, (1*4*4)(out); +.data + .align 16 .Lbswap_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.L16_mask: + .byte 16, 16, 16, 16 .L32_mask: - .byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0 + .byte 32, 0, 0, 0 +.Lfirst_mask: + .byte 0x1f, 0, 0, 0 + +.text .align 16 .global __cast5_enc_blk_16way @@ -190,23 +234,24 @@ __cast5_enc_blk_16way: * %rcx: bool, if true: xor output */ + pushq %rbp; pushq %rbx; pushq %rcx; - vmovdqu .Lbswap_mask, RMASK; - vmovdqu .L32_mask, R32; - vpxor RKRF, RKRF, RKRF; + vmovdqa 
.Lbswap_mask, RKM; + vmovd .Lfirst_mask, R1ST; + vmovd .L32_mask, R32; + enc_preload_rkr(); - inpack_blocks(%rdx, RL1, RR1, RTMP, RX); - leaq (2*4*4)(%rdx), %rax; - inpack_blocks(%rax, RL2, RR2, RTMP, RX); - leaq (2*4*4)(%rax), %rax; - inpack_blocks(%rax, RL3, RR3, RTMP, RX); - leaq (2*4*4)(%rax), %rax; - inpack_blocks(%rax, RL4, RR4, RTMP, RX); + leaq 1*(2*4*4)(%rdx), %rax; + inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); + inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); + leaq 2*(2*4*4)(%rdx), %rax; + inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM); + leaq 3*(2*4*4)(%rdx), %rax; + inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM); - xorq RID1, RID1; - xorq RID2, RID2; + movq %rsi, %r11; round(RL, RR, 0, 1); round(RR, RL, 1, 2); @@ -221,8 +266,8 @@ __cast5_enc_blk_16way: round(RL, RR, 10, 2); round(RR, RL, 11, 3); - movb rr(CTX), %al; - testb %al, %al; + movzbl rr(CTX), %eax; + testl %eax, %eax; jnz __skip_enc; round(RL, RR, 12, 1); @@ -233,28 +278,30 @@ __cast5_enc_blk_16way: __skip_enc: popq %rcx; popq %rbx; + popq %rbp; + + vmovdqa .Lbswap_mask, RKM; + leaq 1*(2*4*4)(%r11), %rax; testb %cl, %cl; jnz __enc_xor16; - outunpack_blocks(%rsi, RR1, RL1, RTMP, RX); - leaq (2*4*4)(%rsi), %rax; - outunpack_blocks(%rax, RR2, RL2, RTMP, RX); - leaq (2*4*4)(%rax), %rax; - outunpack_blocks(%rax, RR3, RL3, RTMP, RX); - leaq (2*4*4)(%rax), %rax; - outunpack_blocks(%rax, RR4, RL4, RTMP, RX); + outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); + outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); + leaq 2*(2*4*4)(%r11), %rax; + outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM); + leaq 3*(2*4*4)(%r11), %rax; + outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM); ret; __enc_xor16: - outunpack_xor_blocks(%rsi, RR1, RL1, RTMP, RX); - leaq (2*4*4)(%rsi), %rax; - outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX); - leaq (2*4*4)(%rax), %rax; - outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX); - leaq (2*4*4)(%rax), %rax; - outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX); + outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM); + outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM); + leaq 2*(2*4*4)(%r11), %rax; + outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM); + leaq 3*(2*4*4)(%r11), %rax; + outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM); ret; @@ -269,25 +316,26 @@ cast5_dec_blk_16way: * %rdx: src */ + pushq %rbp; pushq %rbx; - vmovdqu .Lbswap_mask, RMASK; - vmovdqu .L32_mask, R32; - vpxor RKRF, RKRF, RKRF; + vmovdqa .Lbswap_mask, RKM; + vmovd .Lfirst_mask, R1ST; + vmovd .L32_mask, R32; + dec_preload_rkr(); - inpack_blocks(%rdx, RL1, RR1, RTMP, RX); - leaq (2*4*4)(%rdx), %rax; - inpack_blocks(%rax, RL2, RR2, RTMP, RX); - leaq (2*4*4)(%rax), %rax; - inpack_blocks(%rax, RL3, RR3, RTMP, RX); - leaq (2*4*4)(%rax), %rax; - inpack_blocks(%rax, RL4, RR4, RTMP, RX); + leaq 1*(2*4*4)(%rdx), %rax; + inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); + inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); + leaq 2*(2*4*4)(%rdx), %rax; + inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM); + leaq 3*(2*4*4)(%rdx), %rax; + inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM); - xorq RID1, RID1; - xorq RID2, RID2; + movq %rsi, %r11; - movb rr(CTX), %al; - testb %al, %al; + movzbl rr(CTX), %eax; + testl %eax, %eax; jnz __skip_dec; round(RL, RR, 15, 1); @@ -295,7 +343,7 @@ cast5_dec_blk_16way: round(RL, RR, 13, 2); round(RR, RL, 12, 1); -__skip_dec: +__dec_tail: round(RL, RR, 11, 3); round(RR, RL, 10, 2); round(RL, RR, 9, 1); @@ -309,14 +357,20 @@ __skip_dec: round(RL, RR, 1, 2); round(RR, RL, 0, 1); + vmovdqa .Lbswap_mask, RKM; popq %rbx; + popq %rbp; - 
outunpack_blocks(%rsi, RR1, RL1, RTMP, RX); - leaq (2*4*4)(%rsi), %rax; - outunpack_blocks(%rax, RR2, RL2, RTMP, RX); - leaq (2*4*4)(%rax), %rax; - outunpack_blocks(%rax, RR3, RL3, RTMP, RX); - leaq (2*4*4)(%rax), %rax; - outunpack_blocks(%rax, RR4, RL4, RTMP, RX); + leaq 1*(2*4*4)(%r11), %rax; + outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); + outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); + leaq 2*(2*4*4)(%r11), %rax; + outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM); + leaq 3*(2*4*4)(%r11), %rax; + outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM); ret; + +__skip_dec: + vpsrldq $4, RKR, RKR; + jmp __dec_tail; -- cgit v1.1 From c09220e1bc97d83cae445cab8dcb057fabd62361 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 28 Aug 2012 14:24:54 +0300 Subject: crypto: cast6-avx - tune assembler code for more performance Patch replaces 'movb' instructions with 'movzbl' to break false register dependencies, interleaves instructions better for out-of-order scheduling and merges constant 16-bit rotation with round-key variable rotation. tcrypt ECB results: Intel Core i5-2450M: size old-vs-new new-vs-generic old-vs-generic enc dec enc dec enc dec 256 1.13x 1.19x 2.05x 2.17x 1.82x 1.82x 1k 1.18x 1.21x 2.26x 2.33x 1.93x 1.93x 8k 1.19x 1.19x 2.32x 2.33x 1.95x 1.95x [v2] - Do instruction interleaving another way to avoid adding new FPU<=>CPU register moves as these cause performance drop on Bulldozer. - Improvements to round-key variable rotation handling. - Further interleaving improvements for better out-of-order scheduling. Cc: Johannes Goetzfried Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 276 ++++++++++++++++++------------ 1 file changed, 162 insertions(+), 114 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index d258ce0..218d283 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -4,6 +4,8 @@ * Copyright (C) 2012 Johannes Goetzfried * * + * Copyright © 2012 Jussi Kivilinna + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -22,7 +24,6 @@ */ .file "cast6-avx-x86_64-asm_64.S" -.text .extern cast6_s1 .extern cast6_s2 @@ -54,20 +55,21 @@ #define RC2 %xmm6 #define RD2 %xmm7 -#define RX %xmm8 +#define RX %xmm8 #define RKM %xmm9 -#define RKRF %xmm10 -#define RKRR %xmm11 +#define RKR %xmm10 +#define RKRF %xmm11 +#define RKRR %xmm12 +#define R32 %xmm13 +#define R1ST %xmm14 -#define RTMP %xmm12 -#define RMASK %xmm13 -#define R32 %xmm14 +#define RTMP %xmm15 -#define RID1 %rax -#define RID1b %al -#define RID2 %rbx -#define RID2b %bl +#define RID1 %rbp +#define RID1d %ebp +#define RID2 %rsi +#define RID2d %esi #define RGI1 %rdx #define RGI1bl %dl @@ -76,6 +78,13 @@ #define RGI2bl %cl #define RGI2bh %ch +#define RGI3 %rax +#define RGI3bl %al +#define RGI3bh %ah +#define RGI4 %rbx +#define RGI4bl %bl +#define RGI4bh %bh + #define RFS1 %r8 #define RFS1d %r8d #define RFS2 %r9 @@ -84,95 +93,106 @@ #define RFS3d %r10d -#define lookup_32bit(src, dst, op1, op2, op3) \ - movb src ## bl, RID1b; \ - movb src ## bh, RID2b; \ +#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \ + movzbl src ## bh, RID1d; \ + movzbl src ## bl, RID2d; \ + shrq $16, src; \ movl s1(, RID1, 4), dst ## d; \ op1 s2(, RID2, 4), dst ## d; \ - shrq $16, src; 
\ - movb src ## bl, RID1b; \ - movb src ## bh, RID2b; \ + movzbl src ## bh, RID1d; \ + movzbl src ## bl, RID2d; \ + interleave_op(il_reg); \ op2 s3(, RID1, 4), dst ## d; \ op3 s4(, RID2, 4), dst ## d; -#define F(a, x, op0, op1, op2, op3) \ +#define dummy(d) /* do nothing */ + +#define shr_next(reg) \ + shrq $16, reg; + +#define F_head(a, x, gi1, gi2, op0) \ op0 a, RKM, x; \ - vpslld RKRF, x, RTMP; \ - vpsrld RKRR, x, x; \ + vpslld RKRF, x, RTMP; \ + vpsrld RKRR, x, x; \ vpor RTMP, x, x; \ \ - vpshufb RMASK, x, x; \ - vmovq x, RGI1; \ - vpsrldq $8, x, x; \ - vmovq x, RGI2; \ - \ - lookup_32bit(RGI1, RFS1, op1, op2, op3); \ - shrq $16, RGI1; \ - lookup_32bit(RGI1, RFS2, op1, op2, op3); \ - shlq $32, RFS2; \ - orq RFS1, RFS2; \ + vmovq x, gi1; \ + vpextrq $1, x, gi2; + +#define F_tail(a, x, gi1, gi2, op1, op2, op3) \ + lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \ + lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \ \ - lookup_32bit(RGI2, RFS1, op1, op2, op3); \ - shrq $16, RGI2; \ - lookup_32bit(RGI2, RFS3, op1, op2, op3); \ - shlq $32, RFS3; \ - orq RFS1, RFS3; \ + lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \ + shlq $32, RFS2; \ + orq RFS1, RFS2; \ + lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \ + shlq $32, RFS1; \ + orq RFS1, RFS3; \ \ - vmovq RFS2, x; \ + vmovq RFS2, x; \ vpinsrq $1, RFS3, x, x; -#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl) -#define F2(b, x) F(b, x, vpxor, subl, addl, xorl) -#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl) +#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \ + F_head(b1, RX, RGI1, RGI2, op0); \ + F_head(b2, RX, RGI3, RGI4, op0); \ + \ + F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \ + F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \ + \ + vpxor a1, RX, a1; \ + vpxor a2, RTMP, a2; + +#define F1_2(a1, b1, a2, b2) \ + F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl) +#define F2_2(a1, b1, a2, b2) \ + F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl) +#define F3_2(a1, b1, a2, b2) \ + F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl) -#define qop(in, out, x, f) \ - F ## f(in ## 1, x); \ - vpxor out ## 1, x, out ## 1; \ - F ## f(in ## 2, x); \ - vpxor out ## 2, x, out ## 2; \ +#define qop(in, out, f) \ + F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2); + +#define get_round_keys(nn) \ + vbroadcastss (km+(4*(nn)))(CTX), RKM; \ + vpand R1ST, RKR, RKRF; \ + vpsubq RKRF, R32, RKRR; \ + vpsrldq $1, RKR, RKR; #define Q(n) \ - vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \ - vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \ - vpsubq RKRF, R32, RKRR; \ - qop(RD, RC, RX, 1); \ + get_round_keys(4*n+0); \ + qop(RD, RC, 1); \ \ - vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \ - vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \ - vpsubq RKRF, R32, RKRR; \ - qop(RC, RB, RX, 2); \ + get_round_keys(4*n+1); \ + qop(RC, RB, 2); \ \ - vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \ - vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \ - vpsubq RKRF, R32, RKRR; \ - qop(RB, RA, RX, 3); \ + get_round_keys(4*n+2); \ + qop(RB, RA, 3); \ \ - vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \ - vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \ - vpsubq RKRF, R32, RKRR; \ - qop(RA, RD, RX, 1); + get_round_keys(4*n+3); \ + qop(RA, RD, 1); #define QBAR(n) \ - vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \ - vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \ - vpsubq RKRF, R32, RKRR; \ - qop(RA, RD, RX, 1); \ + get_round_keys(4*n+3); \ + qop(RA, RD, 1); \ \ - vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \ - vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \ - vpsubq RKRF, R32, RKRR; \ - qop(RB, RA, RX, 3); \ + 
get_round_keys(4*n+2); \ + qop(RB, RA, 3); \ \ - vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \ - vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \ - vpsubq RKRF, R32, RKRR; \ - qop(RC, RB, RX, 2); \ + get_round_keys(4*n+1); \ + qop(RC, RB, 2); \ \ - vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \ - vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \ - vpsubq RKRF, R32, RKRR; \ - qop(RD, RC, RX, 1); + get_round_keys(4*n+0); \ + qop(RD, RC, 1); + +#define shuffle(mask) \ + vpshufb mask, RKR, RKR; +#define preload_rkr(n, do_mask, mask) \ + vbroadcastss .L16_mask, RKR; \ + /* add 16-bit rotation to key rotations (mod 32) */ \ + vpxor (kr+n*16)(CTX), RKR, RKR; \ + do_mask(mask); #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ vpunpckldq x1, x0, t0; \ @@ -185,37 +205,37 @@ vpunpcklqdq x3, t2, x2; \ vpunpckhqdq x3, t2, x3; -#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ +#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \ vmovdqu (0*4*4)(in), x0; \ vmovdqu (1*4*4)(in), x1; \ vmovdqu (2*4*4)(in), x2; \ vmovdqu (3*4*4)(in), x3; \ - vpshufb RMASK, x0, x0; \ - vpshufb RMASK, x1, x1; \ - vpshufb RMASK, x2, x2; \ - vpshufb RMASK, x3, x3; \ + vpshufb rmask, x0, x0; \ + vpshufb rmask, x1, x1; \ + vpshufb rmask, x2, x2; \ + vpshufb rmask, x3, x3; \ \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ +#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ \ - vpshufb RMASK, x0, x0; \ - vpshufb RMASK, x1, x1; \ - vpshufb RMASK, x2, x2; \ - vpshufb RMASK, x3, x3; \ + vpshufb rmask, x0, x0; \ + vpshufb rmask, x1, x1; \ + vpshufb rmask, x2, x2; \ + vpshufb rmask, x3, x3; \ vmovdqu x0, (0*4*4)(out); \ vmovdqu x1, (1*4*4)(out); \ vmovdqu x2, (2*4*4)(out); \ vmovdqu x3, (3*4*4)(out); -#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ +#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ \ - vpshufb RMASK, x0, x0; \ - vpshufb RMASK, x1, x1; \ - vpshufb RMASK, x2, x2; \ - vpshufb RMASK, x3, x3; \ + vpshufb rmask, x0, x0; \ + vpshufb rmask, x1, x1; \ + vpshufb rmask, x2, x2; \ + vpshufb rmask, x3, x3; \ vpxor (0*4*4)(out), x0, x0; \ vmovdqu x0, (0*4*4)(out); \ vpxor (1*4*4)(out), x1, x1; \ @@ -225,11 +245,29 @@ vpxor (3*4*4)(out), x3, x3; \ vmovdqu x3, (3*4*4)(out); +.data + .align 16 .Lbswap_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.Lrkr_enc_Q_Q_QBAR_QBAR: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12 +.Lrkr_enc_QBAR_QBAR_QBAR_QBAR: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.Lrkr_dec_Q_Q_Q_Q: + .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 +.Lrkr_dec_Q_Q_QBAR_QBAR: + .byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0 +.Lrkr_dec_QBAR_QBAR_QBAR_QBAR: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.L16_mask: + .byte 16, 16, 16, 16 .L32_mask: - .byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0 + .byte 32, 0, 0, 0 +.Lfirst_mask: + .byte 0x1f, 0, 0, 0 + +.text .align 16 .global __cast6_enc_blk_8way @@ -243,28 +281,31 @@ __cast6_enc_blk_8way: * %rcx: bool, if true: xor output */ + pushq %rbp; pushq %rbx; pushq %rcx; - vmovdqu .Lbswap_mask, RMASK; - vmovdqu .L32_mask, R32; - vpxor RKRF, RKRF, RKRF; + vmovdqa .Lbswap_mask, RKM; + vmovd .Lfirst_mask, R1ST; + vmovd .L32_mask, R32; leaq (4*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM); - inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + inpack_blocks(%rdx, 
RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); + inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - xorq RID1, RID1; - xorq RID2, RID2; + movq %rsi, %r11; + preload_rkr(0, dummy, none); Q(0); Q(1); Q(2); Q(3); + preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR); Q(4); Q(5); QBAR(6); QBAR(7); + preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR); QBAR(8); QBAR(9); QBAR(10); @@ -272,20 +313,22 @@ __cast6_enc_blk_8way: popq %rcx; popq %rbx; + popq %rbp; - leaq (4*4*4)(%rsi), %rax; + vmovdqa .Lbswap_mask, RKM; + leaq (4*4*4)(%r11), %rax; testb %cl, %cl; jnz __enc_xor8; - outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); - outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); + outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); ret; __enc_xor8: - outunpack_xor_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); - outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); + outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); ret; @@ -300,36 +343,41 @@ cast6_dec_blk_8way: * %rdx: src */ + pushq %rbp; pushq %rbx; - vmovdqu .Lbswap_mask, RMASK; - vmovdqu .L32_mask, R32; - vpxor RKRF, RKRF, RKRF; + vmovdqa .Lbswap_mask, RKM; + vmovd .Lfirst_mask, R1ST; + vmovd .L32_mask, R32; leaq (4*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM); - inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); + inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - xorq RID1, RID1; - xorq RID2, RID2; + movq %rsi, %r11; + preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q); Q(11); Q(10); Q(9); Q(8); + preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR); Q(7); Q(6); QBAR(5); QBAR(4); + preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR); QBAR(3); QBAR(2); QBAR(1); QBAR(0); popq %rbx; + popq %rbp; - leaq (4*4*4)(%rsi), %rax; - outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); - outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); + vmovdqa .Lbswap_mask, RKM; + leaq (4*4*4)(%r11), %rax; + outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); + outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); ret; -- cgit v1.1 From 1ffb72a39a92c1a03b3c732280e924c02c509cd3 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 28 Aug 2012 16:46:59 +0300 Subject: crypto: camellia-x86_64 - fix sparse warnings (constant is so big) Fix "constant 0xXXXXXXXXXXXXXXXX is so big it's unsigned long" sparse warnings. 
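For illustration, a minimal stand-alone example of the warning class being fixed (hypothetical table name, not the camellia code): an unsuffixed constant wider than 32 bits gets its integer type chosen implicitly, which sparse reports as "constant ... is so big it's unsigned long"; appending ULL states the intended unsigned 64-bit type explicitly, as the patch does for every camellia sbox entry.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t u64;

    /* sparse: constant 0x7000007070707000 is so big it's unsigned long */
    static const u64 sbox_before[1] = { 0x7000007070707000 };

    /* explicit suffix, no warning */
    static const u64 sbox_after[1] = { 0x7000007070707000ULL };

    int main(void)
    {
            printf("%llx %llx\n",
                   (unsigned long long)sbox_before[0],
                   (unsigned long long)sbox_after[0]);
            return 0;
    }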
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia_glue.c | 1376 +++++++++++++++++++-------------------- 1 file changed, 688 insertions(+), 688 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 7a74d7b..42ffd2b 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -92,715 +92,715 @@ static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) /* camellia sboxes */ const u64 camellia_sp10011110[256] = { - 0x7000007070707000, 0x8200008282828200, 0x2c00002c2c2c2c00, - 0xec0000ecececec00, 0xb30000b3b3b3b300, 0x2700002727272700, - 0xc00000c0c0c0c000, 0xe50000e5e5e5e500, 0xe40000e4e4e4e400, - 0x8500008585858500, 0x5700005757575700, 0x3500003535353500, - 0xea0000eaeaeaea00, 0x0c00000c0c0c0c00, 0xae0000aeaeaeae00, - 0x4100004141414100, 0x2300002323232300, 0xef0000efefefef00, - 0x6b00006b6b6b6b00, 0x9300009393939300, 0x4500004545454500, - 0x1900001919191900, 0xa50000a5a5a5a500, 0x2100002121212100, - 0xed0000edededed00, 0x0e00000e0e0e0e00, 0x4f00004f4f4f4f00, - 0x4e00004e4e4e4e00, 0x1d00001d1d1d1d00, 0x6500006565656500, - 0x9200009292929200, 0xbd0000bdbdbdbd00, 0x8600008686868600, - 0xb80000b8b8b8b800, 0xaf0000afafafaf00, 0x8f00008f8f8f8f00, - 0x7c00007c7c7c7c00, 0xeb0000ebebebeb00, 0x1f00001f1f1f1f00, - 0xce0000cececece00, 0x3e00003e3e3e3e00, 0x3000003030303000, - 0xdc0000dcdcdcdc00, 0x5f00005f5f5f5f00, 0x5e00005e5e5e5e00, - 0xc50000c5c5c5c500, 0x0b00000b0b0b0b00, 0x1a00001a1a1a1a00, - 0xa60000a6a6a6a600, 0xe10000e1e1e1e100, 0x3900003939393900, - 0xca0000cacacaca00, 0xd50000d5d5d5d500, 0x4700004747474700, - 0x5d00005d5d5d5d00, 0x3d00003d3d3d3d00, 0xd90000d9d9d9d900, - 0x0100000101010100, 0x5a00005a5a5a5a00, 0xd60000d6d6d6d600, - 0x5100005151515100, 0x5600005656565600, 0x6c00006c6c6c6c00, - 0x4d00004d4d4d4d00, 0x8b00008b8b8b8b00, 0x0d00000d0d0d0d00, - 0x9a00009a9a9a9a00, 0x6600006666666600, 0xfb0000fbfbfbfb00, - 0xcc0000cccccccc00, 0xb00000b0b0b0b000, 0x2d00002d2d2d2d00, - 0x7400007474747400, 0x1200001212121200, 0x2b00002b2b2b2b00, - 0x2000002020202000, 0xf00000f0f0f0f000, 0xb10000b1b1b1b100, - 0x8400008484848400, 0x9900009999999900, 0xdf0000dfdfdfdf00, - 0x4c00004c4c4c4c00, 0xcb0000cbcbcbcb00, 0xc20000c2c2c2c200, - 0x3400003434343400, 0x7e00007e7e7e7e00, 0x7600007676767600, - 0x0500000505050500, 0x6d00006d6d6d6d00, 0xb70000b7b7b7b700, - 0xa90000a9a9a9a900, 0x3100003131313100, 0xd10000d1d1d1d100, - 0x1700001717171700, 0x0400000404040400, 0xd70000d7d7d7d700, - 0x1400001414141400, 0x5800005858585800, 0x3a00003a3a3a3a00, - 0x6100006161616100, 0xde0000dededede00, 0x1b00001b1b1b1b00, - 0x1100001111111100, 0x1c00001c1c1c1c00, 0x3200003232323200, - 0x0f00000f0f0f0f00, 0x9c00009c9c9c9c00, 0x1600001616161600, - 0x5300005353535300, 0x1800001818181800, 0xf20000f2f2f2f200, - 0x2200002222222200, 0xfe0000fefefefe00, 0x4400004444444400, - 0xcf0000cfcfcfcf00, 0xb20000b2b2b2b200, 0xc30000c3c3c3c300, - 0xb50000b5b5b5b500, 0x7a00007a7a7a7a00, 0x9100009191919100, - 0x2400002424242400, 0x0800000808080800, 0xe80000e8e8e8e800, - 0xa80000a8a8a8a800, 0x6000006060606000, 0xfc0000fcfcfcfc00, - 0x6900006969696900, 0x5000005050505000, 0xaa0000aaaaaaaa00, - 0xd00000d0d0d0d000, 0xa00000a0a0a0a000, 0x7d00007d7d7d7d00, - 0xa10000a1a1a1a100, 0x8900008989898900, 0x6200006262626200, - 0x9700009797979700, 0x5400005454545400, 0x5b00005b5b5b5b00, - 0x1e00001e1e1e1e00, 0x9500009595959500, 0xe00000e0e0e0e000, - 0xff0000ffffffff00, 0x6400006464646400, 0xd20000d2d2d2d200, - 
0x1000001010101000, 0xc40000c4c4c4c400, 0x0000000000000000, - 0x4800004848484800, 0xa30000a3a3a3a300, 0xf70000f7f7f7f700, - 0x7500007575757500, 0xdb0000dbdbdbdb00, 0x8a00008a8a8a8a00, - 0x0300000303030300, 0xe60000e6e6e6e600, 0xda0000dadadada00, - 0x0900000909090900, 0x3f00003f3f3f3f00, 0xdd0000dddddddd00, - 0x9400009494949400, 0x8700008787878700, 0x5c00005c5c5c5c00, - 0x8300008383838300, 0x0200000202020200, 0xcd0000cdcdcdcd00, - 0x4a00004a4a4a4a00, 0x9000009090909000, 0x3300003333333300, - 0x7300007373737300, 0x6700006767676700, 0xf60000f6f6f6f600, - 0xf30000f3f3f3f300, 0x9d00009d9d9d9d00, 0x7f00007f7f7f7f00, - 0xbf0000bfbfbfbf00, 0xe20000e2e2e2e200, 0x5200005252525200, - 0x9b00009b9b9b9b00, 0xd80000d8d8d8d800, 0x2600002626262600, - 0xc80000c8c8c8c800, 0x3700003737373700, 0xc60000c6c6c6c600, - 0x3b00003b3b3b3b00, 0x8100008181818100, 0x9600009696969600, - 0x6f00006f6f6f6f00, 0x4b00004b4b4b4b00, 0x1300001313131300, - 0xbe0000bebebebe00, 0x6300006363636300, 0x2e00002e2e2e2e00, - 0xe90000e9e9e9e900, 0x7900007979797900, 0xa70000a7a7a7a700, - 0x8c00008c8c8c8c00, 0x9f00009f9f9f9f00, 0x6e00006e6e6e6e00, - 0xbc0000bcbcbcbc00, 0x8e00008e8e8e8e00, 0x2900002929292900, - 0xf50000f5f5f5f500, 0xf90000f9f9f9f900, 0xb60000b6b6b6b600, - 0x2f00002f2f2f2f00, 0xfd0000fdfdfdfd00, 0xb40000b4b4b4b400, - 0x5900005959595900, 0x7800007878787800, 0x9800009898989800, - 0x0600000606060600, 0x6a00006a6a6a6a00, 0xe70000e7e7e7e700, - 0x4600004646464600, 0x7100007171717100, 0xba0000babababa00, - 0xd40000d4d4d4d400, 0x2500002525252500, 0xab0000abababab00, - 0x4200004242424200, 0x8800008888888800, 0xa20000a2a2a2a200, - 0x8d00008d8d8d8d00, 0xfa0000fafafafa00, 0x7200007272727200, - 0x0700000707070700, 0xb90000b9b9b9b900, 0x5500005555555500, - 0xf80000f8f8f8f800, 0xee0000eeeeeeee00, 0xac0000acacacac00, - 0x0a00000a0a0a0a00, 0x3600003636363600, 0x4900004949494900, - 0x2a00002a2a2a2a00, 0x6800006868686800, 0x3c00003c3c3c3c00, - 0x3800003838383800, 0xf10000f1f1f1f100, 0xa40000a4a4a4a400, - 0x4000004040404000, 0x2800002828282800, 0xd30000d3d3d3d300, - 0x7b00007b7b7b7b00, 0xbb0000bbbbbbbb00, 0xc90000c9c9c9c900, - 0x4300004343434300, 0xc10000c1c1c1c100, 0x1500001515151500, - 0xe30000e3e3e3e300, 0xad0000adadadad00, 0xf40000f4f4f4f400, - 0x7700007777777700, 0xc70000c7c7c7c700, 0x8000008080808000, - 0x9e00009e9e9e9e00, + 0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL, + 0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL, + 0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL, + 0x8500008585858500ULL, 0x5700005757575700ULL, 0x3500003535353500ULL, + 0xea0000eaeaeaea00ULL, 0x0c00000c0c0c0c00ULL, 0xae0000aeaeaeae00ULL, + 0x4100004141414100ULL, 0x2300002323232300ULL, 0xef0000efefefef00ULL, + 0x6b00006b6b6b6b00ULL, 0x9300009393939300ULL, 0x4500004545454500ULL, + 0x1900001919191900ULL, 0xa50000a5a5a5a500ULL, 0x2100002121212100ULL, + 0xed0000edededed00ULL, 0x0e00000e0e0e0e00ULL, 0x4f00004f4f4f4f00ULL, + 0x4e00004e4e4e4e00ULL, 0x1d00001d1d1d1d00ULL, 0x6500006565656500ULL, + 0x9200009292929200ULL, 0xbd0000bdbdbdbd00ULL, 0x8600008686868600ULL, + 0xb80000b8b8b8b800ULL, 0xaf0000afafafaf00ULL, 0x8f00008f8f8f8f00ULL, + 0x7c00007c7c7c7c00ULL, 0xeb0000ebebebeb00ULL, 0x1f00001f1f1f1f00ULL, + 0xce0000cececece00ULL, 0x3e00003e3e3e3e00ULL, 0x3000003030303000ULL, + 0xdc0000dcdcdcdc00ULL, 0x5f00005f5f5f5f00ULL, 0x5e00005e5e5e5e00ULL, + 0xc50000c5c5c5c500ULL, 0x0b00000b0b0b0b00ULL, 0x1a00001a1a1a1a00ULL, + 0xa60000a6a6a6a600ULL, 0xe10000e1e1e1e100ULL, 0x3900003939393900ULL, + 0xca0000cacacaca00ULL, 
0xd50000d5d5d5d500ULL, 0x4700004747474700ULL, + 0x5d00005d5d5d5d00ULL, 0x3d00003d3d3d3d00ULL, 0xd90000d9d9d9d900ULL, + 0x0100000101010100ULL, 0x5a00005a5a5a5a00ULL, 0xd60000d6d6d6d600ULL, + 0x5100005151515100ULL, 0x5600005656565600ULL, 0x6c00006c6c6c6c00ULL, + 0x4d00004d4d4d4d00ULL, 0x8b00008b8b8b8b00ULL, 0x0d00000d0d0d0d00ULL, + 0x9a00009a9a9a9a00ULL, 0x6600006666666600ULL, 0xfb0000fbfbfbfb00ULL, + 0xcc0000cccccccc00ULL, 0xb00000b0b0b0b000ULL, 0x2d00002d2d2d2d00ULL, + 0x7400007474747400ULL, 0x1200001212121200ULL, 0x2b00002b2b2b2b00ULL, + 0x2000002020202000ULL, 0xf00000f0f0f0f000ULL, 0xb10000b1b1b1b100ULL, + 0x8400008484848400ULL, 0x9900009999999900ULL, 0xdf0000dfdfdfdf00ULL, + 0x4c00004c4c4c4c00ULL, 0xcb0000cbcbcbcb00ULL, 0xc20000c2c2c2c200ULL, + 0x3400003434343400ULL, 0x7e00007e7e7e7e00ULL, 0x7600007676767600ULL, + 0x0500000505050500ULL, 0x6d00006d6d6d6d00ULL, 0xb70000b7b7b7b700ULL, + 0xa90000a9a9a9a900ULL, 0x3100003131313100ULL, 0xd10000d1d1d1d100ULL, + 0x1700001717171700ULL, 0x0400000404040400ULL, 0xd70000d7d7d7d700ULL, + 0x1400001414141400ULL, 0x5800005858585800ULL, 0x3a00003a3a3a3a00ULL, + 0x6100006161616100ULL, 0xde0000dededede00ULL, 0x1b00001b1b1b1b00ULL, + 0x1100001111111100ULL, 0x1c00001c1c1c1c00ULL, 0x3200003232323200ULL, + 0x0f00000f0f0f0f00ULL, 0x9c00009c9c9c9c00ULL, 0x1600001616161600ULL, + 0x5300005353535300ULL, 0x1800001818181800ULL, 0xf20000f2f2f2f200ULL, + 0x2200002222222200ULL, 0xfe0000fefefefe00ULL, 0x4400004444444400ULL, + 0xcf0000cfcfcfcf00ULL, 0xb20000b2b2b2b200ULL, 0xc30000c3c3c3c300ULL, + 0xb50000b5b5b5b500ULL, 0x7a00007a7a7a7a00ULL, 0x9100009191919100ULL, + 0x2400002424242400ULL, 0x0800000808080800ULL, 0xe80000e8e8e8e800ULL, + 0xa80000a8a8a8a800ULL, 0x6000006060606000ULL, 0xfc0000fcfcfcfc00ULL, + 0x6900006969696900ULL, 0x5000005050505000ULL, 0xaa0000aaaaaaaa00ULL, + 0xd00000d0d0d0d000ULL, 0xa00000a0a0a0a000ULL, 0x7d00007d7d7d7d00ULL, + 0xa10000a1a1a1a100ULL, 0x8900008989898900ULL, 0x6200006262626200ULL, + 0x9700009797979700ULL, 0x5400005454545400ULL, 0x5b00005b5b5b5b00ULL, + 0x1e00001e1e1e1e00ULL, 0x9500009595959500ULL, 0xe00000e0e0e0e000ULL, + 0xff0000ffffffff00ULL, 0x6400006464646400ULL, 0xd20000d2d2d2d200ULL, + 0x1000001010101000ULL, 0xc40000c4c4c4c400ULL, 0x0000000000000000ULL, + 0x4800004848484800ULL, 0xa30000a3a3a3a300ULL, 0xf70000f7f7f7f700ULL, + 0x7500007575757500ULL, 0xdb0000dbdbdbdb00ULL, 0x8a00008a8a8a8a00ULL, + 0x0300000303030300ULL, 0xe60000e6e6e6e600ULL, 0xda0000dadadada00ULL, + 0x0900000909090900ULL, 0x3f00003f3f3f3f00ULL, 0xdd0000dddddddd00ULL, + 0x9400009494949400ULL, 0x8700008787878700ULL, 0x5c00005c5c5c5c00ULL, + 0x8300008383838300ULL, 0x0200000202020200ULL, 0xcd0000cdcdcdcd00ULL, + 0x4a00004a4a4a4a00ULL, 0x9000009090909000ULL, 0x3300003333333300ULL, + 0x7300007373737300ULL, 0x6700006767676700ULL, 0xf60000f6f6f6f600ULL, + 0xf30000f3f3f3f300ULL, 0x9d00009d9d9d9d00ULL, 0x7f00007f7f7f7f00ULL, + 0xbf0000bfbfbfbf00ULL, 0xe20000e2e2e2e200ULL, 0x5200005252525200ULL, + 0x9b00009b9b9b9b00ULL, 0xd80000d8d8d8d800ULL, 0x2600002626262600ULL, + 0xc80000c8c8c8c800ULL, 0x3700003737373700ULL, 0xc60000c6c6c6c600ULL, + 0x3b00003b3b3b3b00ULL, 0x8100008181818100ULL, 0x9600009696969600ULL, + 0x6f00006f6f6f6f00ULL, 0x4b00004b4b4b4b00ULL, 0x1300001313131300ULL, + 0xbe0000bebebebe00ULL, 0x6300006363636300ULL, 0x2e00002e2e2e2e00ULL, + 0xe90000e9e9e9e900ULL, 0x7900007979797900ULL, 0xa70000a7a7a7a700ULL, + 0x8c00008c8c8c8c00ULL, 0x9f00009f9f9f9f00ULL, 0x6e00006e6e6e6e00ULL, + 0xbc0000bcbcbcbc00ULL, 0x8e00008e8e8e8e00ULL, 0x2900002929292900ULL, + 0xf50000f5f5f5f500ULL, 
0xf90000f9f9f9f900ULL, 0xb60000b6b6b6b600ULL, + 0x2f00002f2f2f2f00ULL, 0xfd0000fdfdfdfd00ULL, 0xb40000b4b4b4b400ULL, + 0x5900005959595900ULL, 0x7800007878787800ULL, 0x9800009898989800ULL, + 0x0600000606060600ULL, 0x6a00006a6a6a6a00ULL, 0xe70000e7e7e7e700ULL, + 0x4600004646464600ULL, 0x7100007171717100ULL, 0xba0000babababa00ULL, + 0xd40000d4d4d4d400ULL, 0x2500002525252500ULL, 0xab0000abababab00ULL, + 0x4200004242424200ULL, 0x8800008888888800ULL, 0xa20000a2a2a2a200ULL, + 0x8d00008d8d8d8d00ULL, 0xfa0000fafafafa00ULL, 0x7200007272727200ULL, + 0x0700000707070700ULL, 0xb90000b9b9b9b900ULL, 0x5500005555555500ULL, + 0xf80000f8f8f8f800ULL, 0xee0000eeeeeeee00ULL, 0xac0000acacacac00ULL, + 0x0a00000a0a0a0a00ULL, 0x3600003636363600ULL, 0x4900004949494900ULL, + 0x2a00002a2a2a2a00ULL, 0x6800006868686800ULL, 0x3c00003c3c3c3c00ULL, + 0x3800003838383800ULL, 0xf10000f1f1f1f100ULL, 0xa40000a4a4a4a400ULL, + 0x4000004040404000ULL, 0x2800002828282800ULL, 0xd30000d3d3d3d300ULL, + 0x7b00007b7b7b7b00ULL, 0xbb0000bbbbbbbb00ULL, 0xc90000c9c9c9c900ULL, + 0x4300004343434300ULL, 0xc10000c1c1c1c100ULL, 0x1500001515151500ULL, + 0xe30000e3e3e3e300ULL, 0xad0000adadadad00ULL, 0xf40000f4f4f4f400ULL, + 0x7700007777777700ULL, 0xc70000c7c7c7c700ULL, 0x8000008080808000ULL, + 0x9e00009e9e9e9e00ULL, }; const u64 camellia_sp22000222[256] = { - 0xe0e0000000e0e0e0, 0x0505000000050505, 0x5858000000585858, - 0xd9d9000000d9d9d9, 0x6767000000676767, 0x4e4e0000004e4e4e, - 0x8181000000818181, 0xcbcb000000cbcbcb, 0xc9c9000000c9c9c9, - 0x0b0b0000000b0b0b, 0xaeae000000aeaeae, 0x6a6a0000006a6a6a, - 0xd5d5000000d5d5d5, 0x1818000000181818, 0x5d5d0000005d5d5d, - 0x8282000000828282, 0x4646000000464646, 0xdfdf000000dfdfdf, - 0xd6d6000000d6d6d6, 0x2727000000272727, 0x8a8a0000008a8a8a, - 0x3232000000323232, 0x4b4b0000004b4b4b, 0x4242000000424242, - 0xdbdb000000dbdbdb, 0x1c1c0000001c1c1c, 0x9e9e0000009e9e9e, - 0x9c9c0000009c9c9c, 0x3a3a0000003a3a3a, 0xcaca000000cacaca, - 0x2525000000252525, 0x7b7b0000007b7b7b, 0x0d0d0000000d0d0d, - 0x7171000000717171, 0x5f5f0000005f5f5f, 0x1f1f0000001f1f1f, - 0xf8f8000000f8f8f8, 0xd7d7000000d7d7d7, 0x3e3e0000003e3e3e, - 0x9d9d0000009d9d9d, 0x7c7c0000007c7c7c, 0x6060000000606060, - 0xb9b9000000b9b9b9, 0xbebe000000bebebe, 0xbcbc000000bcbcbc, - 0x8b8b0000008b8b8b, 0x1616000000161616, 0x3434000000343434, - 0x4d4d0000004d4d4d, 0xc3c3000000c3c3c3, 0x7272000000727272, - 0x9595000000959595, 0xabab000000ababab, 0x8e8e0000008e8e8e, - 0xbaba000000bababa, 0x7a7a0000007a7a7a, 0xb3b3000000b3b3b3, - 0x0202000000020202, 0xb4b4000000b4b4b4, 0xadad000000adadad, - 0xa2a2000000a2a2a2, 0xacac000000acacac, 0xd8d8000000d8d8d8, - 0x9a9a0000009a9a9a, 0x1717000000171717, 0x1a1a0000001a1a1a, - 0x3535000000353535, 0xcccc000000cccccc, 0xf7f7000000f7f7f7, - 0x9999000000999999, 0x6161000000616161, 0x5a5a0000005a5a5a, - 0xe8e8000000e8e8e8, 0x2424000000242424, 0x5656000000565656, - 0x4040000000404040, 0xe1e1000000e1e1e1, 0x6363000000636363, - 0x0909000000090909, 0x3333000000333333, 0xbfbf000000bfbfbf, - 0x9898000000989898, 0x9797000000979797, 0x8585000000858585, - 0x6868000000686868, 0xfcfc000000fcfcfc, 0xecec000000ececec, - 0x0a0a0000000a0a0a, 0xdada000000dadada, 0x6f6f0000006f6f6f, - 0x5353000000535353, 0x6262000000626262, 0xa3a3000000a3a3a3, - 0x2e2e0000002e2e2e, 0x0808000000080808, 0xafaf000000afafaf, - 0x2828000000282828, 0xb0b0000000b0b0b0, 0x7474000000747474, - 0xc2c2000000c2c2c2, 0xbdbd000000bdbdbd, 0x3636000000363636, - 0x2222000000222222, 0x3838000000383838, 0x6464000000646464, - 0x1e1e0000001e1e1e, 0x3939000000393939, 0x2c2c0000002c2c2c, - 
0xa6a6000000a6a6a6, 0x3030000000303030, 0xe5e5000000e5e5e5, - 0x4444000000444444, 0xfdfd000000fdfdfd, 0x8888000000888888, - 0x9f9f0000009f9f9f, 0x6565000000656565, 0x8787000000878787, - 0x6b6b0000006b6b6b, 0xf4f4000000f4f4f4, 0x2323000000232323, - 0x4848000000484848, 0x1010000000101010, 0xd1d1000000d1d1d1, - 0x5151000000515151, 0xc0c0000000c0c0c0, 0xf9f9000000f9f9f9, - 0xd2d2000000d2d2d2, 0xa0a0000000a0a0a0, 0x5555000000555555, - 0xa1a1000000a1a1a1, 0x4141000000414141, 0xfafa000000fafafa, - 0x4343000000434343, 0x1313000000131313, 0xc4c4000000c4c4c4, - 0x2f2f0000002f2f2f, 0xa8a8000000a8a8a8, 0xb6b6000000b6b6b6, - 0x3c3c0000003c3c3c, 0x2b2b0000002b2b2b, 0xc1c1000000c1c1c1, - 0xffff000000ffffff, 0xc8c8000000c8c8c8, 0xa5a5000000a5a5a5, - 0x2020000000202020, 0x8989000000898989, 0x0000000000000000, - 0x9090000000909090, 0x4747000000474747, 0xefef000000efefef, - 0xeaea000000eaeaea, 0xb7b7000000b7b7b7, 0x1515000000151515, - 0x0606000000060606, 0xcdcd000000cdcdcd, 0xb5b5000000b5b5b5, - 0x1212000000121212, 0x7e7e0000007e7e7e, 0xbbbb000000bbbbbb, - 0x2929000000292929, 0x0f0f0000000f0f0f, 0xb8b8000000b8b8b8, - 0x0707000000070707, 0x0404000000040404, 0x9b9b0000009b9b9b, - 0x9494000000949494, 0x2121000000212121, 0x6666000000666666, - 0xe6e6000000e6e6e6, 0xcece000000cecece, 0xeded000000ededed, - 0xe7e7000000e7e7e7, 0x3b3b0000003b3b3b, 0xfefe000000fefefe, - 0x7f7f0000007f7f7f, 0xc5c5000000c5c5c5, 0xa4a4000000a4a4a4, - 0x3737000000373737, 0xb1b1000000b1b1b1, 0x4c4c0000004c4c4c, - 0x9191000000919191, 0x6e6e0000006e6e6e, 0x8d8d0000008d8d8d, - 0x7676000000767676, 0x0303000000030303, 0x2d2d0000002d2d2d, - 0xdede000000dedede, 0x9696000000969696, 0x2626000000262626, - 0x7d7d0000007d7d7d, 0xc6c6000000c6c6c6, 0x5c5c0000005c5c5c, - 0xd3d3000000d3d3d3, 0xf2f2000000f2f2f2, 0x4f4f0000004f4f4f, - 0x1919000000191919, 0x3f3f0000003f3f3f, 0xdcdc000000dcdcdc, - 0x7979000000797979, 0x1d1d0000001d1d1d, 0x5252000000525252, - 0xebeb000000ebebeb, 0xf3f3000000f3f3f3, 0x6d6d0000006d6d6d, - 0x5e5e0000005e5e5e, 0xfbfb000000fbfbfb, 0x6969000000696969, - 0xb2b2000000b2b2b2, 0xf0f0000000f0f0f0, 0x3131000000313131, - 0x0c0c0000000c0c0c, 0xd4d4000000d4d4d4, 0xcfcf000000cfcfcf, - 0x8c8c0000008c8c8c, 0xe2e2000000e2e2e2, 0x7575000000757575, - 0xa9a9000000a9a9a9, 0x4a4a0000004a4a4a, 0x5757000000575757, - 0x8484000000848484, 0x1111000000111111, 0x4545000000454545, - 0x1b1b0000001b1b1b, 0xf5f5000000f5f5f5, 0xe4e4000000e4e4e4, - 0x0e0e0000000e0e0e, 0x7373000000737373, 0xaaaa000000aaaaaa, - 0xf1f1000000f1f1f1, 0xdddd000000dddddd, 0x5959000000595959, - 0x1414000000141414, 0x6c6c0000006c6c6c, 0x9292000000929292, - 0x5454000000545454, 0xd0d0000000d0d0d0, 0x7878000000787878, - 0x7070000000707070, 0xe3e3000000e3e3e3, 0x4949000000494949, - 0x8080000000808080, 0x5050000000505050, 0xa7a7000000a7a7a7, - 0xf6f6000000f6f6f6, 0x7777000000777777, 0x9393000000939393, - 0x8686000000868686, 0x8383000000838383, 0x2a2a0000002a2a2a, - 0xc7c7000000c7c7c7, 0x5b5b0000005b5b5b, 0xe9e9000000e9e9e9, - 0xeeee000000eeeeee, 0x8f8f0000008f8f8f, 0x0101000000010101, - 0x3d3d0000003d3d3d, + 0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL, + 0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL, + 0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL, + 0x0b0b0000000b0b0bULL, 0xaeae000000aeaeaeULL, 0x6a6a0000006a6a6aULL, + 0xd5d5000000d5d5d5ULL, 0x1818000000181818ULL, 0x5d5d0000005d5d5dULL, + 0x8282000000828282ULL, 0x4646000000464646ULL, 0xdfdf000000dfdfdfULL, + 0xd6d6000000d6d6d6ULL, 0x2727000000272727ULL, 0x8a8a0000008a8a8aULL, 
+ 0x3232000000323232ULL, 0x4b4b0000004b4b4bULL, 0x4242000000424242ULL, + 0xdbdb000000dbdbdbULL, 0x1c1c0000001c1c1cULL, 0x9e9e0000009e9e9eULL, + 0x9c9c0000009c9c9cULL, 0x3a3a0000003a3a3aULL, 0xcaca000000cacacaULL, + 0x2525000000252525ULL, 0x7b7b0000007b7b7bULL, 0x0d0d0000000d0d0dULL, + 0x7171000000717171ULL, 0x5f5f0000005f5f5fULL, 0x1f1f0000001f1f1fULL, + 0xf8f8000000f8f8f8ULL, 0xd7d7000000d7d7d7ULL, 0x3e3e0000003e3e3eULL, + 0x9d9d0000009d9d9dULL, 0x7c7c0000007c7c7cULL, 0x6060000000606060ULL, + 0xb9b9000000b9b9b9ULL, 0xbebe000000bebebeULL, 0xbcbc000000bcbcbcULL, + 0x8b8b0000008b8b8bULL, 0x1616000000161616ULL, 0x3434000000343434ULL, + 0x4d4d0000004d4d4dULL, 0xc3c3000000c3c3c3ULL, 0x7272000000727272ULL, + 0x9595000000959595ULL, 0xabab000000abababULL, 0x8e8e0000008e8e8eULL, + 0xbaba000000bababaULL, 0x7a7a0000007a7a7aULL, 0xb3b3000000b3b3b3ULL, + 0x0202000000020202ULL, 0xb4b4000000b4b4b4ULL, 0xadad000000adadadULL, + 0xa2a2000000a2a2a2ULL, 0xacac000000acacacULL, 0xd8d8000000d8d8d8ULL, + 0x9a9a0000009a9a9aULL, 0x1717000000171717ULL, 0x1a1a0000001a1a1aULL, + 0x3535000000353535ULL, 0xcccc000000ccccccULL, 0xf7f7000000f7f7f7ULL, + 0x9999000000999999ULL, 0x6161000000616161ULL, 0x5a5a0000005a5a5aULL, + 0xe8e8000000e8e8e8ULL, 0x2424000000242424ULL, 0x5656000000565656ULL, + 0x4040000000404040ULL, 0xe1e1000000e1e1e1ULL, 0x6363000000636363ULL, + 0x0909000000090909ULL, 0x3333000000333333ULL, 0xbfbf000000bfbfbfULL, + 0x9898000000989898ULL, 0x9797000000979797ULL, 0x8585000000858585ULL, + 0x6868000000686868ULL, 0xfcfc000000fcfcfcULL, 0xecec000000ecececULL, + 0x0a0a0000000a0a0aULL, 0xdada000000dadadaULL, 0x6f6f0000006f6f6fULL, + 0x5353000000535353ULL, 0x6262000000626262ULL, 0xa3a3000000a3a3a3ULL, + 0x2e2e0000002e2e2eULL, 0x0808000000080808ULL, 0xafaf000000afafafULL, + 0x2828000000282828ULL, 0xb0b0000000b0b0b0ULL, 0x7474000000747474ULL, + 0xc2c2000000c2c2c2ULL, 0xbdbd000000bdbdbdULL, 0x3636000000363636ULL, + 0x2222000000222222ULL, 0x3838000000383838ULL, 0x6464000000646464ULL, + 0x1e1e0000001e1e1eULL, 0x3939000000393939ULL, 0x2c2c0000002c2c2cULL, + 0xa6a6000000a6a6a6ULL, 0x3030000000303030ULL, 0xe5e5000000e5e5e5ULL, + 0x4444000000444444ULL, 0xfdfd000000fdfdfdULL, 0x8888000000888888ULL, + 0x9f9f0000009f9f9fULL, 0x6565000000656565ULL, 0x8787000000878787ULL, + 0x6b6b0000006b6b6bULL, 0xf4f4000000f4f4f4ULL, 0x2323000000232323ULL, + 0x4848000000484848ULL, 0x1010000000101010ULL, 0xd1d1000000d1d1d1ULL, + 0x5151000000515151ULL, 0xc0c0000000c0c0c0ULL, 0xf9f9000000f9f9f9ULL, + 0xd2d2000000d2d2d2ULL, 0xa0a0000000a0a0a0ULL, 0x5555000000555555ULL, + 0xa1a1000000a1a1a1ULL, 0x4141000000414141ULL, 0xfafa000000fafafaULL, + 0x4343000000434343ULL, 0x1313000000131313ULL, 0xc4c4000000c4c4c4ULL, + 0x2f2f0000002f2f2fULL, 0xa8a8000000a8a8a8ULL, 0xb6b6000000b6b6b6ULL, + 0x3c3c0000003c3c3cULL, 0x2b2b0000002b2b2bULL, 0xc1c1000000c1c1c1ULL, + 0xffff000000ffffffULL, 0xc8c8000000c8c8c8ULL, 0xa5a5000000a5a5a5ULL, + 0x2020000000202020ULL, 0x8989000000898989ULL, 0x0000000000000000ULL, + 0x9090000000909090ULL, 0x4747000000474747ULL, 0xefef000000efefefULL, + 0xeaea000000eaeaeaULL, 0xb7b7000000b7b7b7ULL, 0x1515000000151515ULL, + 0x0606000000060606ULL, 0xcdcd000000cdcdcdULL, 0xb5b5000000b5b5b5ULL, + 0x1212000000121212ULL, 0x7e7e0000007e7e7eULL, 0xbbbb000000bbbbbbULL, + 0x2929000000292929ULL, 0x0f0f0000000f0f0fULL, 0xb8b8000000b8b8b8ULL, + 0x0707000000070707ULL, 0x0404000000040404ULL, 0x9b9b0000009b9b9bULL, + 0x9494000000949494ULL, 0x2121000000212121ULL, 0x6666000000666666ULL, + 0xe6e6000000e6e6e6ULL, 0xcece000000cececeULL, 0xeded000000edededULL, + 
0xe7e7000000e7e7e7ULL, 0x3b3b0000003b3b3bULL, 0xfefe000000fefefeULL, + 0x7f7f0000007f7f7fULL, 0xc5c5000000c5c5c5ULL, 0xa4a4000000a4a4a4ULL, + 0x3737000000373737ULL, 0xb1b1000000b1b1b1ULL, 0x4c4c0000004c4c4cULL, + 0x9191000000919191ULL, 0x6e6e0000006e6e6eULL, 0x8d8d0000008d8d8dULL, + 0x7676000000767676ULL, 0x0303000000030303ULL, 0x2d2d0000002d2d2dULL, + 0xdede000000dededeULL, 0x9696000000969696ULL, 0x2626000000262626ULL, + 0x7d7d0000007d7d7dULL, 0xc6c6000000c6c6c6ULL, 0x5c5c0000005c5c5cULL, + 0xd3d3000000d3d3d3ULL, 0xf2f2000000f2f2f2ULL, 0x4f4f0000004f4f4fULL, + 0x1919000000191919ULL, 0x3f3f0000003f3f3fULL, 0xdcdc000000dcdcdcULL, + 0x7979000000797979ULL, 0x1d1d0000001d1d1dULL, 0x5252000000525252ULL, + 0xebeb000000ebebebULL, 0xf3f3000000f3f3f3ULL, 0x6d6d0000006d6d6dULL, + 0x5e5e0000005e5e5eULL, 0xfbfb000000fbfbfbULL, 0x6969000000696969ULL, + 0xb2b2000000b2b2b2ULL, 0xf0f0000000f0f0f0ULL, 0x3131000000313131ULL, + 0x0c0c0000000c0c0cULL, 0xd4d4000000d4d4d4ULL, 0xcfcf000000cfcfcfULL, + 0x8c8c0000008c8c8cULL, 0xe2e2000000e2e2e2ULL, 0x7575000000757575ULL, + 0xa9a9000000a9a9a9ULL, 0x4a4a0000004a4a4aULL, 0x5757000000575757ULL, + 0x8484000000848484ULL, 0x1111000000111111ULL, 0x4545000000454545ULL, + 0x1b1b0000001b1b1bULL, 0xf5f5000000f5f5f5ULL, 0xe4e4000000e4e4e4ULL, + 0x0e0e0000000e0e0eULL, 0x7373000000737373ULL, 0xaaaa000000aaaaaaULL, + 0xf1f1000000f1f1f1ULL, 0xdddd000000ddddddULL, 0x5959000000595959ULL, + 0x1414000000141414ULL, 0x6c6c0000006c6c6cULL, 0x9292000000929292ULL, + 0x5454000000545454ULL, 0xd0d0000000d0d0d0ULL, 0x7878000000787878ULL, + 0x7070000000707070ULL, 0xe3e3000000e3e3e3ULL, 0x4949000000494949ULL, + 0x8080000000808080ULL, 0x5050000000505050ULL, 0xa7a7000000a7a7a7ULL, + 0xf6f6000000f6f6f6ULL, 0x7777000000777777ULL, 0x9393000000939393ULL, + 0x8686000000868686ULL, 0x8383000000838383ULL, 0x2a2a0000002a2a2aULL, + 0xc7c7000000c7c7c7ULL, 0x5b5b0000005b5b5bULL, 0xe9e9000000e9e9e9ULL, + 0xeeee000000eeeeeeULL, 0x8f8f0000008f8f8fULL, 0x0101000000010101ULL, + 0x3d3d0000003d3d3dULL, }; const u64 camellia_sp03303033[256] = { - 0x0038380038003838, 0x0041410041004141, 0x0016160016001616, - 0x0076760076007676, 0x00d9d900d900d9d9, 0x0093930093009393, - 0x0060600060006060, 0x00f2f200f200f2f2, 0x0072720072007272, - 0x00c2c200c200c2c2, 0x00abab00ab00abab, 0x009a9a009a009a9a, - 0x0075750075007575, 0x0006060006000606, 0x0057570057005757, - 0x00a0a000a000a0a0, 0x0091910091009191, 0x00f7f700f700f7f7, - 0x00b5b500b500b5b5, 0x00c9c900c900c9c9, 0x00a2a200a200a2a2, - 0x008c8c008c008c8c, 0x00d2d200d200d2d2, 0x0090900090009090, - 0x00f6f600f600f6f6, 0x0007070007000707, 0x00a7a700a700a7a7, - 0x0027270027002727, 0x008e8e008e008e8e, 0x00b2b200b200b2b2, - 0x0049490049004949, 0x00dede00de00dede, 0x0043430043004343, - 0x005c5c005c005c5c, 0x00d7d700d700d7d7, 0x00c7c700c700c7c7, - 0x003e3e003e003e3e, 0x00f5f500f500f5f5, 0x008f8f008f008f8f, - 0x0067670067006767, 0x001f1f001f001f1f, 0x0018180018001818, - 0x006e6e006e006e6e, 0x00afaf00af00afaf, 0x002f2f002f002f2f, - 0x00e2e200e200e2e2, 0x0085850085008585, 0x000d0d000d000d0d, - 0x0053530053005353, 0x00f0f000f000f0f0, 0x009c9c009c009c9c, - 0x0065650065006565, 0x00eaea00ea00eaea, 0x00a3a300a300a3a3, - 0x00aeae00ae00aeae, 0x009e9e009e009e9e, 0x00ecec00ec00ecec, - 0x0080800080008080, 0x002d2d002d002d2d, 0x006b6b006b006b6b, - 0x00a8a800a800a8a8, 0x002b2b002b002b2b, 0x0036360036003636, - 0x00a6a600a600a6a6, 0x00c5c500c500c5c5, 0x0086860086008686, - 0x004d4d004d004d4d, 0x0033330033003333, 0x00fdfd00fd00fdfd, - 0x0066660066006666, 0x0058580058005858, 0x0096960096009696, - 
0x003a3a003a003a3a, 0x0009090009000909, 0x0095950095009595, - 0x0010100010001010, 0x0078780078007878, 0x00d8d800d800d8d8, - 0x0042420042004242, 0x00cccc00cc00cccc, 0x00efef00ef00efef, - 0x0026260026002626, 0x00e5e500e500e5e5, 0x0061610061006161, - 0x001a1a001a001a1a, 0x003f3f003f003f3f, 0x003b3b003b003b3b, - 0x0082820082008282, 0x00b6b600b600b6b6, 0x00dbdb00db00dbdb, - 0x00d4d400d400d4d4, 0x0098980098009898, 0x00e8e800e800e8e8, - 0x008b8b008b008b8b, 0x0002020002000202, 0x00ebeb00eb00ebeb, - 0x000a0a000a000a0a, 0x002c2c002c002c2c, 0x001d1d001d001d1d, - 0x00b0b000b000b0b0, 0x006f6f006f006f6f, 0x008d8d008d008d8d, - 0x0088880088008888, 0x000e0e000e000e0e, 0x0019190019001919, - 0x0087870087008787, 0x004e4e004e004e4e, 0x000b0b000b000b0b, - 0x00a9a900a900a9a9, 0x000c0c000c000c0c, 0x0079790079007979, - 0x0011110011001111, 0x007f7f007f007f7f, 0x0022220022002222, - 0x00e7e700e700e7e7, 0x0059590059005959, 0x00e1e100e100e1e1, - 0x00dada00da00dada, 0x003d3d003d003d3d, 0x00c8c800c800c8c8, - 0x0012120012001212, 0x0004040004000404, 0x0074740074007474, - 0x0054540054005454, 0x0030300030003030, 0x007e7e007e007e7e, - 0x00b4b400b400b4b4, 0x0028280028002828, 0x0055550055005555, - 0x0068680068006868, 0x0050500050005050, 0x00bebe00be00bebe, - 0x00d0d000d000d0d0, 0x00c4c400c400c4c4, 0x0031310031003131, - 0x00cbcb00cb00cbcb, 0x002a2a002a002a2a, 0x00adad00ad00adad, - 0x000f0f000f000f0f, 0x00caca00ca00caca, 0x0070700070007070, - 0x00ffff00ff00ffff, 0x0032320032003232, 0x0069690069006969, - 0x0008080008000808, 0x0062620062006262, 0x0000000000000000, - 0x0024240024002424, 0x00d1d100d100d1d1, 0x00fbfb00fb00fbfb, - 0x00baba00ba00baba, 0x00eded00ed00eded, 0x0045450045004545, - 0x0081810081008181, 0x0073730073007373, 0x006d6d006d006d6d, - 0x0084840084008484, 0x009f9f009f009f9f, 0x00eeee00ee00eeee, - 0x004a4a004a004a4a, 0x00c3c300c300c3c3, 0x002e2e002e002e2e, - 0x00c1c100c100c1c1, 0x0001010001000101, 0x00e6e600e600e6e6, - 0x0025250025002525, 0x0048480048004848, 0x0099990099009999, - 0x00b9b900b900b9b9, 0x00b3b300b300b3b3, 0x007b7b007b007b7b, - 0x00f9f900f900f9f9, 0x00cece00ce00cece, 0x00bfbf00bf00bfbf, - 0x00dfdf00df00dfdf, 0x0071710071007171, 0x0029290029002929, - 0x00cdcd00cd00cdcd, 0x006c6c006c006c6c, 0x0013130013001313, - 0x0064640064006464, 0x009b9b009b009b9b, 0x0063630063006363, - 0x009d9d009d009d9d, 0x00c0c000c000c0c0, 0x004b4b004b004b4b, - 0x00b7b700b700b7b7, 0x00a5a500a500a5a5, 0x0089890089008989, - 0x005f5f005f005f5f, 0x00b1b100b100b1b1, 0x0017170017001717, - 0x00f4f400f400f4f4, 0x00bcbc00bc00bcbc, 0x00d3d300d300d3d3, - 0x0046460046004646, 0x00cfcf00cf00cfcf, 0x0037370037003737, - 0x005e5e005e005e5e, 0x0047470047004747, 0x0094940094009494, - 0x00fafa00fa00fafa, 0x00fcfc00fc00fcfc, 0x005b5b005b005b5b, - 0x0097970097009797, 0x00fefe00fe00fefe, 0x005a5a005a005a5a, - 0x00acac00ac00acac, 0x003c3c003c003c3c, 0x004c4c004c004c4c, - 0x0003030003000303, 0x0035350035003535, 0x00f3f300f300f3f3, - 0x0023230023002323, 0x00b8b800b800b8b8, 0x005d5d005d005d5d, - 0x006a6a006a006a6a, 0x0092920092009292, 0x00d5d500d500d5d5, - 0x0021210021002121, 0x0044440044004444, 0x0051510051005151, - 0x00c6c600c600c6c6, 0x007d7d007d007d7d, 0x0039390039003939, - 0x0083830083008383, 0x00dcdc00dc00dcdc, 0x00aaaa00aa00aaaa, - 0x007c7c007c007c7c, 0x0077770077007777, 0x0056560056005656, - 0x0005050005000505, 0x001b1b001b001b1b, 0x00a4a400a400a4a4, - 0x0015150015001515, 0x0034340034003434, 0x001e1e001e001e1e, - 0x001c1c001c001c1c, 0x00f8f800f800f8f8, 0x0052520052005252, - 0x0020200020002020, 0x0014140014001414, 0x00e9e900e900e9e9, - 0x00bdbd00bd00bdbd, 
0x00dddd00dd00dddd, 0x00e4e400e400e4e4, - 0x00a1a100a100a1a1, 0x00e0e000e000e0e0, 0x008a8a008a008a8a, - 0x00f1f100f100f1f1, 0x00d6d600d600d6d6, 0x007a7a007a007a7a, - 0x00bbbb00bb00bbbb, 0x00e3e300e300e3e3, 0x0040400040004040, - 0x004f4f004f004f4f, + 0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL, + 0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL, + 0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL, + 0x00c2c200c200c2c2ULL, 0x00abab00ab00ababULL, 0x009a9a009a009a9aULL, + 0x0075750075007575ULL, 0x0006060006000606ULL, 0x0057570057005757ULL, + 0x00a0a000a000a0a0ULL, 0x0091910091009191ULL, 0x00f7f700f700f7f7ULL, + 0x00b5b500b500b5b5ULL, 0x00c9c900c900c9c9ULL, 0x00a2a200a200a2a2ULL, + 0x008c8c008c008c8cULL, 0x00d2d200d200d2d2ULL, 0x0090900090009090ULL, + 0x00f6f600f600f6f6ULL, 0x0007070007000707ULL, 0x00a7a700a700a7a7ULL, + 0x0027270027002727ULL, 0x008e8e008e008e8eULL, 0x00b2b200b200b2b2ULL, + 0x0049490049004949ULL, 0x00dede00de00dedeULL, 0x0043430043004343ULL, + 0x005c5c005c005c5cULL, 0x00d7d700d700d7d7ULL, 0x00c7c700c700c7c7ULL, + 0x003e3e003e003e3eULL, 0x00f5f500f500f5f5ULL, 0x008f8f008f008f8fULL, + 0x0067670067006767ULL, 0x001f1f001f001f1fULL, 0x0018180018001818ULL, + 0x006e6e006e006e6eULL, 0x00afaf00af00afafULL, 0x002f2f002f002f2fULL, + 0x00e2e200e200e2e2ULL, 0x0085850085008585ULL, 0x000d0d000d000d0dULL, + 0x0053530053005353ULL, 0x00f0f000f000f0f0ULL, 0x009c9c009c009c9cULL, + 0x0065650065006565ULL, 0x00eaea00ea00eaeaULL, 0x00a3a300a300a3a3ULL, + 0x00aeae00ae00aeaeULL, 0x009e9e009e009e9eULL, 0x00ecec00ec00ececULL, + 0x0080800080008080ULL, 0x002d2d002d002d2dULL, 0x006b6b006b006b6bULL, + 0x00a8a800a800a8a8ULL, 0x002b2b002b002b2bULL, 0x0036360036003636ULL, + 0x00a6a600a600a6a6ULL, 0x00c5c500c500c5c5ULL, 0x0086860086008686ULL, + 0x004d4d004d004d4dULL, 0x0033330033003333ULL, 0x00fdfd00fd00fdfdULL, + 0x0066660066006666ULL, 0x0058580058005858ULL, 0x0096960096009696ULL, + 0x003a3a003a003a3aULL, 0x0009090009000909ULL, 0x0095950095009595ULL, + 0x0010100010001010ULL, 0x0078780078007878ULL, 0x00d8d800d800d8d8ULL, + 0x0042420042004242ULL, 0x00cccc00cc00ccccULL, 0x00efef00ef00efefULL, + 0x0026260026002626ULL, 0x00e5e500e500e5e5ULL, 0x0061610061006161ULL, + 0x001a1a001a001a1aULL, 0x003f3f003f003f3fULL, 0x003b3b003b003b3bULL, + 0x0082820082008282ULL, 0x00b6b600b600b6b6ULL, 0x00dbdb00db00dbdbULL, + 0x00d4d400d400d4d4ULL, 0x0098980098009898ULL, 0x00e8e800e800e8e8ULL, + 0x008b8b008b008b8bULL, 0x0002020002000202ULL, 0x00ebeb00eb00ebebULL, + 0x000a0a000a000a0aULL, 0x002c2c002c002c2cULL, 0x001d1d001d001d1dULL, + 0x00b0b000b000b0b0ULL, 0x006f6f006f006f6fULL, 0x008d8d008d008d8dULL, + 0x0088880088008888ULL, 0x000e0e000e000e0eULL, 0x0019190019001919ULL, + 0x0087870087008787ULL, 0x004e4e004e004e4eULL, 0x000b0b000b000b0bULL, + 0x00a9a900a900a9a9ULL, 0x000c0c000c000c0cULL, 0x0079790079007979ULL, + 0x0011110011001111ULL, 0x007f7f007f007f7fULL, 0x0022220022002222ULL, + 0x00e7e700e700e7e7ULL, 0x0059590059005959ULL, 0x00e1e100e100e1e1ULL, + 0x00dada00da00dadaULL, 0x003d3d003d003d3dULL, 0x00c8c800c800c8c8ULL, + 0x0012120012001212ULL, 0x0004040004000404ULL, 0x0074740074007474ULL, + 0x0054540054005454ULL, 0x0030300030003030ULL, 0x007e7e007e007e7eULL, + 0x00b4b400b400b4b4ULL, 0x0028280028002828ULL, 0x0055550055005555ULL, + 0x0068680068006868ULL, 0x0050500050005050ULL, 0x00bebe00be00bebeULL, + 0x00d0d000d000d0d0ULL, 0x00c4c400c400c4c4ULL, 0x0031310031003131ULL, + 0x00cbcb00cb00cbcbULL, 0x002a2a002a002a2aULL, 0x00adad00ad00adadULL, + 0x000f0f000f000f0fULL, 
0x00caca00ca00cacaULL, 0x0070700070007070ULL, + 0x00ffff00ff00ffffULL, 0x0032320032003232ULL, 0x0069690069006969ULL, + 0x0008080008000808ULL, 0x0062620062006262ULL, 0x0000000000000000ULL, + 0x0024240024002424ULL, 0x00d1d100d100d1d1ULL, 0x00fbfb00fb00fbfbULL, + 0x00baba00ba00babaULL, 0x00eded00ed00ededULL, 0x0045450045004545ULL, + 0x0081810081008181ULL, 0x0073730073007373ULL, 0x006d6d006d006d6dULL, + 0x0084840084008484ULL, 0x009f9f009f009f9fULL, 0x00eeee00ee00eeeeULL, + 0x004a4a004a004a4aULL, 0x00c3c300c300c3c3ULL, 0x002e2e002e002e2eULL, + 0x00c1c100c100c1c1ULL, 0x0001010001000101ULL, 0x00e6e600e600e6e6ULL, + 0x0025250025002525ULL, 0x0048480048004848ULL, 0x0099990099009999ULL, + 0x00b9b900b900b9b9ULL, 0x00b3b300b300b3b3ULL, 0x007b7b007b007b7bULL, + 0x00f9f900f900f9f9ULL, 0x00cece00ce00ceceULL, 0x00bfbf00bf00bfbfULL, + 0x00dfdf00df00dfdfULL, 0x0071710071007171ULL, 0x0029290029002929ULL, + 0x00cdcd00cd00cdcdULL, 0x006c6c006c006c6cULL, 0x0013130013001313ULL, + 0x0064640064006464ULL, 0x009b9b009b009b9bULL, 0x0063630063006363ULL, + 0x009d9d009d009d9dULL, 0x00c0c000c000c0c0ULL, 0x004b4b004b004b4bULL, + 0x00b7b700b700b7b7ULL, 0x00a5a500a500a5a5ULL, 0x0089890089008989ULL, + 0x005f5f005f005f5fULL, 0x00b1b100b100b1b1ULL, 0x0017170017001717ULL, + 0x00f4f400f400f4f4ULL, 0x00bcbc00bc00bcbcULL, 0x00d3d300d300d3d3ULL, + 0x0046460046004646ULL, 0x00cfcf00cf00cfcfULL, 0x0037370037003737ULL, + 0x005e5e005e005e5eULL, 0x0047470047004747ULL, 0x0094940094009494ULL, + 0x00fafa00fa00fafaULL, 0x00fcfc00fc00fcfcULL, 0x005b5b005b005b5bULL, + 0x0097970097009797ULL, 0x00fefe00fe00fefeULL, 0x005a5a005a005a5aULL, + 0x00acac00ac00acacULL, 0x003c3c003c003c3cULL, 0x004c4c004c004c4cULL, + 0x0003030003000303ULL, 0x0035350035003535ULL, 0x00f3f300f300f3f3ULL, + 0x0023230023002323ULL, 0x00b8b800b800b8b8ULL, 0x005d5d005d005d5dULL, + 0x006a6a006a006a6aULL, 0x0092920092009292ULL, 0x00d5d500d500d5d5ULL, + 0x0021210021002121ULL, 0x0044440044004444ULL, 0x0051510051005151ULL, + 0x00c6c600c600c6c6ULL, 0x007d7d007d007d7dULL, 0x0039390039003939ULL, + 0x0083830083008383ULL, 0x00dcdc00dc00dcdcULL, 0x00aaaa00aa00aaaaULL, + 0x007c7c007c007c7cULL, 0x0077770077007777ULL, 0x0056560056005656ULL, + 0x0005050005000505ULL, 0x001b1b001b001b1bULL, 0x00a4a400a400a4a4ULL, + 0x0015150015001515ULL, 0x0034340034003434ULL, 0x001e1e001e001e1eULL, + 0x001c1c001c001c1cULL, 0x00f8f800f800f8f8ULL, 0x0052520052005252ULL, + 0x0020200020002020ULL, 0x0014140014001414ULL, 0x00e9e900e900e9e9ULL, + 0x00bdbd00bd00bdbdULL, 0x00dddd00dd00ddddULL, 0x00e4e400e400e4e4ULL, + 0x00a1a100a100a1a1ULL, 0x00e0e000e000e0e0ULL, 0x008a8a008a008a8aULL, + 0x00f1f100f100f1f1ULL, 0x00d6d600d600d6d6ULL, 0x007a7a007a007a7aULL, + 0x00bbbb00bb00bbbbULL, 0x00e3e300e300e3e3ULL, 0x0040400040004040ULL, + 0x004f4f004f004f4fULL, }; const u64 camellia_sp00444404[256] = { - 0x0000707070700070, 0x00002c2c2c2c002c, 0x0000b3b3b3b300b3, - 0x0000c0c0c0c000c0, 0x0000e4e4e4e400e4, 0x0000575757570057, - 0x0000eaeaeaea00ea, 0x0000aeaeaeae00ae, 0x0000232323230023, - 0x00006b6b6b6b006b, 0x0000454545450045, 0x0000a5a5a5a500a5, - 0x0000edededed00ed, 0x00004f4f4f4f004f, 0x00001d1d1d1d001d, - 0x0000929292920092, 0x0000868686860086, 0x0000afafafaf00af, - 0x00007c7c7c7c007c, 0x00001f1f1f1f001f, 0x00003e3e3e3e003e, - 0x0000dcdcdcdc00dc, 0x00005e5e5e5e005e, 0x00000b0b0b0b000b, - 0x0000a6a6a6a600a6, 0x0000393939390039, 0x0000d5d5d5d500d5, - 0x00005d5d5d5d005d, 0x0000d9d9d9d900d9, 0x00005a5a5a5a005a, - 0x0000515151510051, 0x00006c6c6c6c006c, 0x00008b8b8b8b008b, - 0x00009a9a9a9a009a, 0x0000fbfbfbfb00fb, 0x0000b0b0b0b000b0, 
- 0x0000747474740074, 0x00002b2b2b2b002b, 0x0000f0f0f0f000f0, - 0x0000848484840084, 0x0000dfdfdfdf00df, 0x0000cbcbcbcb00cb, - 0x0000343434340034, 0x0000767676760076, 0x00006d6d6d6d006d, - 0x0000a9a9a9a900a9, 0x0000d1d1d1d100d1, 0x0000040404040004, - 0x0000141414140014, 0x00003a3a3a3a003a, 0x0000dededede00de, - 0x0000111111110011, 0x0000323232320032, 0x00009c9c9c9c009c, - 0x0000535353530053, 0x0000f2f2f2f200f2, 0x0000fefefefe00fe, - 0x0000cfcfcfcf00cf, 0x0000c3c3c3c300c3, 0x00007a7a7a7a007a, - 0x0000242424240024, 0x0000e8e8e8e800e8, 0x0000606060600060, - 0x0000696969690069, 0x0000aaaaaaaa00aa, 0x0000a0a0a0a000a0, - 0x0000a1a1a1a100a1, 0x0000626262620062, 0x0000545454540054, - 0x00001e1e1e1e001e, 0x0000e0e0e0e000e0, 0x0000646464640064, - 0x0000101010100010, 0x0000000000000000, 0x0000a3a3a3a300a3, - 0x0000757575750075, 0x00008a8a8a8a008a, 0x0000e6e6e6e600e6, - 0x0000090909090009, 0x0000dddddddd00dd, 0x0000878787870087, - 0x0000838383830083, 0x0000cdcdcdcd00cd, 0x0000909090900090, - 0x0000737373730073, 0x0000f6f6f6f600f6, 0x00009d9d9d9d009d, - 0x0000bfbfbfbf00bf, 0x0000525252520052, 0x0000d8d8d8d800d8, - 0x0000c8c8c8c800c8, 0x0000c6c6c6c600c6, 0x0000818181810081, - 0x00006f6f6f6f006f, 0x0000131313130013, 0x0000636363630063, - 0x0000e9e9e9e900e9, 0x0000a7a7a7a700a7, 0x00009f9f9f9f009f, - 0x0000bcbcbcbc00bc, 0x0000292929290029, 0x0000f9f9f9f900f9, - 0x00002f2f2f2f002f, 0x0000b4b4b4b400b4, 0x0000787878780078, - 0x0000060606060006, 0x0000e7e7e7e700e7, 0x0000717171710071, - 0x0000d4d4d4d400d4, 0x0000abababab00ab, 0x0000888888880088, - 0x00008d8d8d8d008d, 0x0000727272720072, 0x0000b9b9b9b900b9, - 0x0000f8f8f8f800f8, 0x0000acacacac00ac, 0x0000363636360036, - 0x00002a2a2a2a002a, 0x00003c3c3c3c003c, 0x0000f1f1f1f100f1, - 0x0000404040400040, 0x0000d3d3d3d300d3, 0x0000bbbbbbbb00bb, - 0x0000434343430043, 0x0000151515150015, 0x0000adadadad00ad, - 0x0000777777770077, 0x0000808080800080, 0x0000828282820082, - 0x0000ecececec00ec, 0x0000272727270027, 0x0000e5e5e5e500e5, - 0x0000858585850085, 0x0000353535350035, 0x00000c0c0c0c000c, - 0x0000414141410041, 0x0000efefefef00ef, 0x0000939393930093, - 0x0000191919190019, 0x0000212121210021, 0x00000e0e0e0e000e, - 0x00004e4e4e4e004e, 0x0000656565650065, 0x0000bdbdbdbd00bd, - 0x0000b8b8b8b800b8, 0x00008f8f8f8f008f, 0x0000ebebebeb00eb, - 0x0000cececece00ce, 0x0000303030300030, 0x00005f5f5f5f005f, - 0x0000c5c5c5c500c5, 0x00001a1a1a1a001a, 0x0000e1e1e1e100e1, - 0x0000cacacaca00ca, 0x0000474747470047, 0x00003d3d3d3d003d, - 0x0000010101010001, 0x0000d6d6d6d600d6, 0x0000565656560056, - 0x00004d4d4d4d004d, 0x00000d0d0d0d000d, 0x0000666666660066, - 0x0000cccccccc00cc, 0x00002d2d2d2d002d, 0x0000121212120012, - 0x0000202020200020, 0x0000b1b1b1b100b1, 0x0000999999990099, - 0x00004c4c4c4c004c, 0x0000c2c2c2c200c2, 0x00007e7e7e7e007e, - 0x0000050505050005, 0x0000b7b7b7b700b7, 0x0000313131310031, - 0x0000171717170017, 0x0000d7d7d7d700d7, 0x0000585858580058, - 0x0000616161610061, 0x00001b1b1b1b001b, 0x00001c1c1c1c001c, - 0x00000f0f0f0f000f, 0x0000161616160016, 0x0000181818180018, - 0x0000222222220022, 0x0000444444440044, 0x0000b2b2b2b200b2, - 0x0000b5b5b5b500b5, 0x0000919191910091, 0x0000080808080008, - 0x0000a8a8a8a800a8, 0x0000fcfcfcfc00fc, 0x0000505050500050, - 0x0000d0d0d0d000d0, 0x00007d7d7d7d007d, 0x0000898989890089, - 0x0000979797970097, 0x00005b5b5b5b005b, 0x0000959595950095, - 0x0000ffffffff00ff, 0x0000d2d2d2d200d2, 0x0000c4c4c4c400c4, - 0x0000484848480048, 0x0000f7f7f7f700f7, 0x0000dbdbdbdb00db, - 0x0000030303030003, 0x0000dadadada00da, 0x00003f3f3f3f003f, - 
0x0000949494940094, 0x00005c5c5c5c005c, 0x0000020202020002, - 0x00004a4a4a4a004a, 0x0000333333330033, 0x0000676767670067, - 0x0000f3f3f3f300f3, 0x00007f7f7f7f007f, 0x0000e2e2e2e200e2, - 0x00009b9b9b9b009b, 0x0000262626260026, 0x0000373737370037, - 0x00003b3b3b3b003b, 0x0000969696960096, 0x00004b4b4b4b004b, - 0x0000bebebebe00be, 0x00002e2e2e2e002e, 0x0000797979790079, - 0x00008c8c8c8c008c, 0x00006e6e6e6e006e, 0x00008e8e8e8e008e, - 0x0000f5f5f5f500f5, 0x0000b6b6b6b600b6, 0x0000fdfdfdfd00fd, - 0x0000595959590059, 0x0000989898980098, 0x00006a6a6a6a006a, - 0x0000464646460046, 0x0000babababa00ba, 0x0000252525250025, - 0x0000424242420042, 0x0000a2a2a2a200a2, 0x0000fafafafa00fa, - 0x0000070707070007, 0x0000555555550055, 0x0000eeeeeeee00ee, - 0x00000a0a0a0a000a, 0x0000494949490049, 0x0000686868680068, - 0x0000383838380038, 0x0000a4a4a4a400a4, 0x0000282828280028, - 0x00007b7b7b7b007b, 0x0000c9c9c9c900c9, 0x0000c1c1c1c100c1, - 0x0000e3e3e3e300e3, 0x0000f4f4f4f400f4, 0x0000c7c7c7c700c7, - 0x00009e9e9e9e009e, + 0x0000707070700070ULL, 0x00002c2c2c2c002cULL, 0x0000b3b3b3b300b3ULL, + 0x0000c0c0c0c000c0ULL, 0x0000e4e4e4e400e4ULL, 0x0000575757570057ULL, + 0x0000eaeaeaea00eaULL, 0x0000aeaeaeae00aeULL, 0x0000232323230023ULL, + 0x00006b6b6b6b006bULL, 0x0000454545450045ULL, 0x0000a5a5a5a500a5ULL, + 0x0000edededed00edULL, 0x00004f4f4f4f004fULL, 0x00001d1d1d1d001dULL, + 0x0000929292920092ULL, 0x0000868686860086ULL, 0x0000afafafaf00afULL, + 0x00007c7c7c7c007cULL, 0x00001f1f1f1f001fULL, 0x00003e3e3e3e003eULL, + 0x0000dcdcdcdc00dcULL, 0x00005e5e5e5e005eULL, 0x00000b0b0b0b000bULL, + 0x0000a6a6a6a600a6ULL, 0x0000393939390039ULL, 0x0000d5d5d5d500d5ULL, + 0x00005d5d5d5d005dULL, 0x0000d9d9d9d900d9ULL, 0x00005a5a5a5a005aULL, + 0x0000515151510051ULL, 0x00006c6c6c6c006cULL, 0x00008b8b8b8b008bULL, + 0x00009a9a9a9a009aULL, 0x0000fbfbfbfb00fbULL, 0x0000b0b0b0b000b0ULL, + 0x0000747474740074ULL, 0x00002b2b2b2b002bULL, 0x0000f0f0f0f000f0ULL, + 0x0000848484840084ULL, 0x0000dfdfdfdf00dfULL, 0x0000cbcbcbcb00cbULL, + 0x0000343434340034ULL, 0x0000767676760076ULL, 0x00006d6d6d6d006dULL, + 0x0000a9a9a9a900a9ULL, 0x0000d1d1d1d100d1ULL, 0x0000040404040004ULL, + 0x0000141414140014ULL, 0x00003a3a3a3a003aULL, 0x0000dededede00deULL, + 0x0000111111110011ULL, 0x0000323232320032ULL, 0x00009c9c9c9c009cULL, + 0x0000535353530053ULL, 0x0000f2f2f2f200f2ULL, 0x0000fefefefe00feULL, + 0x0000cfcfcfcf00cfULL, 0x0000c3c3c3c300c3ULL, 0x00007a7a7a7a007aULL, + 0x0000242424240024ULL, 0x0000e8e8e8e800e8ULL, 0x0000606060600060ULL, + 0x0000696969690069ULL, 0x0000aaaaaaaa00aaULL, 0x0000a0a0a0a000a0ULL, + 0x0000a1a1a1a100a1ULL, 0x0000626262620062ULL, 0x0000545454540054ULL, + 0x00001e1e1e1e001eULL, 0x0000e0e0e0e000e0ULL, 0x0000646464640064ULL, + 0x0000101010100010ULL, 0x0000000000000000ULL, 0x0000a3a3a3a300a3ULL, + 0x0000757575750075ULL, 0x00008a8a8a8a008aULL, 0x0000e6e6e6e600e6ULL, + 0x0000090909090009ULL, 0x0000dddddddd00ddULL, 0x0000878787870087ULL, + 0x0000838383830083ULL, 0x0000cdcdcdcd00cdULL, 0x0000909090900090ULL, + 0x0000737373730073ULL, 0x0000f6f6f6f600f6ULL, 0x00009d9d9d9d009dULL, + 0x0000bfbfbfbf00bfULL, 0x0000525252520052ULL, 0x0000d8d8d8d800d8ULL, + 0x0000c8c8c8c800c8ULL, 0x0000c6c6c6c600c6ULL, 0x0000818181810081ULL, + 0x00006f6f6f6f006fULL, 0x0000131313130013ULL, 0x0000636363630063ULL, + 0x0000e9e9e9e900e9ULL, 0x0000a7a7a7a700a7ULL, 0x00009f9f9f9f009fULL, + 0x0000bcbcbcbc00bcULL, 0x0000292929290029ULL, 0x0000f9f9f9f900f9ULL, + 0x00002f2f2f2f002fULL, 0x0000b4b4b4b400b4ULL, 0x0000787878780078ULL, + 0x0000060606060006ULL, 0x0000e7e7e7e700e7ULL, 
0x0000717171710071ULL, + 0x0000d4d4d4d400d4ULL, 0x0000abababab00abULL, 0x0000888888880088ULL, + 0x00008d8d8d8d008dULL, 0x0000727272720072ULL, 0x0000b9b9b9b900b9ULL, + 0x0000f8f8f8f800f8ULL, 0x0000acacacac00acULL, 0x0000363636360036ULL, + 0x00002a2a2a2a002aULL, 0x00003c3c3c3c003cULL, 0x0000f1f1f1f100f1ULL, + 0x0000404040400040ULL, 0x0000d3d3d3d300d3ULL, 0x0000bbbbbbbb00bbULL, + 0x0000434343430043ULL, 0x0000151515150015ULL, 0x0000adadadad00adULL, + 0x0000777777770077ULL, 0x0000808080800080ULL, 0x0000828282820082ULL, + 0x0000ecececec00ecULL, 0x0000272727270027ULL, 0x0000e5e5e5e500e5ULL, + 0x0000858585850085ULL, 0x0000353535350035ULL, 0x00000c0c0c0c000cULL, + 0x0000414141410041ULL, 0x0000efefefef00efULL, 0x0000939393930093ULL, + 0x0000191919190019ULL, 0x0000212121210021ULL, 0x00000e0e0e0e000eULL, + 0x00004e4e4e4e004eULL, 0x0000656565650065ULL, 0x0000bdbdbdbd00bdULL, + 0x0000b8b8b8b800b8ULL, 0x00008f8f8f8f008fULL, 0x0000ebebebeb00ebULL, + 0x0000cececece00ceULL, 0x0000303030300030ULL, 0x00005f5f5f5f005fULL, + 0x0000c5c5c5c500c5ULL, 0x00001a1a1a1a001aULL, 0x0000e1e1e1e100e1ULL, + 0x0000cacacaca00caULL, 0x0000474747470047ULL, 0x00003d3d3d3d003dULL, + 0x0000010101010001ULL, 0x0000d6d6d6d600d6ULL, 0x0000565656560056ULL, + 0x00004d4d4d4d004dULL, 0x00000d0d0d0d000dULL, 0x0000666666660066ULL, + 0x0000cccccccc00ccULL, 0x00002d2d2d2d002dULL, 0x0000121212120012ULL, + 0x0000202020200020ULL, 0x0000b1b1b1b100b1ULL, 0x0000999999990099ULL, + 0x00004c4c4c4c004cULL, 0x0000c2c2c2c200c2ULL, 0x00007e7e7e7e007eULL, + 0x0000050505050005ULL, 0x0000b7b7b7b700b7ULL, 0x0000313131310031ULL, + 0x0000171717170017ULL, 0x0000d7d7d7d700d7ULL, 0x0000585858580058ULL, + 0x0000616161610061ULL, 0x00001b1b1b1b001bULL, 0x00001c1c1c1c001cULL, + 0x00000f0f0f0f000fULL, 0x0000161616160016ULL, 0x0000181818180018ULL, + 0x0000222222220022ULL, 0x0000444444440044ULL, 0x0000b2b2b2b200b2ULL, + 0x0000b5b5b5b500b5ULL, 0x0000919191910091ULL, 0x0000080808080008ULL, + 0x0000a8a8a8a800a8ULL, 0x0000fcfcfcfc00fcULL, 0x0000505050500050ULL, + 0x0000d0d0d0d000d0ULL, 0x00007d7d7d7d007dULL, 0x0000898989890089ULL, + 0x0000979797970097ULL, 0x00005b5b5b5b005bULL, 0x0000959595950095ULL, + 0x0000ffffffff00ffULL, 0x0000d2d2d2d200d2ULL, 0x0000c4c4c4c400c4ULL, + 0x0000484848480048ULL, 0x0000f7f7f7f700f7ULL, 0x0000dbdbdbdb00dbULL, + 0x0000030303030003ULL, 0x0000dadadada00daULL, 0x00003f3f3f3f003fULL, + 0x0000949494940094ULL, 0x00005c5c5c5c005cULL, 0x0000020202020002ULL, + 0x00004a4a4a4a004aULL, 0x0000333333330033ULL, 0x0000676767670067ULL, + 0x0000f3f3f3f300f3ULL, 0x00007f7f7f7f007fULL, 0x0000e2e2e2e200e2ULL, + 0x00009b9b9b9b009bULL, 0x0000262626260026ULL, 0x0000373737370037ULL, + 0x00003b3b3b3b003bULL, 0x0000969696960096ULL, 0x00004b4b4b4b004bULL, + 0x0000bebebebe00beULL, 0x00002e2e2e2e002eULL, 0x0000797979790079ULL, + 0x00008c8c8c8c008cULL, 0x00006e6e6e6e006eULL, 0x00008e8e8e8e008eULL, + 0x0000f5f5f5f500f5ULL, 0x0000b6b6b6b600b6ULL, 0x0000fdfdfdfd00fdULL, + 0x0000595959590059ULL, 0x0000989898980098ULL, 0x00006a6a6a6a006aULL, + 0x0000464646460046ULL, 0x0000babababa00baULL, 0x0000252525250025ULL, + 0x0000424242420042ULL, 0x0000a2a2a2a200a2ULL, 0x0000fafafafa00faULL, + 0x0000070707070007ULL, 0x0000555555550055ULL, 0x0000eeeeeeee00eeULL, + 0x00000a0a0a0a000aULL, 0x0000494949490049ULL, 0x0000686868680068ULL, + 0x0000383838380038ULL, 0x0000a4a4a4a400a4ULL, 0x0000282828280028ULL, + 0x00007b7b7b7b007bULL, 0x0000c9c9c9c900c9ULL, 0x0000c1c1c1c100c1ULL, + 0x0000e3e3e3e300e3ULL, 0x0000f4f4f4f400f4ULL, 0x0000c7c7c7c700c7ULL, + 0x00009e9e9e9e009eULL, }; const u64 
camellia_sp02220222[256] = { - 0x00e0e0e000e0e0e0, 0x0005050500050505, 0x0058585800585858, - 0x00d9d9d900d9d9d9, 0x0067676700676767, 0x004e4e4e004e4e4e, - 0x0081818100818181, 0x00cbcbcb00cbcbcb, 0x00c9c9c900c9c9c9, - 0x000b0b0b000b0b0b, 0x00aeaeae00aeaeae, 0x006a6a6a006a6a6a, - 0x00d5d5d500d5d5d5, 0x0018181800181818, 0x005d5d5d005d5d5d, - 0x0082828200828282, 0x0046464600464646, 0x00dfdfdf00dfdfdf, - 0x00d6d6d600d6d6d6, 0x0027272700272727, 0x008a8a8a008a8a8a, - 0x0032323200323232, 0x004b4b4b004b4b4b, 0x0042424200424242, - 0x00dbdbdb00dbdbdb, 0x001c1c1c001c1c1c, 0x009e9e9e009e9e9e, - 0x009c9c9c009c9c9c, 0x003a3a3a003a3a3a, 0x00cacaca00cacaca, - 0x0025252500252525, 0x007b7b7b007b7b7b, 0x000d0d0d000d0d0d, - 0x0071717100717171, 0x005f5f5f005f5f5f, 0x001f1f1f001f1f1f, - 0x00f8f8f800f8f8f8, 0x00d7d7d700d7d7d7, 0x003e3e3e003e3e3e, - 0x009d9d9d009d9d9d, 0x007c7c7c007c7c7c, 0x0060606000606060, - 0x00b9b9b900b9b9b9, 0x00bebebe00bebebe, 0x00bcbcbc00bcbcbc, - 0x008b8b8b008b8b8b, 0x0016161600161616, 0x0034343400343434, - 0x004d4d4d004d4d4d, 0x00c3c3c300c3c3c3, 0x0072727200727272, - 0x0095959500959595, 0x00ababab00ababab, 0x008e8e8e008e8e8e, - 0x00bababa00bababa, 0x007a7a7a007a7a7a, 0x00b3b3b300b3b3b3, - 0x0002020200020202, 0x00b4b4b400b4b4b4, 0x00adadad00adadad, - 0x00a2a2a200a2a2a2, 0x00acacac00acacac, 0x00d8d8d800d8d8d8, - 0x009a9a9a009a9a9a, 0x0017171700171717, 0x001a1a1a001a1a1a, - 0x0035353500353535, 0x00cccccc00cccccc, 0x00f7f7f700f7f7f7, - 0x0099999900999999, 0x0061616100616161, 0x005a5a5a005a5a5a, - 0x00e8e8e800e8e8e8, 0x0024242400242424, 0x0056565600565656, - 0x0040404000404040, 0x00e1e1e100e1e1e1, 0x0063636300636363, - 0x0009090900090909, 0x0033333300333333, 0x00bfbfbf00bfbfbf, - 0x0098989800989898, 0x0097979700979797, 0x0085858500858585, - 0x0068686800686868, 0x00fcfcfc00fcfcfc, 0x00ececec00ececec, - 0x000a0a0a000a0a0a, 0x00dadada00dadada, 0x006f6f6f006f6f6f, - 0x0053535300535353, 0x0062626200626262, 0x00a3a3a300a3a3a3, - 0x002e2e2e002e2e2e, 0x0008080800080808, 0x00afafaf00afafaf, - 0x0028282800282828, 0x00b0b0b000b0b0b0, 0x0074747400747474, - 0x00c2c2c200c2c2c2, 0x00bdbdbd00bdbdbd, 0x0036363600363636, - 0x0022222200222222, 0x0038383800383838, 0x0064646400646464, - 0x001e1e1e001e1e1e, 0x0039393900393939, 0x002c2c2c002c2c2c, - 0x00a6a6a600a6a6a6, 0x0030303000303030, 0x00e5e5e500e5e5e5, - 0x0044444400444444, 0x00fdfdfd00fdfdfd, 0x0088888800888888, - 0x009f9f9f009f9f9f, 0x0065656500656565, 0x0087878700878787, - 0x006b6b6b006b6b6b, 0x00f4f4f400f4f4f4, 0x0023232300232323, - 0x0048484800484848, 0x0010101000101010, 0x00d1d1d100d1d1d1, - 0x0051515100515151, 0x00c0c0c000c0c0c0, 0x00f9f9f900f9f9f9, - 0x00d2d2d200d2d2d2, 0x00a0a0a000a0a0a0, 0x0055555500555555, - 0x00a1a1a100a1a1a1, 0x0041414100414141, 0x00fafafa00fafafa, - 0x0043434300434343, 0x0013131300131313, 0x00c4c4c400c4c4c4, - 0x002f2f2f002f2f2f, 0x00a8a8a800a8a8a8, 0x00b6b6b600b6b6b6, - 0x003c3c3c003c3c3c, 0x002b2b2b002b2b2b, 0x00c1c1c100c1c1c1, - 0x00ffffff00ffffff, 0x00c8c8c800c8c8c8, 0x00a5a5a500a5a5a5, - 0x0020202000202020, 0x0089898900898989, 0x0000000000000000, - 0x0090909000909090, 0x0047474700474747, 0x00efefef00efefef, - 0x00eaeaea00eaeaea, 0x00b7b7b700b7b7b7, 0x0015151500151515, - 0x0006060600060606, 0x00cdcdcd00cdcdcd, 0x00b5b5b500b5b5b5, - 0x0012121200121212, 0x007e7e7e007e7e7e, 0x00bbbbbb00bbbbbb, - 0x0029292900292929, 0x000f0f0f000f0f0f, 0x00b8b8b800b8b8b8, - 0x0007070700070707, 0x0004040400040404, 0x009b9b9b009b9b9b, - 0x0094949400949494, 0x0021212100212121, 0x0066666600666666, - 0x00e6e6e600e6e6e6, 0x00cecece00cecece, 
0x00ededed00ededed, - 0x00e7e7e700e7e7e7, 0x003b3b3b003b3b3b, 0x00fefefe00fefefe, - 0x007f7f7f007f7f7f, 0x00c5c5c500c5c5c5, 0x00a4a4a400a4a4a4, - 0x0037373700373737, 0x00b1b1b100b1b1b1, 0x004c4c4c004c4c4c, - 0x0091919100919191, 0x006e6e6e006e6e6e, 0x008d8d8d008d8d8d, - 0x0076767600767676, 0x0003030300030303, 0x002d2d2d002d2d2d, - 0x00dedede00dedede, 0x0096969600969696, 0x0026262600262626, - 0x007d7d7d007d7d7d, 0x00c6c6c600c6c6c6, 0x005c5c5c005c5c5c, - 0x00d3d3d300d3d3d3, 0x00f2f2f200f2f2f2, 0x004f4f4f004f4f4f, - 0x0019191900191919, 0x003f3f3f003f3f3f, 0x00dcdcdc00dcdcdc, - 0x0079797900797979, 0x001d1d1d001d1d1d, 0x0052525200525252, - 0x00ebebeb00ebebeb, 0x00f3f3f300f3f3f3, 0x006d6d6d006d6d6d, - 0x005e5e5e005e5e5e, 0x00fbfbfb00fbfbfb, 0x0069696900696969, - 0x00b2b2b200b2b2b2, 0x00f0f0f000f0f0f0, 0x0031313100313131, - 0x000c0c0c000c0c0c, 0x00d4d4d400d4d4d4, 0x00cfcfcf00cfcfcf, - 0x008c8c8c008c8c8c, 0x00e2e2e200e2e2e2, 0x0075757500757575, - 0x00a9a9a900a9a9a9, 0x004a4a4a004a4a4a, 0x0057575700575757, - 0x0084848400848484, 0x0011111100111111, 0x0045454500454545, - 0x001b1b1b001b1b1b, 0x00f5f5f500f5f5f5, 0x00e4e4e400e4e4e4, - 0x000e0e0e000e0e0e, 0x0073737300737373, 0x00aaaaaa00aaaaaa, - 0x00f1f1f100f1f1f1, 0x00dddddd00dddddd, 0x0059595900595959, - 0x0014141400141414, 0x006c6c6c006c6c6c, 0x0092929200929292, - 0x0054545400545454, 0x00d0d0d000d0d0d0, 0x0078787800787878, - 0x0070707000707070, 0x00e3e3e300e3e3e3, 0x0049494900494949, - 0x0080808000808080, 0x0050505000505050, 0x00a7a7a700a7a7a7, - 0x00f6f6f600f6f6f6, 0x0077777700777777, 0x0093939300939393, - 0x0086868600868686, 0x0083838300838383, 0x002a2a2a002a2a2a, - 0x00c7c7c700c7c7c7, 0x005b5b5b005b5b5b, 0x00e9e9e900e9e9e9, - 0x00eeeeee00eeeeee, 0x008f8f8f008f8f8f, 0x0001010100010101, - 0x003d3d3d003d3d3d, + 0x00e0e0e000e0e0e0ULL, 0x0005050500050505ULL, 0x0058585800585858ULL, + 0x00d9d9d900d9d9d9ULL, 0x0067676700676767ULL, 0x004e4e4e004e4e4eULL, + 0x0081818100818181ULL, 0x00cbcbcb00cbcbcbULL, 0x00c9c9c900c9c9c9ULL, + 0x000b0b0b000b0b0bULL, 0x00aeaeae00aeaeaeULL, 0x006a6a6a006a6a6aULL, + 0x00d5d5d500d5d5d5ULL, 0x0018181800181818ULL, 0x005d5d5d005d5d5dULL, + 0x0082828200828282ULL, 0x0046464600464646ULL, 0x00dfdfdf00dfdfdfULL, + 0x00d6d6d600d6d6d6ULL, 0x0027272700272727ULL, 0x008a8a8a008a8a8aULL, + 0x0032323200323232ULL, 0x004b4b4b004b4b4bULL, 0x0042424200424242ULL, + 0x00dbdbdb00dbdbdbULL, 0x001c1c1c001c1c1cULL, 0x009e9e9e009e9e9eULL, + 0x009c9c9c009c9c9cULL, 0x003a3a3a003a3a3aULL, 0x00cacaca00cacacaULL, + 0x0025252500252525ULL, 0x007b7b7b007b7b7bULL, 0x000d0d0d000d0d0dULL, + 0x0071717100717171ULL, 0x005f5f5f005f5f5fULL, 0x001f1f1f001f1f1fULL, + 0x00f8f8f800f8f8f8ULL, 0x00d7d7d700d7d7d7ULL, 0x003e3e3e003e3e3eULL, + 0x009d9d9d009d9d9dULL, 0x007c7c7c007c7c7cULL, 0x0060606000606060ULL, + 0x00b9b9b900b9b9b9ULL, 0x00bebebe00bebebeULL, 0x00bcbcbc00bcbcbcULL, + 0x008b8b8b008b8b8bULL, 0x0016161600161616ULL, 0x0034343400343434ULL, + 0x004d4d4d004d4d4dULL, 0x00c3c3c300c3c3c3ULL, 0x0072727200727272ULL, + 0x0095959500959595ULL, 0x00ababab00abababULL, 0x008e8e8e008e8e8eULL, + 0x00bababa00bababaULL, 0x007a7a7a007a7a7aULL, 0x00b3b3b300b3b3b3ULL, + 0x0002020200020202ULL, 0x00b4b4b400b4b4b4ULL, 0x00adadad00adadadULL, + 0x00a2a2a200a2a2a2ULL, 0x00acacac00acacacULL, 0x00d8d8d800d8d8d8ULL, + 0x009a9a9a009a9a9aULL, 0x0017171700171717ULL, 0x001a1a1a001a1a1aULL, + 0x0035353500353535ULL, 0x00cccccc00ccccccULL, 0x00f7f7f700f7f7f7ULL, + 0x0099999900999999ULL, 0x0061616100616161ULL, 0x005a5a5a005a5a5aULL, + 0x00e8e8e800e8e8e8ULL, 0x0024242400242424ULL, 0x0056565600565656ULL, + 
0x0040404000404040ULL, 0x00e1e1e100e1e1e1ULL, 0x0063636300636363ULL, + 0x0009090900090909ULL, 0x0033333300333333ULL, 0x00bfbfbf00bfbfbfULL, + 0x0098989800989898ULL, 0x0097979700979797ULL, 0x0085858500858585ULL, + 0x0068686800686868ULL, 0x00fcfcfc00fcfcfcULL, 0x00ececec00ecececULL, + 0x000a0a0a000a0a0aULL, 0x00dadada00dadadaULL, 0x006f6f6f006f6f6fULL, + 0x0053535300535353ULL, 0x0062626200626262ULL, 0x00a3a3a300a3a3a3ULL, + 0x002e2e2e002e2e2eULL, 0x0008080800080808ULL, 0x00afafaf00afafafULL, + 0x0028282800282828ULL, 0x00b0b0b000b0b0b0ULL, 0x0074747400747474ULL, + 0x00c2c2c200c2c2c2ULL, 0x00bdbdbd00bdbdbdULL, 0x0036363600363636ULL, + 0x0022222200222222ULL, 0x0038383800383838ULL, 0x0064646400646464ULL, + 0x001e1e1e001e1e1eULL, 0x0039393900393939ULL, 0x002c2c2c002c2c2cULL, + 0x00a6a6a600a6a6a6ULL, 0x0030303000303030ULL, 0x00e5e5e500e5e5e5ULL, + 0x0044444400444444ULL, 0x00fdfdfd00fdfdfdULL, 0x0088888800888888ULL, + 0x009f9f9f009f9f9fULL, 0x0065656500656565ULL, 0x0087878700878787ULL, + 0x006b6b6b006b6b6bULL, 0x00f4f4f400f4f4f4ULL, 0x0023232300232323ULL, + 0x0048484800484848ULL, 0x0010101000101010ULL, 0x00d1d1d100d1d1d1ULL, + 0x0051515100515151ULL, 0x00c0c0c000c0c0c0ULL, 0x00f9f9f900f9f9f9ULL, + 0x00d2d2d200d2d2d2ULL, 0x00a0a0a000a0a0a0ULL, 0x0055555500555555ULL, + 0x00a1a1a100a1a1a1ULL, 0x0041414100414141ULL, 0x00fafafa00fafafaULL, + 0x0043434300434343ULL, 0x0013131300131313ULL, 0x00c4c4c400c4c4c4ULL, + 0x002f2f2f002f2f2fULL, 0x00a8a8a800a8a8a8ULL, 0x00b6b6b600b6b6b6ULL, + 0x003c3c3c003c3c3cULL, 0x002b2b2b002b2b2bULL, 0x00c1c1c100c1c1c1ULL, + 0x00ffffff00ffffffULL, 0x00c8c8c800c8c8c8ULL, 0x00a5a5a500a5a5a5ULL, + 0x0020202000202020ULL, 0x0089898900898989ULL, 0x0000000000000000ULL, + 0x0090909000909090ULL, 0x0047474700474747ULL, 0x00efefef00efefefULL, + 0x00eaeaea00eaeaeaULL, 0x00b7b7b700b7b7b7ULL, 0x0015151500151515ULL, + 0x0006060600060606ULL, 0x00cdcdcd00cdcdcdULL, 0x00b5b5b500b5b5b5ULL, + 0x0012121200121212ULL, 0x007e7e7e007e7e7eULL, 0x00bbbbbb00bbbbbbULL, + 0x0029292900292929ULL, 0x000f0f0f000f0f0fULL, 0x00b8b8b800b8b8b8ULL, + 0x0007070700070707ULL, 0x0004040400040404ULL, 0x009b9b9b009b9b9bULL, + 0x0094949400949494ULL, 0x0021212100212121ULL, 0x0066666600666666ULL, + 0x00e6e6e600e6e6e6ULL, 0x00cecece00cececeULL, 0x00ededed00edededULL, + 0x00e7e7e700e7e7e7ULL, 0x003b3b3b003b3b3bULL, 0x00fefefe00fefefeULL, + 0x007f7f7f007f7f7fULL, 0x00c5c5c500c5c5c5ULL, 0x00a4a4a400a4a4a4ULL, + 0x0037373700373737ULL, 0x00b1b1b100b1b1b1ULL, 0x004c4c4c004c4c4cULL, + 0x0091919100919191ULL, 0x006e6e6e006e6e6eULL, 0x008d8d8d008d8d8dULL, + 0x0076767600767676ULL, 0x0003030300030303ULL, 0x002d2d2d002d2d2dULL, + 0x00dedede00dededeULL, 0x0096969600969696ULL, 0x0026262600262626ULL, + 0x007d7d7d007d7d7dULL, 0x00c6c6c600c6c6c6ULL, 0x005c5c5c005c5c5cULL, + 0x00d3d3d300d3d3d3ULL, 0x00f2f2f200f2f2f2ULL, 0x004f4f4f004f4f4fULL, + 0x0019191900191919ULL, 0x003f3f3f003f3f3fULL, 0x00dcdcdc00dcdcdcULL, + 0x0079797900797979ULL, 0x001d1d1d001d1d1dULL, 0x0052525200525252ULL, + 0x00ebebeb00ebebebULL, 0x00f3f3f300f3f3f3ULL, 0x006d6d6d006d6d6dULL, + 0x005e5e5e005e5e5eULL, 0x00fbfbfb00fbfbfbULL, 0x0069696900696969ULL, + 0x00b2b2b200b2b2b2ULL, 0x00f0f0f000f0f0f0ULL, 0x0031313100313131ULL, + 0x000c0c0c000c0c0cULL, 0x00d4d4d400d4d4d4ULL, 0x00cfcfcf00cfcfcfULL, + 0x008c8c8c008c8c8cULL, 0x00e2e2e200e2e2e2ULL, 0x0075757500757575ULL, + 0x00a9a9a900a9a9a9ULL, 0x004a4a4a004a4a4aULL, 0x0057575700575757ULL, + 0x0084848400848484ULL, 0x0011111100111111ULL, 0x0045454500454545ULL, + 0x001b1b1b001b1b1bULL, 0x00f5f5f500f5f5f5ULL, 0x00e4e4e400e4e4e4ULL, + 
0x000e0e0e000e0e0eULL, 0x0073737300737373ULL, 0x00aaaaaa00aaaaaaULL, + 0x00f1f1f100f1f1f1ULL, 0x00dddddd00ddddddULL, 0x0059595900595959ULL, + 0x0014141400141414ULL, 0x006c6c6c006c6c6cULL, 0x0092929200929292ULL, + 0x0054545400545454ULL, 0x00d0d0d000d0d0d0ULL, 0x0078787800787878ULL, + 0x0070707000707070ULL, 0x00e3e3e300e3e3e3ULL, 0x0049494900494949ULL, + 0x0080808000808080ULL, 0x0050505000505050ULL, 0x00a7a7a700a7a7a7ULL, + 0x00f6f6f600f6f6f6ULL, 0x0077777700777777ULL, 0x0093939300939393ULL, + 0x0086868600868686ULL, 0x0083838300838383ULL, 0x002a2a2a002a2a2aULL, + 0x00c7c7c700c7c7c7ULL, 0x005b5b5b005b5b5bULL, 0x00e9e9e900e9e9e9ULL, + 0x00eeeeee00eeeeeeULL, 0x008f8f8f008f8f8fULL, 0x0001010100010101ULL, + 0x003d3d3d003d3d3dULL, }; const u64 camellia_sp30333033[256] = { - 0x3800383838003838, 0x4100414141004141, 0x1600161616001616, - 0x7600767676007676, 0xd900d9d9d900d9d9, 0x9300939393009393, - 0x6000606060006060, 0xf200f2f2f200f2f2, 0x7200727272007272, - 0xc200c2c2c200c2c2, 0xab00ababab00abab, 0x9a009a9a9a009a9a, - 0x7500757575007575, 0x0600060606000606, 0x5700575757005757, - 0xa000a0a0a000a0a0, 0x9100919191009191, 0xf700f7f7f700f7f7, - 0xb500b5b5b500b5b5, 0xc900c9c9c900c9c9, 0xa200a2a2a200a2a2, - 0x8c008c8c8c008c8c, 0xd200d2d2d200d2d2, 0x9000909090009090, - 0xf600f6f6f600f6f6, 0x0700070707000707, 0xa700a7a7a700a7a7, - 0x2700272727002727, 0x8e008e8e8e008e8e, 0xb200b2b2b200b2b2, - 0x4900494949004949, 0xde00dedede00dede, 0x4300434343004343, - 0x5c005c5c5c005c5c, 0xd700d7d7d700d7d7, 0xc700c7c7c700c7c7, - 0x3e003e3e3e003e3e, 0xf500f5f5f500f5f5, 0x8f008f8f8f008f8f, - 0x6700676767006767, 0x1f001f1f1f001f1f, 0x1800181818001818, - 0x6e006e6e6e006e6e, 0xaf00afafaf00afaf, 0x2f002f2f2f002f2f, - 0xe200e2e2e200e2e2, 0x8500858585008585, 0x0d000d0d0d000d0d, - 0x5300535353005353, 0xf000f0f0f000f0f0, 0x9c009c9c9c009c9c, - 0x6500656565006565, 0xea00eaeaea00eaea, 0xa300a3a3a300a3a3, - 0xae00aeaeae00aeae, 0x9e009e9e9e009e9e, 0xec00ececec00ecec, - 0x8000808080008080, 0x2d002d2d2d002d2d, 0x6b006b6b6b006b6b, - 0xa800a8a8a800a8a8, 0x2b002b2b2b002b2b, 0x3600363636003636, - 0xa600a6a6a600a6a6, 0xc500c5c5c500c5c5, 0x8600868686008686, - 0x4d004d4d4d004d4d, 0x3300333333003333, 0xfd00fdfdfd00fdfd, - 0x6600666666006666, 0x5800585858005858, 0x9600969696009696, - 0x3a003a3a3a003a3a, 0x0900090909000909, 0x9500959595009595, - 0x1000101010001010, 0x7800787878007878, 0xd800d8d8d800d8d8, - 0x4200424242004242, 0xcc00cccccc00cccc, 0xef00efefef00efef, - 0x2600262626002626, 0xe500e5e5e500e5e5, 0x6100616161006161, - 0x1a001a1a1a001a1a, 0x3f003f3f3f003f3f, 0x3b003b3b3b003b3b, - 0x8200828282008282, 0xb600b6b6b600b6b6, 0xdb00dbdbdb00dbdb, - 0xd400d4d4d400d4d4, 0x9800989898009898, 0xe800e8e8e800e8e8, - 0x8b008b8b8b008b8b, 0x0200020202000202, 0xeb00ebebeb00ebeb, - 0x0a000a0a0a000a0a, 0x2c002c2c2c002c2c, 0x1d001d1d1d001d1d, - 0xb000b0b0b000b0b0, 0x6f006f6f6f006f6f, 0x8d008d8d8d008d8d, - 0x8800888888008888, 0x0e000e0e0e000e0e, 0x1900191919001919, - 0x8700878787008787, 0x4e004e4e4e004e4e, 0x0b000b0b0b000b0b, - 0xa900a9a9a900a9a9, 0x0c000c0c0c000c0c, 0x7900797979007979, - 0x1100111111001111, 0x7f007f7f7f007f7f, 0x2200222222002222, - 0xe700e7e7e700e7e7, 0x5900595959005959, 0xe100e1e1e100e1e1, - 0xda00dadada00dada, 0x3d003d3d3d003d3d, 0xc800c8c8c800c8c8, - 0x1200121212001212, 0x0400040404000404, 0x7400747474007474, - 0x5400545454005454, 0x3000303030003030, 0x7e007e7e7e007e7e, - 0xb400b4b4b400b4b4, 0x2800282828002828, 0x5500555555005555, - 0x6800686868006868, 0x5000505050005050, 0xbe00bebebe00bebe, - 0xd000d0d0d000d0d0, 0xc400c4c4c400c4c4, 
0x3100313131003131, - 0xcb00cbcbcb00cbcb, 0x2a002a2a2a002a2a, 0xad00adadad00adad, - 0x0f000f0f0f000f0f, 0xca00cacaca00caca, 0x7000707070007070, - 0xff00ffffff00ffff, 0x3200323232003232, 0x6900696969006969, - 0x0800080808000808, 0x6200626262006262, 0x0000000000000000, - 0x2400242424002424, 0xd100d1d1d100d1d1, 0xfb00fbfbfb00fbfb, - 0xba00bababa00baba, 0xed00ededed00eded, 0x4500454545004545, - 0x8100818181008181, 0x7300737373007373, 0x6d006d6d6d006d6d, - 0x8400848484008484, 0x9f009f9f9f009f9f, 0xee00eeeeee00eeee, - 0x4a004a4a4a004a4a, 0xc300c3c3c300c3c3, 0x2e002e2e2e002e2e, - 0xc100c1c1c100c1c1, 0x0100010101000101, 0xe600e6e6e600e6e6, - 0x2500252525002525, 0x4800484848004848, 0x9900999999009999, - 0xb900b9b9b900b9b9, 0xb300b3b3b300b3b3, 0x7b007b7b7b007b7b, - 0xf900f9f9f900f9f9, 0xce00cecece00cece, 0xbf00bfbfbf00bfbf, - 0xdf00dfdfdf00dfdf, 0x7100717171007171, 0x2900292929002929, - 0xcd00cdcdcd00cdcd, 0x6c006c6c6c006c6c, 0x1300131313001313, - 0x6400646464006464, 0x9b009b9b9b009b9b, 0x6300636363006363, - 0x9d009d9d9d009d9d, 0xc000c0c0c000c0c0, 0x4b004b4b4b004b4b, - 0xb700b7b7b700b7b7, 0xa500a5a5a500a5a5, 0x8900898989008989, - 0x5f005f5f5f005f5f, 0xb100b1b1b100b1b1, 0x1700171717001717, - 0xf400f4f4f400f4f4, 0xbc00bcbcbc00bcbc, 0xd300d3d3d300d3d3, - 0x4600464646004646, 0xcf00cfcfcf00cfcf, 0x3700373737003737, - 0x5e005e5e5e005e5e, 0x4700474747004747, 0x9400949494009494, - 0xfa00fafafa00fafa, 0xfc00fcfcfc00fcfc, 0x5b005b5b5b005b5b, - 0x9700979797009797, 0xfe00fefefe00fefe, 0x5a005a5a5a005a5a, - 0xac00acacac00acac, 0x3c003c3c3c003c3c, 0x4c004c4c4c004c4c, - 0x0300030303000303, 0x3500353535003535, 0xf300f3f3f300f3f3, - 0x2300232323002323, 0xb800b8b8b800b8b8, 0x5d005d5d5d005d5d, - 0x6a006a6a6a006a6a, 0x9200929292009292, 0xd500d5d5d500d5d5, - 0x2100212121002121, 0x4400444444004444, 0x5100515151005151, - 0xc600c6c6c600c6c6, 0x7d007d7d7d007d7d, 0x3900393939003939, - 0x8300838383008383, 0xdc00dcdcdc00dcdc, 0xaa00aaaaaa00aaaa, - 0x7c007c7c7c007c7c, 0x7700777777007777, 0x5600565656005656, - 0x0500050505000505, 0x1b001b1b1b001b1b, 0xa400a4a4a400a4a4, - 0x1500151515001515, 0x3400343434003434, 0x1e001e1e1e001e1e, - 0x1c001c1c1c001c1c, 0xf800f8f8f800f8f8, 0x5200525252005252, - 0x2000202020002020, 0x1400141414001414, 0xe900e9e9e900e9e9, - 0xbd00bdbdbd00bdbd, 0xdd00dddddd00dddd, 0xe400e4e4e400e4e4, - 0xa100a1a1a100a1a1, 0xe000e0e0e000e0e0, 0x8a008a8a8a008a8a, - 0xf100f1f1f100f1f1, 0xd600d6d6d600d6d6, 0x7a007a7a7a007a7a, - 0xbb00bbbbbb00bbbb, 0xe300e3e3e300e3e3, 0x4000404040004040, - 0x4f004f4f4f004f4f, + 0x3800383838003838ULL, 0x4100414141004141ULL, 0x1600161616001616ULL, + 0x7600767676007676ULL, 0xd900d9d9d900d9d9ULL, 0x9300939393009393ULL, + 0x6000606060006060ULL, 0xf200f2f2f200f2f2ULL, 0x7200727272007272ULL, + 0xc200c2c2c200c2c2ULL, 0xab00ababab00ababULL, 0x9a009a9a9a009a9aULL, + 0x7500757575007575ULL, 0x0600060606000606ULL, 0x5700575757005757ULL, + 0xa000a0a0a000a0a0ULL, 0x9100919191009191ULL, 0xf700f7f7f700f7f7ULL, + 0xb500b5b5b500b5b5ULL, 0xc900c9c9c900c9c9ULL, 0xa200a2a2a200a2a2ULL, + 0x8c008c8c8c008c8cULL, 0xd200d2d2d200d2d2ULL, 0x9000909090009090ULL, + 0xf600f6f6f600f6f6ULL, 0x0700070707000707ULL, 0xa700a7a7a700a7a7ULL, + 0x2700272727002727ULL, 0x8e008e8e8e008e8eULL, 0xb200b2b2b200b2b2ULL, + 0x4900494949004949ULL, 0xde00dedede00dedeULL, 0x4300434343004343ULL, + 0x5c005c5c5c005c5cULL, 0xd700d7d7d700d7d7ULL, 0xc700c7c7c700c7c7ULL, + 0x3e003e3e3e003e3eULL, 0xf500f5f5f500f5f5ULL, 0x8f008f8f8f008f8fULL, + 0x6700676767006767ULL, 0x1f001f1f1f001f1fULL, 0x1800181818001818ULL, + 0x6e006e6e6e006e6eULL, 
0xaf00afafaf00afafULL, 0x2f002f2f2f002f2fULL, + 0xe200e2e2e200e2e2ULL, 0x8500858585008585ULL, 0x0d000d0d0d000d0dULL, + 0x5300535353005353ULL, 0xf000f0f0f000f0f0ULL, 0x9c009c9c9c009c9cULL, + 0x6500656565006565ULL, 0xea00eaeaea00eaeaULL, 0xa300a3a3a300a3a3ULL, + 0xae00aeaeae00aeaeULL, 0x9e009e9e9e009e9eULL, 0xec00ececec00ececULL, + 0x8000808080008080ULL, 0x2d002d2d2d002d2dULL, 0x6b006b6b6b006b6bULL, + 0xa800a8a8a800a8a8ULL, 0x2b002b2b2b002b2bULL, 0x3600363636003636ULL, + 0xa600a6a6a600a6a6ULL, 0xc500c5c5c500c5c5ULL, 0x8600868686008686ULL, + 0x4d004d4d4d004d4dULL, 0x3300333333003333ULL, 0xfd00fdfdfd00fdfdULL, + 0x6600666666006666ULL, 0x5800585858005858ULL, 0x9600969696009696ULL, + 0x3a003a3a3a003a3aULL, 0x0900090909000909ULL, 0x9500959595009595ULL, + 0x1000101010001010ULL, 0x7800787878007878ULL, 0xd800d8d8d800d8d8ULL, + 0x4200424242004242ULL, 0xcc00cccccc00ccccULL, 0xef00efefef00efefULL, + 0x2600262626002626ULL, 0xe500e5e5e500e5e5ULL, 0x6100616161006161ULL, + 0x1a001a1a1a001a1aULL, 0x3f003f3f3f003f3fULL, 0x3b003b3b3b003b3bULL, + 0x8200828282008282ULL, 0xb600b6b6b600b6b6ULL, 0xdb00dbdbdb00dbdbULL, + 0xd400d4d4d400d4d4ULL, 0x9800989898009898ULL, 0xe800e8e8e800e8e8ULL, + 0x8b008b8b8b008b8bULL, 0x0200020202000202ULL, 0xeb00ebebeb00ebebULL, + 0x0a000a0a0a000a0aULL, 0x2c002c2c2c002c2cULL, 0x1d001d1d1d001d1dULL, + 0xb000b0b0b000b0b0ULL, 0x6f006f6f6f006f6fULL, 0x8d008d8d8d008d8dULL, + 0x8800888888008888ULL, 0x0e000e0e0e000e0eULL, 0x1900191919001919ULL, + 0x8700878787008787ULL, 0x4e004e4e4e004e4eULL, 0x0b000b0b0b000b0bULL, + 0xa900a9a9a900a9a9ULL, 0x0c000c0c0c000c0cULL, 0x7900797979007979ULL, + 0x1100111111001111ULL, 0x7f007f7f7f007f7fULL, 0x2200222222002222ULL, + 0xe700e7e7e700e7e7ULL, 0x5900595959005959ULL, 0xe100e1e1e100e1e1ULL, + 0xda00dadada00dadaULL, 0x3d003d3d3d003d3dULL, 0xc800c8c8c800c8c8ULL, + 0x1200121212001212ULL, 0x0400040404000404ULL, 0x7400747474007474ULL, + 0x5400545454005454ULL, 0x3000303030003030ULL, 0x7e007e7e7e007e7eULL, + 0xb400b4b4b400b4b4ULL, 0x2800282828002828ULL, 0x5500555555005555ULL, + 0x6800686868006868ULL, 0x5000505050005050ULL, 0xbe00bebebe00bebeULL, + 0xd000d0d0d000d0d0ULL, 0xc400c4c4c400c4c4ULL, 0x3100313131003131ULL, + 0xcb00cbcbcb00cbcbULL, 0x2a002a2a2a002a2aULL, 0xad00adadad00adadULL, + 0x0f000f0f0f000f0fULL, 0xca00cacaca00cacaULL, 0x7000707070007070ULL, + 0xff00ffffff00ffffULL, 0x3200323232003232ULL, 0x6900696969006969ULL, + 0x0800080808000808ULL, 0x6200626262006262ULL, 0x0000000000000000ULL, + 0x2400242424002424ULL, 0xd100d1d1d100d1d1ULL, 0xfb00fbfbfb00fbfbULL, + 0xba00bababa00babaULL, 0xed00ededed00ededULL, 0x4500454545004545ULL, + 0x8100818181008181ULL, 0x7300737373007373ULL, 0x6d006d6d6d006d6dULL, + 0x8400848484008484ULL, 0x9f009f9f9f009f9fULL, 0xee00eeeeee00eeeeULL, + 0x4a004a4a4a004a4aULL, 0xc300c3c3c300c3c3ULL, 0x2e002e2e2e002e2eULL, + 0xc100c1c1c100c1c1ULL, 0x0100010101000101ULL, 0xe600e6e6e600e6e6ULL, + 0x2500252525002525ULL, 0x4800484848004848ULL, 0x9900999999009999ULL, + 0xb900b9b9b900b9b9ULL, 0xb300b3b3b300b3b3ULL, 0x7b007b7b7b007b7bULL, + 0xf900f9f9f900f9f9ULL, 0xce00cecece00ceceULL, 0xbf00bfbfbf00bfbfULL, + 0xdf00dfdfdf00dfdfULL, 0x7100717171007171ULL, 0x2900292929002929ULL, + 0xcd00cdcdcd00cdcdULL, 0x6c006c6c6c006c6cULL, 0x1300131313001313ULL, + 0x6400646464006464ULL, 0x9b009b9b9b009b9bULL, 0x6300636363006363ULL, + 0x9d009d9d9d009d9dULL, 0xc000c0c0c000c0c0ULL, 0x4b004b4b4b004b4bULL, + 0xb700b7b7b700b7b7ULL, 0xa500a5a5a500a5a5ULL, 0x8900898989008989ULL, + 0x5f005f5f5f005f5fULL, 0xb100b1b1b100b1b1ULL, 0x1700171717001717ULL, + 0xf400f4f4f400f4f4ULL, 
0xbc00bcbcbc00bcbcULL, 0xd300d3d3d300d3d3ULL, + 0x4600464646004646ULL, 0xcf00cfcfcf00cfcfULL, 0x3700373737003737ULL, + 0x5e005e5e5e005e5eULL, 0x4700474747004747ULL, 0x9400949494009494ULL, + 0xfa00fafafa00fafaULL, 0xfc00fcfcfc00fcfcULL, 0x5b005b5b5b005b5bULL, + 0x9700979797009797ULL, 0xfe00fefefe00fefeULL, 0x5a005a5a5a005a5aULL, + 0xac00acacac00acacULL, 0x3c003c3c3c003c3cULL, 0x4c004c4c4c004c4cULL, + 0x0300030303000303ULL, 0x3500353535003535ULL, 0xf300f3f3f300f3f3ULL, + 0x2300232323002323ULL, 0xb800b8b8b800b8b8ULL, 0x5d005d5d5d005d5dULL, + 0x6a006a6a6a006a6aULL, 0x9200929292009292ULL, 0xd500d5d5d500d5d5ULL, + 0x2100212121002121ULL, 0x4400444444004444ULL, 0x5100515151005151ULL, + 0xc600c6c6c600c6c6ULL, 0x7d007d7d7d007d7dULL, 0x3900393939003939ULL, + 0x8300838383008383ULL, 0xdc00dcdcdc00dcdcULL, 0xaa00aaaaaa00aaaaULL, + 0x7c007c7c7c007c7cULL, 0x7700777777007777ULL, 0x5600565656005656ULL, + 0x0500050505000505ULL, 0x1b001b1b1b001b1bULL, 0xa400a4a4a400a4a4ULL, + 0x1500151515001515ULL, 0x3400343434003434ULL, 0x1e001e1e1e001e1eULL, + 0x1c001c1c1c001c1cULL, 0xf800f8f8f800f8f8ULL, 0x5200525252005252ULL, + 0x2000202020002020ULL, 0x1400141414001414ULL, 0xe900e9e9e900e9e9ULL, + 0xbd00bdbdbd00bdbdULL, 0xdd00dddddd00ddddULL, 0xe400e4e4e400e4e4ULL, + 0xa100a1a1a100a1a1ULL, 0xe000e0e0e000e0e0ULL, 0x8a008a8a8a008a8aULL, + 0xf100f1f1f100f1f1ULL, 0xd600d6d6d600d6d6ULL, 0x7a007a7a7a007a7aULL, + 0xbb00bbbbbb00bbbbULL, 0xe300e3e3e300e3e3ULL, 0x4000404040004040ULL, + 0x4f004f4f4f004f4fULL, }; const u64 camellia_sp44044404[256] = { - 0x7070007070700070, 0x2c2c002c2c2c002c, 0xb3b300b3b3b300b3, - 0xc0c000c0c0c000c0, 0xe4e400e4e4e400e4, 0x5757005757570057, - 0xeaea00eaeaea00ea, 0xaeae00aeaeae00ae, 0x2323002323230023, - 0x6b6b006b6b6b006b, 0x4545004545450045, 0xa5a500a5a5a500a5, - 0xeded00ededed00ed, 0x4f4f004f4f4f004f, 0x1d1d001d1d1d001d, - 0x9292009292920092, 0x8686008686860086, 0xafaf00afafaf00af, - 0x7c7c007c7c7c007c, 0x1f1f001f1f1f001f, 0x3e3e003e3e3e003e, - 0xdcdc00dcdcdc00dc, 0x5e5e005e5e5e005e, 0x0b0b000b0b0b000b, - 0xa6a600a6a6a600a6, 0x3939003939390039, 0xd5d500d5d5d500d5, - 0x5d5d005d5d5d005d, 0xd9d900d9d9d900d9, 0x5a5a005a5a5a005a, - 0x5151005151510051, 0x6c6c006c6c6c006c, 0x8b8b008b8b8b008b, - 0x9a9a009a9a9a009a, 0xfbfb00fbfbfb00fb, 0xb0b000b0b0b000b0, - 0x7474007474740074, 0x2b2b002b2b2b002b, 0xf0f000f0f0f000f0, - 0x8484008484840084, 0xdfdf00dfdfdf00df, 0xcbcb00cbcbcb00cb, - 0x3434003434340034, 0x7676007676760076, 0x6d6d006d6d6d006d, - 0xa9a900a9a9a900a9, 0xd1d100d1d1d100d1, 0x0404000404040004, - 0x1414001414140014, 0x3a3a003a3a3a003a, 0xdede00dedede00de, - 0x1111001111110011, 0x3232003232320032, 0x9c9c009c9c9c009c, - 0x5353005353530053, 0xf2f200f2f2f200f2, 0xfefe00fefefe00fe, - 0xcfcf00cfcfcf00cf, 0xc3c300c3c3c300c3, 0x7a7a007a7a7a007a, - 0x2424002424240024, 0xe8e800e8e8e800e8, 0x6060006060600060, - 0x6969006969690069, 0xaaaa00aaaaaa00aa, 0xa0a000a0a0a000a0, - 0xa1a100a1a1a100a1, 0x6262006262620062, 0x5454005454540054, - 0x1e1e001e1e1e001e, 0xe0e000e0e0e000e0, 0x6464006464640064, - 0x1010001010100010, 0x0000000000000000, 0xa3a300a3a3a300a3, - 0x7575007575750075, 0x8a8a008a8a8a008a, 0xe6e600e6e6e600e6, - 0x0909000909090009, 0xdddd00dddddd00dd, 0x8787008787870087, - 0x8383008383830083, 0xcdcd00cdcdcd00cd, 0x9090009090900090, - 0x7373007373730073, 0xf6f600f6f6f600f6, 0x9d9d009d9d9d009d, - 0xbfbf00bfbfbf00bf, 0x5252005252520052, 0xd8d800d8d8d800d8, - 0xc8c800c8c8c800c8, 0xc6c600c6c6c600c6, 0x8181008181810081, - 0x6f6f006f6f6f006f, 0x1313001313130013, 0x6363006363630063, - 0xe9e900e9e9e900e9, 
0xa7a700a7a7a700a7, 0x9f9f009f9f9f009f, - 0xbcbc00bcbcbc00bc, 0x2929002929290029, 0xf9f900f9f9f900f9, - 0x2f2f002f2f2f002f, 0xb4b400b4b4b400b4, 0x7878007878780078, - 0x0606000606060006, 0xe7e700e7e7e700e7, 0x7171007171710071, - 0xd4d400d4d4d400d4, 0xabab00ababab00ab, 0x8888008888880088, - 0x8d8d008d8d8d008d, 0x7272007272720072, 0xb9b900b9b9b900b9, - 0xf8f800f8f8f800f8, 0xacac00acacac00ac, 0x3636003636360036, - 0x2a2a002a2a2a002a, 0x3c3c003c3c3c003c, 0xf1f100f1f1f100f1, - 0x4040004040400040, 0xd3d300d3d3d300d3, 0xbbbb00bbbbbb00bb, - 0x4343004343430043, 0x1515001515150015, 0xadad00adadad00ad, - 0x7777007777770077, 0x8080008080800080, 0x8282008282820082, - 0xecec00ececec00ec, 0x2727002727270027, 0xe5e500e5e5e500e5, - 0x8585008585850085, 0x3535003535350035, 0x0c0c000c0c0c000c, - 0x4141004141410041, 0xefef00efefef00ef, 0x9393009393930093, - 0x1919001919190019, 0x2121002121210021, 0x0e0e000e0e0e000e, - 0x4e4e004e4e4e004e, 0x6565006565650065, 0xbdbd00bdbdbd00bd, - 0xb8b800b8b8b800b8, 0x8f8f008f8f8f008f, 0xebeb00ebebeb00eb, - 0xcece00cecece00ce, 0x3030003030300030, 0x5f5f005f5f5f005f, - 0xc5c500c5c5c500c5, 0x1a1a001a1a1a001a, 0xe1e100e1e1e100e1, - 0xcaca00cacaca00ca, 0x4747004747470047, 0x3d3d003d3d3d003d, - 0x0101000101010001, 0xd6d600d6d6d600d6, 0x5656005656560056, - 0x4d4d004d4d4d004d, 0x0d0d000d0d0d000d, 0x6666006666660066, - 0xcccc00cccccc00cc, 0x2d2d002d2d2d002d, 0x1212001212120012, - 0x2020002020200020, 0xb1b100b1b1b100b1, 0x9999009999990099, - 0x4c4c004c4c4c004c, 0xc2c200c2c2c200c2, 0x7e7e007e7e7e007e, - 0x0505000505050005, 0xb7b700b7b7b700b7, 0x3131003131310031, - 0x1717001717170017, 0xd7d700d7d7d700d7, 0x5858005858580058, - 0x6161006161610061, 0x1b1b001b1b1b001b, 0x1c1c001c1c1c001c, - 0x0f0f000f0f0f000f, 0x1616001616160016, 0x1818001818180018, - 0x2222002222220022, 0x4444004444440044, 0xb2b200b2b2b200b2, - 0xb5b500b5b5b500b5, 0x9191009191910091, 0x0808000808080008, - 0xa8a800a8a8a800a8, 0xfcfc00fcfcfc00fc, 0x5050005050500050, - 0xd0d000d0d0d000d0, 0x7d7d007d7d7d007d, 0x8989008989890089, - 0x9797009797970097, 0x5b5b005b5b5b005b, 0x9595009595950095, - 0xffff00ffffff00ff, 0xd2d200d2d2d200d2, 0xc4c400c4c4c400c4, - 0x4848004848480048, 0xf7f700f7f7f700f7, 0xdbdb00dbdbdb00db, - 0x0303000303030003, 0xdada00dadada00da, 0x3f3f003f3f3f003f, - 0x9494009494940094, 0x5c5c005c5c5c005c, 0x0202000202020002, - 0x4a4a004a4a4a004a, 0x3333003333330033, 0x6767006767670067, - 0xf3f300f3f3f300f3, 0x7f7f007f7f7f007f, 0xe2e200e2e2e200e2, - 0x9b9b009b9b9b009b, 0x2626002626260026, 0x3737003737370037, - 0x3b3b003b3b3b003b, 0x9696009696960096, 0x4b4b004b4b4b004b, - 0xbebe00bebebe00be, 0x2e2e002e2e2e002e, 0x7979007979790079, - 0x8c8c008c8c8c008c, 0x6e6e006e6e6e006e, 0x8e8e008e8e8e008e, - 0xf5f500f5f5f500f5, 0xb6b600b6b6b600b6, 0xfdfd00fdfdfd00fd, - 0x5959005959590059, 0x9898009898980098, 0x6a6a006a6a6a006a, - 0x4646004646460046, 0xbaba00bababa00ba, 0x2525002525250025, - 0x4242004242420042, 0xa2a200a2a2a200a2, 0xfafa00fafafa00fa, - 0x0707000707070007, 0x5555005555550055, 0xeeee00eeeeee00ee, - 0x0a0a000a0a0a000a, 0x4949004949490049, 0x6868006868680068, - 0x3838003838380038, 0xa4a400a4a4a400a4, 0x2828002828280028, - 0x7b7b007b7b7b007b, 0xc9c900c9c9c900c9, 0xc1c100c1c1c100c1, - 0xe3e300e3e3e300e3, 0xf4f400f4f4f400f4, 0xc7c700c7c7c700c7, - 0x9e9e009e9e9e009e, + 0x7070007070700070ULL, 0x2c2c002c2c2c002cULL, 0xb3b300b3b3b300b3ULL, + 0xc0c000c0c0c000c0ULL, 0xe4e400e4e4e400e4ULL, 0x5757005757570057ULL, + 0xeaea00eaeaea00eaULL, 0xaeae00aeaeae00aeULL, 0x2323002323230023ULL, + 0x6b6b006b6b6b006bULL, 0x4545004545450045ULL, 
0xa5a500a5a5a500a5ULL, + 0xeded00ededed00edULL, 0x4f4f004f4f4f004fULL, 0x1d1d001d1d1d001dULL, + 0x9292009292920092ULL, 0x8686008686860086ULL, 0xafaf00afafaf00afULL, + 0x7c7c007c7c7c007cULL, 0x1f1f001f1f1f001fULL, 0x3e3e003e3e3e003eULL, + 0xdcdc00dcdcdc00dcULL, 0x5e5e005e5e5e005eULL, 0x0b0b000b0b0b000bULL, + 0xa6a600a6a6a600a6ULL, 0x3939003939390039ULL, 0xd5d500d5d5d500d5ULL, + 0x5d5d005d5d5d005dULL, 0xd9d900d9d9d900d9ULL, 0x5a5a005a5a5a005aULL, + 0x5151005151510051ULL, 0x6c6c006c6c6c006cULL, 0x8b8b008b8b8b008bULL, + 0x9a9a009a9a9a009aULL, 0xfbfb00fbfbfb00fbULL, 0xb0b000b0b0b000b0ULL, + 0x7474007474740074ULL, 0x2b2b002b2b2b002bULL, 0xf0f000f0f0f000f0ULL, + 0x8484008484840084ULL, 0xdfdf00dfdfdf00dfULL, 0xcbcb00cbcbcb00cbULL, + 0x3434003434340034ULL, 0x7676007676760076ULL, 0x6d6d006d6d6d006dULL, + 0xa9a900a9a9a900a9ULL, 0xd1d100d1d1d100d1ULL, 0x0404000404040004ULL, + 0x1414001414140014ULL, 0x3a3a003a3a3a003aULL, 0xdede00dedede00deULL, + 0x1111001111110011ULL, 0x3232003232320032ULL, 0x9c9c009c9c9c009cULL, + 0x5353005353530053ULL, 0xf2f200f2f2f200f2ULL, 0xfefe00fefefe00feULL, + 0xcfcf00cfcfcf00cfULL, 0xc3c300c3c3c300c3ULL, 0x7a7a007a7a7a007aULL, + 0x2424002424240024ULL, 0xe8e800e8e8e800e8ULL, 0x6060006060600060ULL, + 0x6969006969690069ULL, 0xaaaa00aaaaaa00aaULL, 0xa0a000a0a0a000a0ULL, + 0xa1a100a1a1a100a1ULL, 0x6262006262620062ULL, 0x5454005454540054ULL, + 0x1e1e001e1e1e001eULL, 0xe0e000e0e0e000e0ULL, 0x6464006464640064ULL, + 0x1010001010100010ULL, 0x0000000000000000ULL, 0xa3a300a3a3a300a3ULL, + 0x7575007575750075ULL, 0x8a8a008a8a8a008aULL, 0xe6e600e6e6e600e6ULL, + 0x0909000909090009ULL, 0xdddd00dddddd00ddULL, 0x8787008787870087ULL, + 0x8383008383830083ULL, 0xcdcd00cdcdcd00cdULL, 0x9090009090900090ULL, + 0x7373007373730073ULL, 0xf6f600f6f6f600f6ULL, 0x9d9d009d9d9d009dULL, + 0xbfbf00bfbfbf00bfULL, 0x5252005252520052ULL, 0xd8d800d8d8d800d8ULL, + 0xc8c800c8c8c800c8ULL, 0xc6c600c6c6c600c6ULL, 0x8181008181810081ULL, + 0x6f6f006f6f6f006fULL, 0x1313001313130013ULL, 0x6363006363630063ULL, + 0xe9e900e9e9e900e9ULL, 0xa7a700a7a7a700a7ULL, 0x9f9f009f9f9f009fULL, + 0xbcbc00bcbcbc00bcULL, 0x2929002929290029ULL, 0xf9f900f9f9f900f9ULL, + 0x2f2f002f2f2f002fULL, 0xb4b400b4b4b400b4ULL, 0x7878007878780078ULL, + 0x0606000606060006ULL, 0xe7e700e7e7e700e7ULL, 0x7171007171710071ULL, + 0xd4d400d4d4d400d4ULL, 0xabab00ababab00abULL, 0x8888008888880088ULL, + 0x8d8d008d8d8d008dULL, 0x7272007272720072ULL, 0xb9b900b9b9b900b9ULL, + 0xf8f800f8f8f800f8ULL, 0xacac00acacac00acULL, 0x3636003636360036ULL, + 0x2a2a002a2a2a002aULL, 0x3c3c003c3c3c003cULL, 0xf1f100f1f1f100f1ULL, + 0x4040004040400040ULL, 0xd3d300d3d3d300d3ULL, 0xbbbb00bbbbbb00bbULL, + 0x4343004343430043ULL, 0x1515001515150015ULL, 0xadad00adadad00adULL, + 0x7777007777770077ULL, 0x8080008080800080ULL, 0x8282008282820082ULL, + 0xecec00ececec00ecULL, 0x2727002727270027ULL, 0xe5e500e5e5e500e5ULL, + 0x8585008585850085ULL, 0x3535003535350035ULL, 0x0c0c000c0c0c000cULL, + 0x4141004141410041ULL, 0xefef00efefef00efULL, 0x9393009393930093ULL, + 0x1919001919190019ULL, 0x2121002121210021ULL, 0x0e0e000e0e0e000eULL, + 0x4e4e004e4e4e004eULL, 0x6565006565650065ULL, 0xbdbd00bdbdbd00bdULL, + 0xb8b800b8b8b800b8ULL, 0x8f8f008f8f8f008fULL, 0xebeb00ebebeb00ebULL, + 0xcece00cecece00ceULL, 0x3030003030300030ULL, 0x5f5f005f5f5f005fULL, + 0xc5c500c5c5c500c5ULL, 0x1a1a001a1a1a001aULL, 0xe1e100e1e1e100e1ULL, + 0xcaca00cacaca00caULL, 0x4747004747470047ULL, 0x3d3d003d3d3d003dULL, + 0x0101000101010001ULL, 0xd6d600d6d6d600d6ULL, 0x5656005656560056ULL, + 0x4d4d004d4d4d004dULL, 0x0d0d000d0d0d000dULL, 
0x6666006666660066ULL, + 0xcccc00cccccc00ccULL, 0x2d2d002d2d2d002dULL, 0x1212001212120012ULL, + 0x2020002020200020ULL, 0xb1b100b1b1b100b1ULL, 0x9999009999990099ULL, + 0x4c4c004c4c4c004cULL, 0xc2c200c2c2c200c2ULL, 0x7e7e007e7e7e007eULL, + 0x0505000505050005ULL, 0xb7b700b7b7b700b7ULL, 0x3131003131310031ULL, + 0x1717001717170017ULL, 0xd7d700d7d7d700d7ULL, 0x5858005858580058ULL, + 0x6161006161610061ULL, 0x1b1b001b1b1b001bULL, 0x1c1c001c1c1c001cULL, + 0x0f0f000f0f0f000fULL, 0x1616001616160016ULL, 0x1818001818180018ULL, + 0x2222002222220022ULL, 0x4444004444440044ULL, 0xb2b200b2b2b200b2ULL, + 0xb5b500b5b5b500b5ULL, 0x9191009191910091ULL, 0x0808000808080008ULL, + 0xa8a800a8a8a800a8ULL, 0xfcfc00fcfcfc00fcULL, 0x5050005050500050ULL, + 0xd0d000d0d0d000d0ULL, 0x7d7d007d7d7d007dULL, 0x8989008989890089ULL, + 0x9797009797970097ULL, 0x5b5b005b5b5b005bULL, 0x9595009595950095ULL, + 0xffff00ffffff00ffULL, 0xd2d200d2d2d200d2ULL, 0xc4c400c4c4c400c4ULL, + 0x4848004848480048ULL, 0xf7f700f7f7f700f7ULL, 0xdbdb00dbdbdb00dbULL, + 0x0303000303030003ULL, 0xdada00dadada00daULL, 0x3f3f003f3f3f003fULL, + 0x9494009494940094ULL, 0x5c5c005c5c5c005cULL, 0x0202000202020002ULL, + 0x4a4a004a4a4a004aULL, 0x3333003333330033ULL, 0x6767006767670067ULL, + 0xf3f300f3f3f300f3ULL, 0x7f7f007f7f7f007fULL, 0xe2e200e2e2e200e2ULL, + 0x9b9b009b9b9b009bULL, 0x2626002626260026ULL, 0x3737003737370037ULL, + 0x3b3b003b3b3b003bULL, 0x9696009696960096ULL, 0x4b4b004b4b4b004bULL, + 0xbebe00bebebe00beULL, 0x2e2e002e2e2e002eULL, 0x7979007979790079ULL, + 0x8c8c008c8c8c008cULL, 0x6e6e006e6e6e006eULL, 0x8e8e008e8e8e008eULL, + 0xf5f500f5f5f500f5ULL, 0xb6b600b6b6b600b6ULL, 0xfdfd00fdfdfd00fdULL, + 0x5959005959590059ULL, 0x9898009898980098ULL, 0x6a6a006a6a6a006aULL, + 0x4646004646460046ULL, 0xbaba00bababa00baULL, 0x2525002525250025ULL, + 0x4242004242420042ULL, 0xa2a200a2a2a200a2ULL, 0xfafa00fafafa00faULL, + 0x0707000707070007ULL, 0x5555005555550055ULL, 0xeeee00eeeeee00eeULL, + 0x0a0a000a0a0a000aULL, 0x4949004949490049ULL, 0x6868006868680068ULL, + 0x3838003838380038ULL, 0xa4a400a4a4a400a4ULL, 0x2828002828280028ULL, + 0x7b7b007b7b7b007bULL, 0xc9c900c9c9c900c9ULL, 0xc1c100c1c1c100c1ULL, + 0xe3e300e3e3e300e3ULL, 0xf4f400f4f4f400f4ULL, 0xc7c700c7c7c700c7ULL, + 0x9e9e009e9e9e009eULL, }; const u64 camellia_sp11101110[256] = { - 0x7070700070707000, 0x8282820082828200, 0x2c2c2c002c2c2c00, - 0xececec00ececec00, 0xb3b3b300b3b3b300, 0x2727270027272700, - 0xc0c0c000c0c0c000, 0xe5e5e500e5e5e500, 0xe4e4e400e4e4e400, - 0x8585850085858500, 0x5757570057575700, 0x3535350035353500, - 0xeaeaea00eaeaea00, 0x0c0c0c000c0c0c00, 0xaeaeae00aeaeae00, - 0x4141410041414100, 0x2323230023232300, 0xefefef00efefef00, - 0x6b6b6b006b6b6b00, 0x9393930093939300, 0x4545450045454500, - 0x1919190019191900, 0xa5a5a500a5a5a500, 0x2121210021212100, - 0xededed00ededed00, 0x0e0e0e000e0e0e00, 0x4f4f4f004f4f4f00, - 0x4e4e4e004e4e4e00, 0x1d1d1d001d1d1d00, 0x6565650065656500, - 0x9292920092929200, 0xbdbdbd00bdbdbd00, 0x8686860086868600, - 0xb8b8b800b8b8b800, 0xafafaf00afafaf00, 0x8f8f8f008f8f8f00, - 0x7c7c7c007c7c7c00, 0xebebeb00ebebeb00, 0x1f1f1f001f1f1f00, - 0xcecece00cecece00, 0x3e3e3e003e3e3e00, 0x3030300030303000, - 0xdcdcdc00dcdcdc00, 0x5f5f5f005f5f5f00, 0x5e5e5e005e5e5e00, - 0xc5c5c500c5c5c500, 0x0b0b0b000b0b0b00, 0x1a1a1a001a1a1a00, - 0xa6a6a600a6a6a600, 0xe1e1e100e1e1e100, 0x3939390039393900, - 0xcacaca00cacaca00, 0xd5d5d500d5d5d500, 0x4747470047474700, - 0x5d5d5d005d5d5d00, 0x3d3d3d003d3d3d00, 0xd9d9d900d9d9d900, - 0x0101010001010100, 0x5a5a5a005a5a5a00, 0xd6d6d600d6d6d600, - 0x5151510051515100, 
0x5656560056565600, 0x6c6c6c006c6c6c00, - 0x4d4d4d004d4d4d00, 0x8b8b8b008b8b8b00, 0x0d0d0d000d0d0d00, - 0x9a9a9a009a9a9a00, 0x6666660066666600, 0xfbfbfb00fbfbfb00, - 0xcccccc00cccccc00, 0xb0b0b000b0b0b000, 0x2d2d2d002d2d2d00, - 0x7474740074747400, 0x1212120012121200, 0x2b2b2b002b2b2b00, - 0x2020200020202000, 0xf0f0f000f0f0f000, 0xb1b1b100b1b1b100, - 0x8484840084848400, 0x9999990099999900, 0xdfdfdf00dfdfdf00, - 0x4c4c4c004c4c4c00, 0xcbcbcb00cbcbcb00, 0xc2c2c200c2c2c200, - 0x3434340034343400, 0x7e7e7e007e7e7e00, 0x7676760076767600, - 0x0505050005050500, 0x6d6d6d006d6d6d00, 0xb7b7b700b7b7b700, - 0xa9a9a900a9a9a900, 0x3131310031313100, 0xd1d1d100d1d1d100, - 0x1717170017171700, 0x0404040004040400, 0xd7d7d700d7d7d700, - 0x1414140014141400, 0x5858580058585800, 0x3a3a3a003a3a3a00, - 0x6161610061616100, 0xdedede00dedede00, 0x1b1b1b001b1b1b00, - 0x1111110011111100, 0x1c1c1c001c1c1c00, 0x3232320032323200, - 0x0f0f0f000f0f0f00, 0x9c9c9c009c9c9c00, 0x1616160016161600, - 0x5353530053535300, 0x1818180018181800, 0xf2f2f200f2f2f200, - 0x2222220022222200, 0xfefefe00fefefe00, 0x4444440044444400, - 0xcfcfcf00cfcfcf00, 0xb2b2b200b2b2b200, 0xc3c3c300c3c3c300, - 0xb5b5b500b5b5b500, 0x7a7a7a007a7a7a00, 0x9191910091919100, - 0x2424240024242400, 0x0808080008080800, 0xe8e8e800e8e8e800, - 0xa8a8a800a8a8a800, 0x6060600060606000, 0xfcfcfc00fcfcfc00, - 0x6969690069696900, 0x5050500050505000, 0xaaaaaa00aaaaaa00, - 0xd0d0d000d0d0d000, 0xa0a0a000a0a0a000, 0x7d7d7d007d7d7d00, - 0xa1a1a100a1a1a100, 0x8989890089898900, 0x6262620062626200, - 0x9797970097979700, 0x5454540054545400, 0x5b5b5b005b5b5b00, - 0x1e1e1e001e1e1e00, 0x9595950095959500, 0xe0e0e000e0e0e000, - 0xffffff00ffffff00, 0x6464640064646400, 0xd2d2d200d2d2d200, - 0x1010100010101000, 0xc4c4c400c4c4c400, 0x0000000000000000, - 0x4848480048484800, 0xa3a3a300a3a3a300, 0xf7f7f700f7f7f700, - 0x7575750075757500, 0xdbdbdb00dbdbdb00, 0x8a8a8a008a8a8a00, - 0x0303030003030300, 0xe6e6e600e6e6e600, 0xdadada00dadada00, - 0x0909090009090900, 0x3f3f3f003f3f3f00, 0xdddddd00dddddd00, - 0x9494940094949400, 0x8787870087878700, 0x5c5c5c005c5c5c00, - 0x8383830083838300, 0x0202020002020200, 0xcdcdcd00cdcdcd00, - 0x4a4a4a004a4a4a00, 0x9090900090909000, 0x3333330033333300, - 0x7373730073737300, 0x6767670067676700, 0xf6f6f600f6f6f600, - 0xf3f3f300f3f3f300, 0x9d9d9d009d9d9d00, 0x7f7f7f007f7f7f00, - 0xbfbfbf00bfbfbf00, 0xe2e2e200e2e2e200, 0x5252520052525200, - 0x9b9b9b009b9b9b00, 0xd8d8d800d8d8d800, 0x2626260026262600, - 0xc8c8c800c8c8c800, 0x3737370037373700, 0xc6c6c600c6c6c600, - 0x3b3b3b003b3b3b00, 0x8181810081818100, 0x9696960096969600, - 0x6f6f6f006f6f6f00, 0x4b4b4b004b4b4b00, 0x1313130013131300, - 0xbebebe00bebebe00, 0x6363630063636300, 0x2e2e2e002e2e2e00, - 0xe9e9e900e9e9e900, 0x7979790079797900, 0xa7a7a700a7a7a700, - 0x8c8c8c008c8c8c00, 0x9f9f9f009f9f9f00, 0x6e6e6e006e6e6e00, - 0xbcbcbc00bcbcbc00, 0x8e8e8e008e8e8e00, 0x2929290029292900, - 0xf5f5f500f5f5f500, 0xf9f9f900f9f9f900, 0xb6b6b600b6b6b600, - 0x2f2f2f002f2f2f00, 0xfdfdfd00fdfdfd00, 0xb4b4b400b4b4b400, - 0x5959590059595900, 0x7878780078787800, 0x9898980098989800, - 0x0606060006060600, 0x6a6a6a006a6a6a00, 0xe7e7e700e7e7e700, - 0x4646460046464600, 0x7171710071717100, 0xbababa00bababa00, - 0xd4d4d400d4d4d400, 0x2525250025252500, 0xababab00ababab00, - 0x4242420042424200, 0x8888880088888800, 0xa2a2a200a2a2a200, - 0x8d8d8d008d8d8d00, 0xfafafa00fafafa00, 0x7272720072727200, - 0x0707070007070700, 0xb9b9b900b9b9b900, 0x5555550055555500, - 0xf8f8f800f8f8f800, 0xeeeeee00eeeeee00, 0xacacac00acacac00, - 0x0a0a0a000a0a0a00, 0x3636360036363600, 
0x4949490049494900, - 0x2a2a2a002a2a2a00, 0x6868680068686800, 0x3c3c3c003c3c3c00, - 0x3838380038383800, 0xf1f1f100f1f1f100, 0xa4a4a400a4a4a400, - 0x4040400040404000, 0x2828280028282800, 0xd3d3d300d3d3d300, - 0x7b7b7b007b7b7b00, 0xbbbbbb00bbbbbb00, 0xc9c9c900c9c9c900, - 0x4343430043434300, 0xc1c1c100c1c1c100, 0x1515150015151500, - 0xe3e3e300e3e3e300, 0xadadad00adadad00, 0xf4f4f400f4f4f400, - 0x7777770077777700, 0xc7c7c700c7c7c700, 0x8080800080808000, - 0x9e9e9e009e9e9e00, + 0x7070700070707000ULL, 0x8282820082828200ULL, 0x2c2c2c002c2c2c00ULL, + 0xececec00ececec00ULL, 0xb3b3b300b3b3b300ULL, 0x2727270027272700ULL, + 0xc0c0c000c0c0c000ULL, 0xe5e5e500e5e5e500ULL, 0xe4e4e400e4e4e400ULL, + 0x8585850085858500ULL, 0x5757570057575700ULL, 0x3535350035353500ULL, + 0xeaeaea00eaeaea00ULL, 0x0c0c0c000c0c0c00ULL, 0xaeaeae00aeaeae00ULL, + 0x4141410041414100ULL, 0x2323230023232300ULL, 0xefefef00efefef00ULL, + 0x6b6b6b006b6b6b00ULL, 0x9393930093939300ULL, 0x4545450045454500ULL, + 0x1919190019191900ULL, 0xa5a5a500a5a5a500ULL, 0x2121210021212100ULL, + 0xededed00ededed00ULL, 0x0e0e0e000e0e0e00ULL, 0x4f4f4f004f4f4f00ULL, + 0x4e4e4e004e4e4e00ULL, 0x1d1d1d001d1d1d00ULL, 0x6565650065656500ULL, + 0x9292920092929200ULL, 0xbdbdbd00bdbdbd00ULL, 0x8686860086868600ULL, + 0xb8b8b800b8b8b800ULL, 0xafafaf00afafaf00ULL, 0x8f8f8f008f8f8f00ULL, + 0x7c7c7c007c7c7c00ULL, 0xebebeb00ebebeb00ULL, 0x1f1f1f001f1f1f00ULL, + 0xcecece00cecece00ULL, 0x3e3e3e003e3e3e00ULL, 0x3030300030303000ULL, + 0xdcdcdc00dcdcdc00ULL, 0x5f5f5f005f5f5f00ULL, 0x5e5e5e005e5e5e00ULL, + 0xc5c5c500c5c5c500ULL, 0x0b0b0b000b0b0b00ULL, 0x1a1a1a001a1a1a00ULL, + 0xa6a6a600a6a6a600ULL, 0xe1e1e100e1e1e100ULL, 0x3939390039393900ULL, + 0xcacaca00cacaca00ULL, 0xd5d5d500d5d5d500ULL, 0x4747470047474700ULL, + 0x5d5d5d005d5d5d00ULL, 0x3d3d3d003d3d3d00ULL, 0xd9d9d900d9d9d900ULL, + 0x0101010001010100ULL, 0x5a5a5a005a5a5a00ULL, 0xd6d6d600d6d6d600ULL, + 0x5151510051515100ULL, 0x5656560056565600ULL, 0x6c6c6c006c6c6c00ULL, + 0x4d4d4d004d4d4d00ULL, 0x8b8b8b008b8b8b00ULL, 0x0d0d0d000d0d0d00ULL, + 0x9a9a9a009a9a9a00ULL, 0x6666660066666600ULL, 0xfbfbfb00fbfbfb00ULL, + 0xcccccc00cccccc00ULL, 0xb0b0b000b0b0b000ULL, 0x2d2d2d002d2d2d00ULL, + 0x7474740074747400ULL, 0x1212120012121200ULL, 0x2b2b2b002b2b2b00ULL, + 0x2020200020202000ULL, 0xf0f0f000f0f0f000ULL, 0xb1b1b100b1b1b100ULL, + 0x8484840084848400ULL, 0x9999990099999900ULL, 0xdfdfdf00dfdfdf00ULL, + 0x4c4c4c004c4c4c00ULL, 0xcbcbcb00cbcbcb00ULL, 0xc2c2c200c2c2c200ULL, + 0x3434340034343400ULL, 0x7e7e7e007e7e7e00ULL, 0x7676760076767600ULL, + 0x0505050005050500ULL, 0x6d6d6d006d6d6d00ULL, 0xb7b7b700b7b7b700ULL, + 0xa9a9a900a9a9a900ULL, 0x3131310031313100ULL, 0xd1d1d100d1d1d100ULL, + 0x1717170017171700ULL, 0x0404040004040400ULL, 0xd7d7d700d7d7d700ULL, + 0x1414140014141400ULL, 0x5858580058585800ULL, 0x3a3a3a003a3a3a00ULL, + 0x6161610061616100ULL, 0xdedede00dedede00ULL, 0x1b1b1b001b1b1b00ULL, + 0x1111110011111100ULL, 0x1c1c1c001c1c1c00ULL, 0x3232320032323200ULL, + 0x0f0f0f000f0f0f00ULL, 0x9c9c9c009c9c9c00ULL, 0x1616160016161600ULL, + 0x5353530053535300ULL, 0x1818180018181800ULL, 0xf2f2f200f2f2f200ULL, + 0x2222220022222200ULL, 0xfefefe00fefefe00ULL, 0x4444440044444400ULL, + 0xcfcfcf00cfcfcf00ULL, 0xb2b2b200b2b2b200ULL, 0xc3c3c300c3c3c300ULL, + 0xb5b5b500b5b5b500ULL, 0x7a7a7a007a7a7a00ULL, 0x9191910091919100ULL, + 0x2424240024242400ULL, 0x0808080008080800ULL, 0xe8e8e800e8e8e800ULL, + 0xa8a8a800a8a8a800ULL, 0x6060600060606000ULL, 0xfcfcfc00fcfcfc00ULL, + 0x6969690069696900ULL, 0x5050500050505000ULL, 0xaaaaaa00aaaaaa00ULL, + 0xd0d0d000d0d0d000ULL, 
0xa0a0a000a0a0a000ULL, 0x7d7d7d007d7d7d00ULL, + 0xa1a1a100a1a1a100ULL, 0x8989890089898900ULL, 0x6262620062626200ULL, + 0x9797970097979700ULL, 0x5454540054545400ULL, 0x5b5b5b005b5b5b00ULL, + 0x1e1e1e001e1e1e00ULL, 0x9595950095959500ULL, 0xe0e0e000e0e0e000ULL, + 0xffffff00ffffff00ULL, 0x6464640064646400ULL, 0xd2d2d200d2d2d200ULL, + 0x1010100010101000ULL, 0xc4c4c400c4c4c400ULL, 0x0000000000000000ULL, + 0x4848480048484800ULL, 0xa3a3a300a3a3a300ULL, 0xf7f7f700f7f7f700ULL, + 0x7575750075757500ULL, 0xdbdbdb00dbdbdb00ULL, 0x8a8a8a008a8a8a00ULL, + 0x0303030003030300ULL, 0xe6e6e600e6e6e600ULL, 0xdadada00dadada00ULL, + 0x0909090009090900ULL, 0x3f3f3f003f3f3f00ULL, 0xdddddd00dddddd00ULL, + 0x9494940094949400ULL, 0x8787870087878700ULL, 0x5c5c5c005c5c5c00ULL, + 0x8383830083838300ULL, 0x0202020002020200ULL, 0xcdcdcd00cdcdcd00ULL, + 0x4a4a4a004a4a4a00ULL, 0x9090900090909000ULL, 0x3333330033333300ULL, + 0x7373730073737300ULL, 0x6767670067676700ULL, 0xf6f6f600f6f6f600ULL, + 0xf3f3f300f3f3f300ULL, 0x9d9d9d009d9d9d00ULL, 0x7f7f7f007f7f7f00ULL, + 0xbfbfbf00bfbfbf00ULL, 0xe2e2e200e2e2e200ULL, 0x5252520052525200ULL, + 0x9b9b9b009b9b9b00ULL, 0xd8d8d800d8d8d800ULL, 0x2626260026262600ULL, + 0xc8c8c800c8c8c800ULL, 0x3737370037373700ULL, 0xc6c6c600c6c6c600ULL, + 0x3b3b3b003b3b3b00ULL, 0x8181810081818100ULL, 0x9696960096969600ULL, + 0x6f6f6f006f6f6f00ULL, 0x4b4b4b004b4b4b00ULL, 0x1313130013131300ULL, + 0xbebebe00bebebe00ULL, 0x6363630063636300ULL, 0x2e2e2e002e2e2e00ULL, + 0xe9e9e900e9e9e900ULL, 0x7979790079797900ULL, 0xa7a7a700a7a7a700ULL, + 0x8c8c8c008c8c8c00ULL, 0x9f9f9f009f9f9f00ULL, 0x6e6e6e006e6e6e00ULL, + 0xbcbcbc00bcbcbc00ULL, 0x8e8e8e008e8e8e00ULL, 0x2929290029292900ULL, + 0xf5f5f500f5f5f500ULL, 0xf9f9f900f9f9f900ULL, 0xb6b6b600b6b6b600ULL, + 0x2f2f2f002f2f2f00ULL, 0xfdfdfd00fdfdfd00ULL, 0xb4b4b400b4b4b400ULL, + 0x5959590059595900ULL, 0x7878780078787800ULL, 0x9898980098989800ULL, + 0x0606060006060600ULL, 0x6a6a6a006a6a6a00ULL, 0xe7e7e700e7e7e700ULL, + 0x4646460046464600ULL, 0x7171710071717100ULL, 0xbababa00bababa00ULL, + 0xd4d4d400d4d4d400ULL, 0x2525250025252500ULL, 0xababab00ababab00ULL, + 0x4242420042424200ULL, 0x8888880088888800ULL, 0xa2a2a200a2a2a200ULL, + 0x8d8d8d008d8d8d00ULL, 0xfafafa00fafafa00ULL, 0x7272720072727200ULL, + 0x0707070007070700ULL, 0xb9b9b900b9b9b900ULL, 0x5555550055555500ULL, + 0xf8f8f800f8f8f800ULL, 0xeeeeee00eeeeee00ULL, 0xacacac00acacac00ULL, + 0x0a0a0a000a0a0a00ULL, 0x3636360036363600ULL, 0x4949490049494900ULL, + 0x2a2a2a002a2a2a00ULL, 0x6868680068686800ULL, 0x3c3c3c003c3c3c00ULL, + 0x3838380038383800ULL, 0xf1f1f100f1f1f100ULL, 0xa4a4a400a4a4a400ULL, + 0x4040400040404000ULL, 0x2828280028282800ULL, 0xd3d3d300d3d3d300ULL, + 0x7b7b7b007b7b7b00ULL, 0xbbbbbb00bbbbbb00ULL, 0xc9c9c900c9c9c900ULL, + 0x4343430043434300ULL, 0xc1c1c100c1c1c100ULL, 0x1515150015151500ULL, + 0xe3e3e300e3e3e300ULL, 0xadadad00adadad00ULL, 0xf4f4f400f4f4f400ULL, + 0x7777770077777700ULL, 0xc7c7c700c7c7c700ULL, 0x8080800080808000ULL, + 0x9e9e9e009e9e9e00ULL, }; /* key constants */ -- cgit v1.1 From d4c9dbc61fe0ca042b835c6f234af12fa5f18310 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 7 Sep 2012 07:54:52 +0100 Subject: x86/mm: Fix range check in tlbflush debugfs interface Since the shift count settable there is used for shifting values of type "unsigned long", its value must not match or exceed BITS_PER_LONG (otherwise the shift operations are undefined). 
Similarly, the value must not be negative (but -1 must be permitted, as that's the value used to distinguish the case of the fine grained flushing being disabled). Signed-off-by: Jan Beulich Cc: Alex Shi Link: http://lkml.kernel.org/r/5049B65C020000780009990C@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 613cd83..a085c56 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -320,7 +320,7 @@ static ssize_t tlbflush_write_file(struct file *file, if (kstrtos8(buf, 0, &shift)) return -EINVAL; - if (shift > 64) + if (shift < -1 || shift >= BITS_PER_LONG) return -EINVAL; tlb_flushall_shift = shift; -- cgit v1.1 From 4f97704555672f9ab48ca623561e96a9430bec9a Mon Sep 17 00:00:00 2001 From: "Ren, Yongjie" Date: Fri, 7 Sep 2012 07:36:59 +0000 Subject: KVM: x86: Check INVPCID feature bit in EBX of leaf 7 Checks and operations on the INVPCID feature bit should use EBX of CPUID leaf 7 instead of ECX. Signed-off-by: Junjie Mao Signed-off-by: Yongjie Ren Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c00f03d..002b4a5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6575,7 +6575,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && - best && (best->ecx & bit(X86_FEATURE_INVPCID)) && + best && (best->ebx & bit(X86_FEATURE_INVPCID)) && guest_cpuid_has_pcid(vcpu)) { exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, @@ -6585,7 +6585,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); if (best) - best->ecx &= ~bit(X86_FEATURE_INVPCID); + best->ebx &= ~bit(X86_FEATURE_INVPCID); } } -- cgit v1.1 From 05194cfc547528a01d655262c49da3b1311c4b11 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 9 Sep 2012 11:12:04 -0700 Subject: x86, cpufeature: Add feature bit for SMAP Add CPUID feature bit for Supervisor Mode Access Prevention (SMAP). Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-ethzcr5nipikl6hd5q8ssepq@git.kernel.org --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 6b7ee5f..633b617 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -209,6 +209,7 @@ #define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.1 From 3dc9a633f8a65b39c5897874138027328bfb0a94 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 4 Sep 2012 08:28:02 +0000 Subject: acpi-cpufreq: Add support for modern AMD CPUs The programming model for P-states on modern AMD CPUs is very similar to that of Intel and VIA. It makes sense to consolidate this support into one driver rather than duplicating functionality between two of them. This patch adds support for AMDs with hardware P-state control to acpi-cpufreq. 
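As a rough sketch of the programming model described above (not taken from this patch), a driver on such hardware-P-state AMD CPUs selects a P-state by writing its index to MSR_AMD_PERF_CTL and can read back what the hardware actually entered from MSR_AMD_PERF_STATUS; the helper name below and the bare index written to the control register are illustrative assumptions, not code from acpi-cpufreq:

	#include <linux/types.h>
	#include <asm/msr.h>

	/* Hypothetical helper, not part of the patch: request P-state 'index'
	 * and return the status MSR so the caller can see what was entered. */
	static u64 amd_hw_pstate_set(unsigned int index)
	{
		u64 status;

		wrmsrl(MSR_AMD_PERF_CTL, index);	/* request the new P-state */
		rdmsrl(MSR_AMD_PERF_STATUS, status);	/* P-state currently in effect */
		return status;
	}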
Signed-off-by: Matthew Garrett Signed-off-by: Andre Przywara Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/msr-index.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 957ec87..1e1f3eb 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -248,6 +248,8 @@ #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 +#define MSR_AMD_PERF_STATUS 0xc0010063 +#define MSR_AMD_PERF_CTL 0xc0010062 #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 -- cgit v1.1 From f594065faf4f9067c2283a34619fc0714e79a98d Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 4 Sep 2012 08:28:06 +0000 Subject: ACPI: Add fixups for AMD P-state figures Some AMD systems may round the frequencies in ACPI tables to 100MHz boundaries. We can obtain the real frequencies from MSRs, so add a quirk to fix these frequencies up on AMD systems. Signed-off-by: Matthew Garrett Signed-off-by: Andre Przywara Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1e1f3eb..fbee971 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -248,6 +248,7 @@ #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 +#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 #define MSR_AMD_PERF_STATUS 0xc0010063 #define MSR_AMD_PERF_CTL 0xc0010062 -- cgit v1.1 From 92b5265d38f6a4d33e9d43974f176f18547687d6 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Mon, 10 Sep 2012 06:55:39 +0800 Subject: KVM: Depend on HIGH_RES_TIMERS The KVM lapic timer and the TSC deadline timer are based on hrtimer: a timer is inserted as the leftmost node of the rb tree and the hrtimer is then reprogrammed. If hrtimer is not configured as high resolution, hrtimer_enqueue_reprogram() does nothing, so the KVM lapic timer and the TSC deadline timer fail. 
Signed-off-by: Liu, Jinsong Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 45c044f..586f000 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -20,6 +20,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + depends on HIGH_RES_TIMERS # for device assignment: depends on PCI # for TASKSTATS/TASK_DELAY_ACCT: -- cgit v1.1 From 7de5bdc96c372ab875408c86e0099958dba89f56 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Sep 2012 14:15:03 +0800 Subject: KVM: MMU: remove unnecessary check Checking the return of kvm_mmu_get_page is unnecessary since it is guaranteed by memory cache Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 399c177..aa0b469 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2616,11 +2616,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, iterator.level - 1, 1, ACC_ALL, iterator.sptep); - if (!sp) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(pfn); - return -ENOMEM; - } mmu_spte_set(iterator.sptep, __pa(sp->spt) -- cgit v1.1 From 4484141a94f4a5afea6ebc0b2abba0aa1b0ae9d1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Sep 2012 14:14:20 +0800 Subject: KVM: fix error paths for failed gfn_to_page() calls This bug was triggered: [ 4220.198458] BUG: unable to handle kernel paging request at fffffffffffffffe [ 4220.203907] IP: [] put_page+0xf/0x34 ...... [ 4220.237326] Call Trace: [ 4220.237361] [] kvm_arch_destroy_vm+0xf9/0x101 [kvm] [ 4220.237382] [] kvm_put_kvm+0xcc/0x127 [kvm] [ 4220.237401] [] kvm_vcpu_release+0x18/0x1c [kvm] [ 4220.237407] [] __fput+0x111/0x1ed [ 4220.237411] [] ____fput+0xe/0x10 [ 4220.237418] [] task_work_run+0x5d/0x88 [ 4220.237424] [] do_exit+0x2bf/0x7ca The test case: #define die(fmt, args...) do { \ printf(fmt, ##args); \ exit(-1); } while (0) static int create_vm(void) { int sys_fd, vm_fd; sys_fd = open("/dev/kvm", O_RDWR); if (sys_fd < 0) die("open /dev/kvm fail.\n"); vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0); if (vm_fd < 0) die("KVM_CREATE_VM fail.\n"); return vm_fd; } static int create_vcpu(int vm_fd) { int vcpu_fd; vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); if (vcpu_fd < 0) die("KVM_CREATE_VCPU ioctl.\n"); printf("Create vcpu.\n"); return vcpu_fd; } static void *vcpu_thread(void *arg) { int vm_fd = (int)(long)arg; create_vcpu(vm_fd); return NULL; } int main(int argc, char *argv[]) { pthread_t thread; int vm_fd; (void)argc; (void)argv; vm_fd = create_vm(); pthread_create(&thread, NULL, vcpu_thread, (void *)(long)vm_fd); printf("Exit.\n"); return 0; } It is caused by releasing kvm->arch.ept_identity_map_addr, which is the error page. The parent thread can send a KILL signal to the vcpu thread while it is exiting, which stops it from faulting pages in and potentially allocating memory. 
So gfn_to_pfn/gfn_to_page may fail at this time Fixed by checking the page before it is used Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 19 ++++++++++++++++--- arch/x86/kvm/x86.c | 13 ++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 002b4a5..b1eb202 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3619,6 +3619,7 @@ static void seg_setup(int seg) static int alloc_apic_access_page(struct kvm *kvm) { + struct page *page; struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; @@ -3633,7 +3634,13 @@ static int alloc_apic_access_page(struct kvm *kvm) if (r) goto out; - kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); + page = gfn_to_page(kvm, 0xfee00); + if (is_error_page(page)) { + r = -EFAULT; + goto out; + } + + kvm->arch.apic_access_page = page; out: mutex_unlock(&kvm->slots_lock); return r; @@ -3641,6 +3648,7 @@ out: static int alloc_identity_pagetable(struct kvm *kvm) { + struct page *page; struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; @@ -3656,8 +3664,13 @@ static int alloc_identity_pagetable(struct kvm *kvm) if (r) goto out; - kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, - kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); + page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); + if (is_error_page(page)) { + r = -EFAULT; + goto out; + } + + kvm->arch.ept_identity_pagetable = page; out: mutex_unlock(&kvm->slots_lock); return r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 148ed66..2966c84 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5113,17 +5113,20 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) !kvm_event_needs_reinjection(vcpu); } -static void vapic_enter(struct kvm_vcpu *vcpu) +static int vapic_enter(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct page *page; if (!apic || !apic->vapic_addr) - return; + return 0; page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); + if (is_error_page(page)) + return -EFAULT; vcpu->arch.apic->vapic_page = page; + return 0; } static void vapic_exit(struct kvm_vcpu *vcpu) @@ -5430,7 +5433,11 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) } vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - vapic_enter(vcpu); + r = vapic_enter(vcpu); + if (r) { + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + return r; + } r = 1; while (r > 0) { -- cgit v1.1 From 280050cc81ccb2e06e4061228ee34c0cc86b1560 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 10 Sep 2012 22:48:33 +0200 Subject: x86 bpf_jit: support MOD operation commit b6069a9570 (filter: add MOD operation) added generic support for modulus operation in BPF. This patch brings JIT support for x86_64 Signed-off-by: Eric Dumazet Cc: Andi Kleen Cc: George Bakos Signed-off-by: David S. 
Miller --- arch/x86/net/bpf_jit_comp.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 33643a8..106c578 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -280,6 +280,31 @@ void bpf_jit_compile(struct sk_filter *fp) } EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */ break; + case BPF_S_ALU_MOD_X: /* A %= X; */ + seen |= SEEN_XREG; + EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ + if (pc_ret0 > 0) { + /* addrs[pc_ret0 - 1] is start address of target + * (addrs[i] - 6) is the address following this jmp + * ("xor %edx,%edx; div %ebx;mov %edx,%eax" being 6 bytes long) + */ + EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] - + (addrs[i] - 6)); + } else { + EMIT_COND_JMP(X86_JNE, 2 + 5); + CLEAR_A(); + EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 6)); /* jmp .+off32 */ + } + EMIT2(0x31, 0xd2); /* xor %edx,%edx */ + EMIT2(0xf7, 0xf3); /* div %ebx */ + EMIT2(0x89, 0xd0); /* mov %edx,%eax */ + break; + case BPF_S_ALU_MOD_K: /* A %= K; */ + EMIT2(0x31, 0xd2); /* xor %edx,%edx */ + EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ + EMIT2(0xf7, 0xf1); /* div %ecx */ + EMIT2(0x89, 0xd0); /* mov %edx,%eax */ + break; case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */ EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */ EMIT(K, 4); -- cgit v1.1 From 73090f8993a40a2f67fed1ab866a928c68cd3765 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:37 +0100 Subject: x86: Remove base argument from x86_init.paging.pagetable_setup_start We either use swapper_pg_dir or the argument is unused. Preparatory patch to simplify platform pagetable setup further. Signed-off-by: Attilio Rao Ackedb-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-2-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable_types.h | 6 +++--- arch/x86/include/asm/x86_init.h | 2 +- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/x86_init.c | 3 ++- arch/x86/mm/init_32.c | 4 ++-- arch/x86/xen/mmu.c | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 013286a1..e02b875 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -303,11 +303,11 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 -extern void native_pagetable_setup_start(pgd_t *base); +extern void native_pagetable_setup_start(void); extern void native_pagetable_setup_done(pgd_t *base); #else -#define native_pagetable_setup_start x86_init_pgd_noop -#define native_pagetable_setup_done x86_init_pgd_noop +#define native_pagetable_setup_start x86_init_pgd_start_noop +#define native_pagetable_setup_done x86_init_pgd_done_noop #endif struct seq_file; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 38155f6..782ba0c 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -85,7 +85,7 @@ struct x86_init_mapping { * @pagetable_setup_done: platform specific post paging_init() call */ struct x86_init_paging { - void (*pagetable_setup_start)(pgd_t *base); + void (*pagetable_setup_start)(void); void (*pagetable_setup_done)(pgd_t *base); }; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f4b9b80..90cbbe0 100644 --- a/arch/x86/kernel/setup.c +++ 
b/arch/x86/kernel/setup.c @@ -961,7 +961,7 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif - x86_init.paging.pagetable_setup_start(swapper_pg_dir); + x86_init.paging.pagetable_setup_start(); paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9f3167e..3b88493 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -26,7 +26,8 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -void __init x86_init_pgd_noop(pgd_t *unused) { } +void __init x86_init_pgd_start_noop(void) { } +void __init x86_init_pgd_done_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 575d86f..c4aa1b2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -445,10 +445,10 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base) } #endif /* CONFIG_HIGHMEM */ -void __init native_pagetable_setup_start(pgd_t *base) +void __init native_pagetable_setup_start(void) { unsigned long pfn, va; - pgd_t *pgd; + pgd_t *pgd, *base = swapper_pg_dir; pud_t *pud; pmd_t *pmd; pte_t *pte; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5141d80..32e66c8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1174,7 +1174,7 @@ static void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static void __init xen_pagetable_setup_start(pgd_t *base) +static void __init xen_pagetable_setup_start(void) { } -- cgit v1.1 From 7737b215ad0f94d20a87d98315da9f6cadaf35c9 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:38 +0100 Subject: x86: Rename pagetable_setup_start() to pagetable_init() In preparation for unifying the pagetable_setup_start() and pagetable_setup_done() setup functions, rename appropriately all the infrastructure related to pagetable_setup_start(). 
Signed-off-by: Attilio Rao Ackedd-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-3-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable_types.h | 4 ++-- arch/x86/include/asm/x86_init.h | 4 ++-- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/x86_init.c | 4 ++-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/xen/mmu.c | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index e02b875..0c01e07 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -303,10 +303,10 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 -extern void native_pagetable_setup_start(void); +extern void native_pagetable_init(void); extern void native_pagetable_setup_done(pgd_t *base); #else -#define native_pagetable_setup_start x86_init_pgd_start_noop +#define native_pagetable_init x86_init_pgd_init_noop #define native_pagetable_setup_done x86_init_pgd_done_noop #endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 782ba0c..24084b2 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -81,11 +81,11 @@ struct x86_init_mapping { /** * struct x86_init_paging - platform specific paging functions - * @pagetable_setup_start: platform specific pre paging_init() call + * @pagetable_init: platform specific paging initialization call * @pagetable_setup_done: platform specific post paging_init() call */ struct x86_init_paging { - void (*pagetable_setup_start)(void); + void (*pagetable_init)(void); void (*pagetable_setup_done)(pgd_t *base); }; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 90cbbe0..61b7d98 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -961,7 +961,7 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif - x86_init.paging.pagetable_setup_start(); + x86_init.paging.pagetable_init(); paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3b88493..0e1e950 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -26,7 +26,7 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -void __init x86_init_pgd_start_noop(void) { } +void __init x86_init_pgd_init_noop(void) { } void __init x86_init_pgd_done_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } @@ -69,7 +69,7 @@ struct x86_init_ops x86_init __initdata = { }, .paging = { - .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_init = native_pagetable_init, .pagetable_setup_done = native_pagetable_setup_done, }, diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c4aa1b2..0e38e0e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -445,7 +445,7 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base) } #endif /* CONFIG_HIGHMEM */ -void __init native_pagetable_setup_start(void) +void __init native_pagetable_init(void) { unsigned long pfn, va; pgd_t *pgd, *base = swapper_pg_dir; @@ -493,7 +493,7 @@ void __init native_pagetable_setup_done(pgd_t *base) * If we're booting paravirtualized under a hypervisor, then there are * more options: we may already be running PAE, and the pagetable may * or may not be 
based in swapper_pg_dir. In any case, - * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * paravirt_pagetable_init() will set up swapper_pg_dir * appropriately for the rest of the initialization to work. * * In general, pagetable_init() assumes that the pagetable may already diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 32e66c8..624efbe 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1174,7 +1174,7 @@ static void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static void __init xen_pagetable_setup_start(void) +static void __init xen_pagetable_init(void) { } @@ -2068,7 +2068,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; - x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; + x86_init.paging.pagetable_init = xen_pagetable_init; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; -- cgit v1.1 From 843b8ed2ec598aae5e3516b21957ede62a070e36 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:39 +0100 Subject: x86: Move paging_init() call to x86_init.paging.pagetable_init() Move the paging_init() call to the platform specific pagetable_init() function, so we can get rid of the extra pagetable_setup_done() function pointer. Signed-off-by: Attilio Rao Acked-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-4-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable_types.h | 2 +- arch/x86/kernel/setup.c | 1 - arch/x86/kernel/x86_init.c | 1 - arch/x86/mm/init_32.c | 1 + arch/x86/xen/mmu.c | 1 + 5 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 0c01e07..c93cb8e 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -306,7 +306,7 @@ extern void native_pagetable_reserve(u64 start, u64 end); extern void native_pagetable_init(void); extern void native_pagetable_setup_done(pgd_t *base); #else -#define native_pagetable_init x86_init_pgd_init_noop +#define native_pagetable_init paging_init #define native_pagetable_setup_done x86_init_pgd_done_noop #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 61b7d98..315fd24 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -962,7 +962,6 @@ void __init setup_arch(char **cmdline_p) #endif x86_init.paging.pagetable_init(); - paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); if (boot_cpu_data.cpuid_level >= 0) { diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 0e1e950..5f2478f 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -26,7 +26,6 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -void __init x86_init_pgd_init_noop(void) { } void __init x86_init_pgd_done_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e38e0e..e35b4b1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -475,6 +475,7 @@ void __init native_pagetable_init(void) pte_clear(NULL, va, pte); } paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); + paging_init(); } void __init native_pagetable_setup_done(pgd_t *base) diff --git 
a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 624efbe..c2ff7ea 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1176,6 +1176,7 @@ static void xen_exit_mmap(struct mm_struct *mm) static void __init xen_pagetable_init(void) { + paging_init(); } static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) -- cgit v1.1 From c711288727a62f74d48032e56e51333dd104bf58 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:40 +0100 Subject: x86: xen: Cleanup and remove x86_init.paging.pagetable_setup_done() At this stage x86_init.paging.pagetable_setup_done is only used in the XEN case. Move its content in the x86_init.paging.pagetable_init setup function and remove the now unused x86_init.paging.pagetable_setup_done remaining infrastructure. Signed-off-by: Attilio Rao Acked-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-5-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pgtable_types.h | 2 -- arch/x86/include/asm/x86_init.h | 2 -- arch/x86/kernel/setup.c | 1 - arch/x86/kernel/x86_init.c | 2 -- arch/x86/mm/init_32.c | 4 ---- arch/x86/xen/mmu.c | 13 ++++--------- 6 files changed, 4 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index c93cb8e..db8fec6 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -304,10 +304,8 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); -extern void native_pagetable_setup_done(pgd_t *base); #else #define native_pagetable_init paging_init -#define native_pagetable_setup_done x86_init_pgd_done_noop #endif struct seq_file; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 24084b2..995ea5c3 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -82,11 +82,9 @@ struct x86_init_mapping { /** * struct x86_init_paging - platform specific paging functions * @pagetable_init: platform specific paging initialization call - * @pagetable_setup_done: platform specific post paging_init() call */ struct x86_init_paging { void (*pagetable_init)(void); - void (*pagetable_setup_done)(pgd_t *base); }; /** diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 315fd24..4f16547 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -962,7 +962,6 @@ void __init setup_arch(char **cmdline_p) #endif x86_init.paging.pagetable_init(); - x86_init.paging.pagetable_setup_done(swapper_pg_dir); if (boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 5f2478f..7a3d075 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -26,7 +26,6 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -void __init x86_init_pgd_done_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } @@ -69,7 +68,6 @@ struct x86_init_ops x86_init __initdata = { .paging = { .pagetable_init = native_pagetable_init, - .pagetable_setup_done = native_pagetable_setup_done, }, .timers = { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index e35b4b1..4f04db1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -478,10 +478,6 @@ void __init 
native_pagetable_init(void) paging_init(); } -void __init native_pagetable_setup_done(pgd_t *base) -{ -} - /* * Build a proper pagetable for the kernel mappings. Up until this * point, we've been running on some set of pagetables constructed by diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c2ff7ea..7a769b7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1174,9 +1174,13 @@ static void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } +static void xen_post_allocator_init(void); + static void __init xen_pagetable_init(void) { paging_init(); + xen_setup_shared_info(); + xen_post_allocator_init(); } static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) @@ -1193,14 +1197,6 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) } } -static void xen_post_allocator_init(void); - -static void __init xen_pagetable_setup_done(pgd_t *base) -{ - xen_setup_shared_info(); - xen_post_allocator_init(); -} - static void xen_write_cr2(unsigned long cr2) { this_cpu_read(xen_vcpu)->arch.cr2 = cr2; @@ -2070,7 +2066,6 @@ void __init xen_init_mmu_ops(void) { x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_init = xen_pagetable_init; - x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); -- cgit v1.1 From 64282278989d5b0398dcb3ba7904cb00c621dc35 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Tue, 21 Aug 2012 21:22:41 +0100 Subject: x86: Document x86_init.paging.pagetable_init() Signed-off-by: Attilio Rao Acked-by: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1345580561-8506-6-git-send-email-attilio.rao@citrix.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 995ea5c3..5769349 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -81,7 +81,10 @@ struct x86_init_mapping { /** * struct x86_init_paging - platform specific paging functions - * @pagetable_init: platform specific paging initialization call + * @pagetable_init: platform specific paging initialization call to setup + * the kernel pagetables and prepare accessors functions. + * Callback must call paging_init(). Called once after the + * direct mapping for phys memory is available. */ struct x86_init_paging { void (*pagetable_init)(void); -- cgit v1.1 From 2fc136eecd0c647a6b13fcd00d0c41a1a28f35a5 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 12 Sep 2012 12:44:30 +0100 Subject: xen/m2p: do not reuse kmap_op->dev_bus_addr If the caller passes a valid kmap_op to m2p_add_override, we use kmap_op->dev_bus_addr to store the original mfn, but dev_bus_addr is part of the interface with Xen and if we are batching the hypercalls it might not have been written by the hypervisor yet. That means that later on Xen will write to it and we'll think that the original mfn is actually what Xen has written to it. Rather than "stealing" struct members from kmap_op, keep using page->index to store the original mfn and add another parameter to m2p_remove_override to get the corresponding kmap_op instead. It is now responsibility of the caller to keep track of which kmap_op corresponds to a particular page in the m2p_override (gntdev, the only user of this interface that passes a valid kmap_op, is already doing that). 
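As an illustration of the calling convention this introduces (a sketch under assumptions, not code from the patch or from gntdev): the caller now has to remember which kmap_op it passed to m2p_add_override() for each page and hand that same structure back to m2p_remove_override(); the grant_map_sketch structure and its pages[]/kmap_ops[] arrays below are assumed bookkeeping, not existing kernel fields:

	#include <linux/mm.h>
	#include <asm/xen/page.h>
	#include <xen/grant_table.h>

	struct grant_map_sketch {
		int count;
		struct page **pages;
		struct gnttab_map_grant_ref *kmap_ops;	/* one per page */
	};

	static int sketch_map_pages(struct grant_map_sketch *map)
	{
		int i, err;

		for (i = 0; i < map->count; i++) {
			/* Remember kmap_ops[i] as the op used for pages[i]. */
			err = m2p_add_override(pfn_to_mfn(page_to_pfn(map->pages[i])),
					       map->pages[i], &map->kmap_ops[i]);
			if (err)
				return err;
		}
		return 0;
	}

	static void sketch_unmap_pages(struct grant_map_sketch *map)
	{
		int i;

		/* Teardown passes back the same kmap_op instead of a clear_pte flag. */
		for (i = 0; i < map->count; i++)
			m2p_remove_override(map->pages[i], &map->kmap_ops[i]);
	}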
CC: stable@kernel.org Reported-and-Tested-By: Sander Eikelenboom Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 3 ++- arch/x86/xen/p2m.c | 27 +++++++++++---------------- 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 93971e8..472b9b7 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -51,7 +51,8 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s, extern int m2p_add_override(unsigned long mfn, struct page *page, struct gnttab_map_grant_ref *kmap_op); -extern int m2p_remove_override(struct page *page, bool clear_pte); +extern int m2p_remove_override(struct page *page, + struct gnttab_map_grant_ref *kmap_op); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 76ba0e9..72213da 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -828,9 +828,6 @@ int m2p_add_override(unsigned long mfn, struct page *page, xen_mc_issue(PARAVIRT_LAZY_MMU); } - /* let's use dev_bus_addr to record the old mfn instead */ - kmap_op->dev_bus_addr = page->index; - page->index = (unsigned long) kmap_op; } spin_lock_irqsave(&m2p_override_lock, flags); list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); @@ -857,7 +854,8 @@ int m2p_add_override(unsigned long mfn, struct page *page, return 0; } EXPORT_SYMBOL_GPL(m2p_add_override); -int m2p_remove_override(struct page *page, bool clear_pte) +int m2p_remove_override(struct page *page, + struct gnttab_map_grant_ref *kmap_op) { unsigned long flags; unsigned long mfn; @@ -887,10 +885,8 @@ int m2p_remove_override(struct page *page, bool clear_pte) WARN_ON(!PagePrivate(page)); ClearPagePrivate(page); - if (clear_pte) { - struct gnttab_map_grant_ref *map_op = - (struct gnttab_map_grant_ref *) page->index; - set_phys_to_machine(pfn, map_op->dev_bus_addr); + set_phys_to_machine(pfn, page->index); + if (kmap_op != NULL) { if (!PageHighMem(page)) { struct multicall_space mcs; struct gnttab_unmap_grant_ref *unmap_op; @@ -902,13 +898,13 @@ int m2p_remove_override(struct page *page, bool clear_pte) * issued. In this case handle is going to -1 because * it hasn't been modified yet. */ - if (map_op->handle == -1) + if (kmap_op->handle == -1) xen_mc_flush(); /* - * Now if map_op->handle is negative it means that the + * Now if kmap_op->handle is negative it means that the * hypercall actually returned an error. 
*/ - if (map_op->handle == GNTST_general_error) { + if (kmap_op->handle == GNTST_general_error) { printk(KERN_WARNING "m2p_remove_override: " "pfn %lx mfn %lx, failed to modify kernel mappings", pfn, mfn); @@ -918,8 +914,8 @@ int m2p_remove_override(struct page *page, bool clear_pte) mcs = xen_mc_entry( sizeof(struct gnttab_unmap_grant_ref)); unmap_op = mcs.args; - unmap_op->host_addr = map_op->host_addr; - unmap_op->handle = map_op->handle; + unmap_op->host_addr = kmap_op->host_addr; + unmap_op->handle = kmap_op->handle; unmap_op->dev_bus_addr = 0; MULTI_grant_table_op(mcs.mc, @@ -930,10 +926,9 @@ int m2p_remove_override(struct page *page, bool clear_pte) set_pte_at(&init_mm, address, ptep, pfn_pte(pfn, PAGE_KERNEL)); __flush_tlb_single(address); - map_op->host_addr = 0; + kmap_op->host_addr = 0; } - } else - set_phys_to_machine(pfn, page->index); + } /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present * somewhere in this domain, even before being added to the -- cgit v1.1 From ecba9a52acdf20530d561b7634b80c35c308943a Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 5 Sep 2012 19:30:01 +0900 Subject: KVM: x86: lapic: Clean up find_highest_vector() and count_vectors() find_highest_vector() and count_vectors(): - Instead of using magic values, define and use proper macros. find_highest_vector(): - Remove likely() which is there only for historical reasons and not doing correct branch predictions anymore. Using such heuristics to optimize this function is not worth it now. Let CPUs predict things instead. - Stop checking word[0] separately. This was only needed for doing likely() optimization. - Use for loop, not while, to iterate over the register array to make the code clearer. Note that we actually confirmed that the likely() did wrong predictions by inserting debug code. Acked-by: Michael S. 
Tsirkin Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 07ad628..6f9fd63 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -66,6 +66,7 @@ #define APIC_DEST_NOSHORT 0x0 #define APIC_DEST_MASK 0x800 #define MAX_APIC_VECTOR 256 +#define APIC_VECTORS_PER_REG 32 #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) @@ -208,25 +209,30 @@ static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = { static int find_highest_vector(void *bitmap) { - u32 *word = bitmap; - int word_offset = MAX_APIC_VECTOR >> 5; + int vec; + u32 *reg; - while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) - continue; + for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; + vec >= 0; vec -= APIC_VECTORS_PER_REG) { + reg = bitmap + REG_POS(vec); + if (*reg) + return fls(*reg) - 1 + vec; + } - if (likely(!word_offset && !word[0])) - return -1; - else - return fls(word[word_offset << 2]) - 1 + (word_offset << 5); + return -1; } static u8 count_vectors(void *bitmap) { - u32 *word = bitmap; - int word_offset; + int vec; + u32 *reg; u8 count = 0; - for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) - count += hweight32(word[word_offset << 2]); + + for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) { + reg = bitmap + REG_POS(vec); + count += hweight32(*reg); + } + return count; } -- cgit v1.1 From 35534b201c9f115c68962c095b5a9aad204d025f Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 29 Aug 2012 15:01:22 +0200 Subject: perf/x86: Export Sandy Bridge uncore clockticks event in sysfs This patch exports the clockticks event and its encoding to user level. The clockticks event was exported for Nehalem/Westmere but not for Sandy Bridge (client). Given that it uses a special encoding, it needs to be exported to user tools, so users can do: # perf stat -a -C 0 -e uncore_cbox_0/clockticks/ sleep 1 Signed-off-by: Stephane Eranian Acked-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120829130122.GA32336@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 0a55710..38e4894 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -661,6 +661,11 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box) } } +static struct uncore_event_desc snb_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + { /* end: all zeroes */ }, +}; + static struct attribute *snb_uncore_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -704,6 +709,7 @@ static struct intel_uncore_type snb_uncore_cbox = { .constraints = snb_uncore_cbox_constraints, .ops = &snb_uncore_msr_ops, .format_group = &snb_uncore_format_group, + .event_descs = snb_uncore_events, }; static struct intel_uncore_type *snb_msr_uncores[] = { -- cgit v1.1 From bad9ac2d7f878a31cf1ae8c1ee3768077d222bcb Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 25 Jul 2012 19:12:45 +0200 Subject: perf/x86/ibs: Check syscall attribute flags Current implementation simply ignores attribute flags. 
Thus, there is no notification to userland of unsupported features. Check syscall's attribute flags to let userland know if a feature is supported by the kernel. This is also needed to distinguish between future kernels what might support a feature. Cc: v3.5.. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120910093018.GO8285@erda.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 7bfb5be..eebd5ff 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -209,6 +209,15 @@ static int perf_ibs_precise_event(struct perf_event *event, u64 *config) return -EOPNOTSUPP; } +static const struct perf_event_attr ibs_notsupp = { + .exclude_user = 1, + .exclude_kernel = 1, + .exclude_hv = 1, + .exclude_idle = 1, + .exclude_host = 1, + .exclude_guest = 1, +}; + static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -229,6 +238,9 @@ static int perf_ibs_init(struct perf_event *event) if (event->pmu != &perf_ibs->pmu) return -ENOENT; + if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp)) + return -EINVAL; + if (config & ~perf_ibs->config_mask) return -EINVAL; -- cgit v1.1 From 6eebdda35e6b18d0dddb2a44e34211bd94f0cad6 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 24 Aug 2012 23:58:47 +0400 Subject: x86: Drop unnecessary kernel_eflags variable on 64-bit On 64 bit x86 we save the current eflags in cpu_init for use in ret_from_fork. Strictly speaking reserved bits in EFLAGS should be read as written but in practise it is unlikely that EFLAGS could ever be extended in this way and the kernel alread clears any undefined flags early on. The equivalent 32 bit code simply hard codes 0x0202 as the new EFLAGS. This change makes 64 bit use the same mechanism to setup the initial EFLAGS on fork. Note that 64 bit resets EFLAGS before calling schedule_tail() as opposed to 32 bit which calls schedule_tail() first. Therefore the correct value for EFLAGS has opposite IF bit. Signed-off-by: Ian Campbell Signed-off-by: Cyrill Gorcunov Acked-by: Andi Kleen Acked-by: "H. 
Peter Anvin" Cc: Brian Gerst Cc: Peter Zijlstra Cc: Pekka Enberg Cc: Andi Kleen Link: http://lkml.kernel.org/r/20120824195847.GA31628@moon Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/cpu/common.c | 4 ---- arch/x86/kernel/entry_64.S | 2 +- 3 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d048cad..9738b39e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -423,7 +423,6 @@ DECLARE_INIT_PER_CPU(irq_stack_union); DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); -extern unsigned long kernel_eflags; extern asmlinkage void ignore_sysret(void); #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a5fbc3c..9961e2e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1116,8 +1116,6 @@ void syscall_init(void) X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); } -unsigned long kernel_eflags; - /* * Copies of the original ist values from the tss are only accessed during * debugging, no special alignment required. @@ -1299,8 +1297,6 @@ void __cpuinit cpu_init(void) fpu_init(); xsave_init(); - raw_local_save_flags(kernel_eflags); - if (is_uv_system()) uv_cpu_init(); } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 69babd8..b1dac12 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -440,7 +440,7 @@ ENTRY(ret_from_fork) LOCK ; btr $TIF_FORK,TI_flags(%r8) - pushq_cfi kernel_eflags(%rip) + pushq_cfi $0x0002 popfq_cfi # reset kernel eflags call schedule_tail # rdi: 'prev' task parameter -- cgit v1.1 From 73e8f3d7e2cb23614d5115703d76d8e54764b641 Mon Sep 17 00:00:00 2001 From: T Makphaibulchoke Date: Tue, 28 Aug 2012 21:21:43 -0600 Subject: x86/mm/init.c: Fix devmem_is_allowed() off by one Fixing an off-by-one error in devmem_is_allowed(), which allows accesses to physical addresses 0x100000-0x100fff, an extra page past 1MB. Signed-off-by: T Makphaibulchoke Acked-by: H. Peter Anvin Cc: yinghai@kernel.org Cc: tiwai@suse.de Cc: dhowells@redhat.com Link: http://lkml.kernel.org/r/1346210503-14276-1-git-send-email-tmac@hp.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e0e6990..ab1f6a9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -319,7 +319,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, */ int devmem_is_allowed(unsigned long pagenr) { - if (pagenr <= 256) + if (pagenr < 256) return 1; if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) return 0; -- cgit v1.1 From 1edfbb4153bd29bcf8d2236676238d5237972be1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 10 Sep 2012 12:04:16 +0100 Subject: x86/64: Adjust types of temporaries used by ffs()/fls()/fls64() The 64-bit special cases of the former two (the thrird one is 64-bit only anyway) don't need to use "long" temporaries, as the result will always fit in a 32-bit variable, and the functions return plain "int". This avoids a few REX prefixes, i.e. minimally reduces code size. 
Signed-off-by: Jan Beulich Cc: Linus Torvalds Link: http://lkml.kernel.org/r/504DE550020000780009A258@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bitops.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 72f5009..ebaee69 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -417,10 +417,9 @@ static inline int ffs(int x) * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. */ - long tmp = -1; asm("bsfl %1,%0" : "=r" (r) - : "rm" (x), "0" (tmp)); + : "rm" (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsfl %1,%0\n\t" "cmovzl %2,%0" @@ -459,10 +458,9 @@ static inline int fls(int x) * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. */ - long tmp = -1; asm("bsrl %1,%0" : "=r" (r) - : "rm" (x), "0" (tmp)); + : "rm" (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsrl %1,%0\n\t" "cmovzl %2,%0" @@ -490,13 +488,13 @@ static inline int fls(int x) #ifdef CONFIG_X86_64 static __always_inline int fls64(__u64 x) { - long bitpos = -1; + int bitpos = -1; /* * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before. */ - asm("bsrq %1,%0" + asm("bsrq %1,%q0" : "+r" (bitpos) : "rm" (x)); return bitpos + 1; -- cgit v1.1 From 5870661c091e827973674cc3469b50c959008c2b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 10 Sep 2012 12:24:43 +0100 Subject: x86: Prefer TZCNT over BFS Following a relatively recent compiler change, make use of the fact that for non-zero input BSF and TZCNT produce the same result, and that CPUs not knowing of TZCNT will treat the instruction as BSF (i.e. ignore what looks like a REP prefix to them). The assumption here is that TZCNT would never have worse performance than BSF. For the moment, only do this when the respective generic-CPU option is selected (as there are no specific-CPU options covering the CPUs supporting TZCNT), and don't do that when size optimization was requested. Signed-off-by: Jan Beulich Cc: Linus Torvalds Link: http://lkml.kernel.org/r/504DEA1B020000780009A277@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bitops.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index ebaee69..b2af664 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -347,6 +347,19 @@ static int test_bit(int nr, const volatile unsigned long *addr); ? constant_test_bit((nr), (addr)) \ : variable_test_bit((nr), (addr))) +#if (defined(CONFIG_X86_GENERIC) || defined(CONFIG_GENERIC_CPU)) \ + && !defined(CONFIG_CC_OPTIMIZE_FOR_SIZE) +/* + * Since BSF and TZCNT have sufficiently similar semantics for the purposes + * for which we use them here, BMI-capable hardware will decode the prefixed + * variant as 'tzcnt ...' and may execute that faster than 'bsf ...', while + * older hardware will ignore the REP prefix and decode it as 'bsf ...'. 
+ */ +# define BSF_PREFIX "rep;" +#else +# define BSF_PREFIX +#endif + /** * __ffs - find first set bit in word * @word: The word to search @@ -355,7 +368,7 @@ static int test_bit(int nr, const volatile unsigned long *addr); */ static inline unsigned long __ffs(unsigned long word) { - asm("bsf %1,%0" + asm(BSF_PREFIX "bsf %1,%0" : "=r" (word) : "rm" (word)); return word; @@ -369,12 +382,14 @@ static inline unsigned long __ffs(unsigned long word) */ static inline unsigned long ffz(unsigned long word) { - asm("bsf %1,%0" + asm(BSF_PREFIX "bsf %1,%0" : "=r" (word) : "r" (~word)); return word; } +#undef BSF_PREFIX + /* * __fls: find last set bit in word * @word: The word to search -- cgit v1.1 From 3120e25efdc0834c88e1c0f8394e2087444f8c19 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 10 Sep 2012 12:41:45 +0100 Subject: x86/Kconfig: Clean up Kconfig defaults The main goal here is to have the resulting .config no carry any options that aren't enabled and can't be (i.e such where the default is "no" and can't be changed), so that if any such option later gets a user visible prompt, the user will actually be prompted on a "make ...oldconfig" rather than keeping the previously invisible option disabled. There's a little bit of other trivial cleanup mixed in here. Signed-off-by: Jan Beulich Cc: Linus Torvalds Link: http://lkml.kernel.org/r/504DEE19020000780009A285@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 50 ++++++++++++++++++++++++++++++-------------------- arch/x86/Kconfig.cpu | 5 +++-- 2 files changed, 33 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ec3a1a..3fb8719 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -7,11 +7,13 @@ config 64BIT Say no to build a 32-bit kernel - formerly known as i386 config X86_32 - def_bool !64BIT + def_bool y + depends on !64BIT select CLKSRC_I8253 config X86_64 - def_bool 64BIT + def_bool y + depends on 64BIT select X86_DEV_DMA_OPS ### Arch settings @@ -99,7 +101,8 @@ config X86 select GENERIC_STRNLEN_USER config INSTRUCTION_DECODER - def_bool (KPROBES || PERF_EVENTS || UPROBES) + def_bool y + depends on KPROBES || PERF_EVENTS || UPROBES config OUTPUT_FORMAT string @@ -127,13 +130,15 @@ config SBUS bool config NEED_DMA_MAP_STATE - def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG) + def_bool y + depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG config NEED_SG_DMA_LENGTH def_bool y config GENERIC_ISA_DMA - def_bool ISA_DMA_API + def_bool y + depends on ISA_DMA_API config GENERIC_BUG def_bool y @@ -150,13 +155,16 @@ config GENERIC_GPIO bool config ARCH_MAY_HAVE_PC_FDC - def_bool ISA_DMA_API + def_bool y + depends on ISA_DMA_API config RWSEM_GENERIC_SPINLOCK - def_bool !X86_XADD + def_bool y + depends on !X86_XADD config RWSEM_XCHGADD_ALGORITHM - def_bool X86_XADD + def_bool y + depends on X86_XADD config GENERIC_CALIBRATE_DELAY def_bool y @@ -752,7 +760,8 @@ config SWIOTLB 3 GB of memory. If unsure, say Y. config IOMMU_HELPER - def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) + def_bool y + depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" @@ -1159,10 +1168,12 @@ config X86_PAE consumes more pagetable space per process. 
config ARCH_PHYS_ADDR_T_64BIT - def_bool X86_64 || X86_PAE + def_bool y + depends on X86_64 || X86_PAE config ARCH_DMA_ADDR_T_64BIT - def_bool X86_64 || HIGHMEM64G + def_bool y + depends on X86_64 || HIGHMEM64G config DIRECT_GBPAGES bool "Enable 1GB pages for kernel pagetables" if EXPERT @@ -1285,8 +1296,8 @@ config ARCH_SELECT_MEMORY_MODEL depends on ARCH_SPARSEMEM_ENABLE config ARCH_MEMORY_PROBE - def_bool X86_64 - depends on MEMORY_HOTPLUG + def_bool y + depends on X86_64 && MEMORY_HOTPLUG config ARCH_PROC_KCORE_TEXT def_bool y @@ -1975,7 +1986,6 @@ config PCI_MMCONFIG config PCI_CNB20LE_QUIRK bool "Read CNB20LE Host Bridge Windows" if EXPERT - default n depends on PCI && EXPERIMENTAL help Read the PCI windows out of the CNB20LE host bridge. This allows @@ -2186,18 +2196,18 @@ config COMPAT depends on IA32_EMULATION || X86_X32 select ARCH_WANT_OLD_COMPAT_IPC +if COMPAT config COMPAT_FOR_U64_ALIGNMENT - def_bool COMPAT - depends on X86_64 + def_bool y config SYSVIPC_COMPAT def_bool y - depends on COMPAT && SYSVIPC + depends on SYSVIPC config KEYS_COMPAT - bool - depends on COMPAT && KEYS - default y + def_bool y + depends on KEYS +endif endmenu diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 706e12e..f3b86d0 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -306,7 +306,8 @@ config X86_INTERNODE_CACHE_SHIFT default X86_L1_CACHE_SHIFT config X86_CMPXCHG - def_bool X86_64 || (X86_32 && !M386) + def_bool y + depends on X86_64 || (X86_32 && !M386) config X86_L1_CACHE_SHIFT int @@ -317,7 +318,7 @@ config X86_L1_CACHE_SHIFT config X86_XADD def_bool y - depends on X86_64 || !M386 + depends on !M386 config X86_PPRO_FENCE bool "PentiumPro memory ordering errata workaround" -- cgit v1.1 From a5e37863ab31d78faddff15675c89979792bc0bd Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 5 Sep 2012 23:31:00 +0900 Subject: ftrace/x86: Adjust x86 regs.ip as like as x86-64 Adjust x86 regs.ip to ip + MCOUNT_INSN_SIZE as like as on x86-64. This helps us to consolidate codes which use regs->ip on both of x86/x86-64. Link: http://lkml.kernel.org/r/20120905143100.10329.60109.stgit@localhost.localdomain Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_32.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 061ac17..f438a44 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1148,7 +1148,6 @@ ENTRY(ftrace_regs_caller) * ip location, and move flags into the return ip location. 
*/ pushl 4(%esp) /* save return ip into ip slot */ - subl $MCOUNT_INSN_SIZE, (%esp) /* Adjust ip */ pushl $0 /* Load 0 into orig_ax */ pushl %gs @@ -1169,6 +1168,7 @@ ENTRY(ftrace_regs_caller) movl $__KERNEL_CS,13*4(%esp) movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ pushl %esp /* Save pt_regs as 4th parameter */ @@ -1180,7 +1180,6 @@ GLOBAL(ftrace_regs_call) movl 14*4(%esp), %eax /* Move flags back into cs */ movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - addl $MCOUNT_INSN_SIZE, %eax movl %eax, 14*4(%esp) /* Put return ip back for ret */ popl %ebx -- cgit v1.1 From 4b036d54bf849a75d0103b33d92a53f89ecb9315 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 5 Sep 2012 23:31:12 +0900 Subject: kprobes/x86: Fix kprobes to correctly handle IP on ftrace Current kprobe_ftrace_handler expects regs->ip == ip, but it is incorrect (originally on x86-64). Actually, ftrace handler sets regs->ip = ip + MCOUNT_INSN_SIZE. kprobe_ftrace_handler must take care of that. Link: http://lkml.kernel.org/r/20120905143112.10329.72069.stgit@localhost.localdomain Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/kernel/kprobes.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 47ae102..f49f60c 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1072,7 +1072,8 @@ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (kprobe_running()) { kprobes_inc_nmissed_count(p); } else { - regs->ip += sizeof(kprobe_opcode_t); + /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ + regs->ip = ip + sizeof(kprobe_opcode_t); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; @@ -1080,13 +1081,15 @@ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, p->pre_handler(p, regs); if (unlikely(p->post_handler)) { - /* Emulate singlestep as if there is a 5byte nop */ + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ regs->ip = ip + MCOUNT_INSN_SIZE; kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } __this_cpu_write(current_kprobe, NULL); - regs->ip = ip; /* Recover for next callback */ } end: local_irq_restore(flags); -- cgit v1.1 From 47d5a5f88b9d25d6464c9b60c28f391e84e3ed65 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Sep 2012 23:31:18 +0900 Subject: ftrace/x86-64: Allow to change RIP in handlers Allow ftrace handlers to change RIP register (regs->ip) in handlers. This will allow handlers to call another function instead of the original function. Link: http://lkml.kernel.org/r/20120905143118.10329.5078.stgit@localhost.localdomain Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: "H.
Peter Anvin" Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ed767b7..e9cc2b3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -165,6 +165,10 @@ GLOBAL(ftrace_regs_call) movq EFLAGS(%rsp), %rax movq %rax, SS(%rsp) + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, SS+8(%rsp) + /* restore the rest of pt_regs */ movq R15(%rsp), %r15 movq R14(%rsp), %r14 -- cgit v1.1 From c6aaf4d0bb86e2154ea31a33804cec300611255f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 5 Sep 2012 23:31:25 +0900 Subject: kprobes/x86: Fix to support jprobes on ftrace-based kprobe Fix kprobes/x86 to support jprobes on ftrace-based kprobes. Because of -mfentry support of ftrace, ftrace is now put on the beginning of function where jprobes are put. Originally ftrace-based kprobes doesn't support jprobe because it will change regs->ip and ftrace doesn't support changing IP and ftrace itself doesn't conflict jprobe. However, ftrace -mfentry support moves mcount call on the top of functions where jprobes are put. This means that jprobe always conflicts with ftrace-based kprobe and fails. This patch allows ftrace-based kprobes to support jprobes by allowing to modify regs->ip and kprobes breakpoint handler also allows to skip singlestepping because there is a ftrace call (not an original instruction). Link: http://lkml.kernel.org/r/20120905143125.10329.90836.stgit@localhost.localdomain Reported-by: Fengguang Wu Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/kernel/kprobes.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index f49f60c..b7c2a85 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -541,6 +541,8 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb return 1; } +static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb); /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. 
@@ -599,6 +601,12 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) } else if (kprobe_running()) { p = __this_cpu_read(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { +#ifdef KPROBES_CAN_USE_FTRACE + if (kprobe_ftrace(p)) { + skip_singlestep(p, regs, kcb); + return 1; + } +#endif setup_singlestep(p, regs, kcb, 0); return 1; } @@ -1053,6 +1061,21 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } #ifdef KPROBES_CAN_USE_FTRACE +static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + __this_cpu_write(current_kprobe, NULL); +} + /* Ftrace callback handler for kprobes */ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) @@ -1077,19 +1100,12 @@ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (p->pre_handler) - p->pre_handler(p, regs); - - if (unlikely(p->post_handler)) { - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = ip + MCOUNT_INSN_SIZE; - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - __this_cpu_write(current_kprobe, NULL); + if (!p->pre_handler || !p->pre_handler(p, regs)) + skip_singlestep(p, regs, kcb); + /* + * If pre_handler returns !0, it sets regs->ip and + * resets current kprobe. + */ } end: local_irq_restore(flags); -- cgit v1.1 From bdc1e47217315be14ba04881b0a4c8ecb3ff320c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 20 Aug 2012 12:47:34 +0200 Subject: uprobes/x86: Implement x86 specific arch_uprobe_*_step The arch specific implementation behaves like user_enable_single_step() except that it does not disable single stepping if it was already enabled by ptrace. This allows the debugger to single step over an uprobe. The state of block stepping is not restored. It makes only sense together with TF and if that was enabled then the debugger is notified. Note: this is still not correct. For example, TIF_SINGLESTEP check is not right, the application itself can set X86_EFLAGS_TF. And otoh we leak TIF_SINGLESTEP (set by enable) if the probed insn is "popf". See the next patches, we need the changes in arch/x86/kernel/step.c first. 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/include/asm/uprobes.h | 2 ++ arch/x86/kernel/uprobes.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index f3971bb..cee5862 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -46,6 +46,8 @@ struct arch_uprobe_task { #ifdef CONFIG_X86_64 unsigned long saved_scratch_register; #endif +#define UPROBE_CLEAR_TF (1 << 0) + unsigned int restore_flags; }; extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 36fd420..309a0e0 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -41,6 +41,9 @@ /* Adjust the return address of a call insn */ #define UPROBE_FIX_CALL 0x2 +/* Instruction will modify TF, don't change it */ +#define UPROBE_FIX_SETF 0x4 + #define UPROBE_FIX_RIP_AX 0x8000 #define UPROBE_FIX_RIP_CX 0x4000 @@ -239,6 +242,10 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) insn_get_opcode(insn); /* should be a nop */ switch (OPCODE1(insn)) { + case 0x9d: + /* popf */ + auprobe->fixups |= UPROBE_FIX_SETF; + break; case 0xc3: /* ret/lret */ case 0xcb: case 0xc2: @@ -673,3 +680,29 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) } return false; } + +void arch_uprobe_enable_step(struct arch_uprobe *auprobe) +{ + struct uprobe_task *utask = current->utask; + struct arch_uprobe_task *autask = &utask->autask; + + autask->restore_flags = 0; + if (!test_tsk_thread_flag(current, TIF_SINGLESTEP) && + !(auprobe->fixups & UPROBE_FIX_SETF)) + autask->restore_flags |= UPROBE_CLEAR_TF; + /* + * The state of TIF_BLOCKSTEP is not saved. With the TF flag set we + * would to examine the opcode and the flags to make it right. Without + * TF block stepping makes no sense. + */ + user_enable_single_step(current); +} + +void arch_uprobe_disable_step(struct arch_uprobe *auprobe) +{ + struct uprobe_task *utask = current->utask; + struct arch_uprobe_task *autask = &utask->autask; + + if (autask->restore_flags & UPROBE_CLEAR_TF) + user_disable_single_step(current); +} -- cgit v1.1 From 848e8f5f0ad3169560c516fff6471be65f76e69f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 3 Aug 2012 17:31:46 +0200 Subject: ptrace/x86: Introduce set_task_blockstep() helper No functional changes, preparation for the next fix and for uprobes single-step fixes. Move the code playing with TIF_BLOCKSTEP/DEBUGCTLMSR_BTF into the new helper, set_task_blockstep(). 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/kernel/step.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index c346d11..7a51498 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -157,6 +157,21 @@ static int enable_single_step(struct task_struct *child) return 1; } +static void set_task_blockstep(struct task_struct *task, bool on) +{ + unsigned long debugctl; + + debugctl = get_debugctlmsr(); + if (on) { + debugctl |= DEBUGCTLMSR_BTF; + set_tsk_thread_flag(task, TIF_BLOCKSTEP); + } else { + debugctl &= ~DEBUGCTLMSR_BTF; + clear_tsk_thread_flag(task, TIF_BLOCKSTEP); + } + update_debugctlmsr(debugctl); +} + /* * Enable single or block step. */ @@ -169,19 +184,10 @@ static void enable_step(struct task_struct *child, bool block) * So no one should try to use debugger block stepping in a program * that uses user-mode single stepping itself. */ - if (enable_single_step(child) && block) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl |= DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); - set_tsk_thread_flag(child, TIF_BLOCKSTEP); - } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl &= ~DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); - clear_tsk_thread_flag(child, TIF_BLOCKSTEP); - } + if (enable_single_step(child) && block) + set_task_blockstep(child, true); + else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) + set_task_blockstep(child, false); } void user_enable_single_step(struct task_struct *child) @@ -199,13 +205,8 @@ void user_disable_single_step(struct task_struct *child) /* * Make sure block stepping (BTF) is disabled. */ - if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl &= ~DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); - clear_tsk_thread_flag(child, TIF_BLOCKSTEP); - } + if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) + set_task_blockstep(child, false); /* Always clear TIF_SINGLESTEP... */ clear_tsk_thread_flag(child, TIF_SINGLESTEP); -- cgit v1.1 From 95cf00fa5d5e2a200a2c044c84bde8389a237e02 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 11 Aug 2012 18:06:42 +0200 Subject: ptrace/x86: Partly fix set_task_blockstep()->update_debugctlmsr() logic Afaics the usage of update_debugctlmsr() and TIF_BLOCKSTEP in step.c was always very wrong. 1. update_debugctlmsr() was simply unneeded. The child sleeps TASK_TRACED, __switch_to_xtra(next_p => child) should notice TIF_BLOCKSTEP and set/clear DEBUGCTLMSR_BTF after resume if needed. 2. It is wrong. The state of DEBUGCTLMSR_BTF bit in CPU register should always match the state of current's TIF_BLOCKSTEP bit. 3. Even get_debugctlmsr() + update_debugctlmsr() itself does not look right. Irq can change other bits in MSR_IA32_DEBUGCTLMSR register or the caller can be preempted in between. 4. It is not safe to play with TIF_BLOCKSTEP if task != current. DEBUGCTLMSR_BTF and TIF_BLOCKSTEP should always match each other if the task is running. The tracee is stopped but it can be SIGKILL'ed right before set/clear_tsk_thread_flag(). However, now that uprobes uses user_enable_single_step(current) we can't simply remove update_debugctlmsr(). So this patch adds the additional "task == current" check and disables irqs to avoid the race with interrupts/preemption. Unfortunately this patch doesn't solve the last problem, we need another fix. 
Probably we should teach ptrace_stop() to set/clear single/block stepping after resume. And afaics there is yet another problem: perf can play with MSR_IA32_DEBUGCTLMSR from nmi, this obviously means that even __switch_to_xtra() has problems. Signed-off-by: Oleg Nesterov --- arch/x86/kernel/step.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 7a51498..f89cdc6 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -161,6 +161,16 @@ static void set_task_blockstep(struct task_struct *task, bool on) { unsigned long debugctl; + /* + * Ensure irq/preemption can't change debugctl in between. + * Note also that both TIF_BLOCKSTEP and debugctl should + * be changed atomically wrt preemption. + * FIXME: this means that set/clear TIF_BLOCKSTEP is simply + * wrong if task != current, SIGKILL can wakeup the stopped + * tracee and set/clear can play with the running task, this + * can confuse the next __switch_to_xtra(). + */ + local_irq_disable(); debugctl = get_debugctlmsr(); if (on) { debugctl |= DEBUGCTLMSR_BTF; @@ -169,7 +179,9 @@ static void set_task_blockstep(struct task_struct *task, bool on) debugctl &= ~DEBUGCTLMSR_BTF; clear_tsk_thread_flag(task, TIF_BLOCKSTEP); } - update_debugctlmsr(debugctl); + if (task == current) + update_debugctlmsr(debugctl); + local_irq_enable(); } /* -- cgit v1.1 From 9bd1190a11c9d2c59d35cb999b8d170ad52aab5f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 3 Sep 2012 15:24:17 +0200 Subject: uprobes/x86: Do not (ab)use TIF_SINGLESTEP/user_*_single_step() for single-stepping user_enable/disable_single_step() was designed for ptrace, it assumes a single user and does unnecessary and wrong things for uprobes. For example: - arch_uprobe_enable_step() can't trust TIF_SINGLESTEP, an application itself can set X86_EFLAGS_TF which must be preserved after arch_uprobe_disable_step(). - we do not want to set TIF_SINGLESTEP/TIF_FORCED_TF in arch_uprobe_enable_step(), this only makes sense for ptrace. - otoh we leak TIF_SINGLESTEP if arch_uprobe_disable_step() doesn't do user_disable_single_step(), the application will be killed after the next syscall. - arch_uprobe_enable_step() does access_process_vm() we do not need/want. Change arch_uprobe_enable/disable_step() to set/clear X86_EFLAGS_TF directly, this is much simpler and more correct. However, we need to clear TIF_BLOCKSTEP/DEBUGCTLMSR_BTF before executing the probed insn, add set_task_blockstep(false). Note: with or without this patch, there is another (hopefully minor) problem. A probed "pushf" insn can see the wrong X86_EFLAGS_TF set by uprobes. Perhaps we should change _disable to update the stack, or teach arch_uprobe_skip_sstep() to emulate this insn. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/step.c | 2 +- arch/x86/kernel/uprobes.c | 32 ++++++++++++++++++-------------- 3 files changed, 21 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d048cad..433d2e5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -759,6 +759,8 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } +extern void set_task_blockstep(struct task_struct *task, bool on); + /* * from system description table in BIOS. 
Mostly for MCA use, but * others may find it useful: diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index f89cdc6..cd3b243 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -157,7 +157,7 @@ static int enable_single_step(struct task_struct *child) return 1; } -static void set_task_blockstep(struct task_struct *task, bool on) +void set_task_blockstep(struct task_struct *task, bool on) { unsigned long debugctl; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 309a0e0..3b4aae6 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -683,26 +683,30 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) void arch_uprobe_enable_step(struct arch_uprobe *auprobe) { - struct uprobe_task *utask = current->utask; - struct arch_uprobe_task *autask = &utask->autask; + struct task_struct *task = current; + struct arch_uprobe_task *autask = &task->utask->autask; + struct pt_regs *regs = task_pt_regs(task); autask->restore_flags = 0; - if (!test_tsk_thread_flag(current, TIF_SINGLESTEP) && - !(auprobe->fixups & UPROBE_FIX_SETF)) + if (!(regs->flags & X86_EFLAGS_TF) && + !(auprobe->fixups & UPROBE_FIX_SETF)) autask->restore_flags |= UPROBE_CLEAR_TF; - /* - * The state of TIF_BLOCKSTEP is not saved. With the TF flag set we - * would to examine the opcode and the flags to make it right. Without - * TF block stepping makes no sense. - */ - user_enable_single_step(current); + + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(task, TIF_BLOCKSTEP)) + set_task_blockstep(task, false); } void arch_uprobe_disable_step(struct arch_uprobe *auprobe) { - struct uprobe_task *utask = current->utask; - struct arch_uprobe_task *autask = &utask->autask; - + struct task_struct *task = current; + struct arch_uprobe_task *autask = &task->utask->autask; + struct pt_regs *regs = task_pt_regs(task); + /* + * The state of TIF_BLOCKSTEP was not saved so we can get an extra + * SIGTRAP if we do not clear TF. We need to examine the opcode to + * make it right. + */ if (autask->restore_flags & UPROBE_CLEAR_TF) - user_disable_single_step(current); + regs->flags &= ~X86_EFLAGS_TF; } -- cgit v1.1 From 3a4664aa8362d9fa9110828f55afa9f9fcd7e484 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 3 Sep 2012 16:05:10 +0200 Subject: uprobes/x86: Xol should send SIGTRAP if X86_EFLAGS_TF was set arch_uprobe_disable_step() correctly preserves X86_EFLAGS_TF and returns to user-mode. But this means the application gets SIGTRAP only after the next insn. This means that UPROBE_CLEAR_TF logic is not really right. _enable should only record the state of X86_EFLAGS_TF, and _disable should check it separately from UPROBE_FIX_SETF. Remove arch_uprobe_task->restore_flags, add ->saved_tf instead, and change enable/disable accordingly. This assumes that the probed insn was not trapped, see the next patch. arch_uprobe_skip_sstep() logic has the same problem, change it to check X86_EFLAGS_TF and send SIGTRAP as well. We will cleanup this all after we fold enable/disable_step into pre/post_hol hooks. Note: send_sig(SIGTRAP) is not actually right, we need send_sigtrap(). But this needs more changes, handle_swbp() does the same and this is equally wrong. 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/include/asm/uprobes.h | 3 +-- arch/x86/kernel/uprobes.c | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index cee5862..d561ff5 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -46,8 +46,7 @@ struct arch_uprobe_task { #ifdef CONFIG_X86_64 unsigned long saved_scratch_register; #endif -#define UPROBE_CLEAR_TF (1 << 0) - unsigned int restore_flags; + unsigned int saved_tf; }; extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 3b4aae6..7e993d1f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -653,7 +653,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) * Skip these instructions as per the currently known x86 ISA. * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 } */ -bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { int i; @@ -681,16 +681,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) return false; } +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + bool ret = __skip_sstep(auprobe, regs); + if (ret && (regs->flags & X86_EFLAGS_TF)) + send_sig(SIGTRAP, current, 0); + return ret; +} + void arch_uprobe_enable_step(struct arch_uprobe *auprobe) { struct task_struct *task = current; struct arch_uprobe_task *autask = &task->utask->autask; struct pt_regs *regs = task_pt_regs(task); - autask->restore_flags = 0; - if (!(regs->flags & X86_EFLAGS_TF) && - !(auprobe->fixups & UPROBE_FIX_SETF)) - autask->restore_flags |= UPROBE_CLEAR_TF; + autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); regs->flags |= X86_EFLAGS_TF; if (test_tsk_thread_flag(task, TIF_BLOCKSTEP)) @@ -707,6 +712,8 @@ void arch_uprobe_disable_step(struct arch_uprobe *auprobe) * SIGTRAP if we do not clear TF. We need to examine the opcode to * make it right. */ - if (autask->restore_flags & UPROBE_CLEAR_TF) + if (autask->saved_tf) + send_sig(SIGTRAP, task, 0); + else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; } -- cgit v1.1 From d6a00b35e411519d774d978cdf80e4406d01b36b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 8 Sep 2012 18:38:15 +0200 Subject: uprobes/x86: Fix arch_uprobe_disable_step() && UTASK_SSTEP_TRAPPED interaction arch_uprobe_disable_step() should also take UTASK_SSTEP_TRAPPED into account. In this case the probed insn was not executed, we need to clear X86_EFLAGS_TF if it was set by us and that is all. Again, this code will look more clean when we move it into arch_uprobe_post_xol() and arch_uprobe_abort_xol(). 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 7e993d1f..9538f00 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -706,14 +706,20 @@ void arch_uprobe_disable_step(struct arch_uprobe *auprobe) { struct task_struct *task = current; struct arch_uprobe_task *autask = &task->utask->autask; + bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED); struct pt_regs *regs = task_pt_regs(task); /* * The state of TIF_BLOCKSTEP was not saved so we can get an extra * SIGTRAP if we do not clear TF. We need to examine the opcode to * make it right. */ - if (autask->saved_tf) - send_sig(SIGTRAP, task, 0); - else if (!(auprobe->fixups & UPROBE_FIX_SETF)) - regs->flags &= ~X86_EFLAGS_TF; + if (unlikely(trapped)) { + if (!autask->saved_tf) + regs->flags &= ~X86_EFLAGS_TF; + } else { + if (autask->saved_tf) + send_sig(SIGTRAP, task, 0); + else if (!(auprobe->fixups & UPROBE_FIX_SETF)) + regs->flags &= ~X86_EFLAGS_TF; + } } -- cgit v1.1 From baedbf02b1912225d60dd7403acb4b4e003088b5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 3 Sep 2012 17:02:16 +0200 Subject: uprobes: Make arch_uprobe_task->saved_trap_nr "unsigned int" Make arch_uprobe_task->saved_trap_nr "unsigned int" and move it down after ->saved_scratch_register, this changes sizeof() from 24 to 16. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/include/asm/uprobes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index d561ff5..8ff8be7 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -42,10 +42,10 @@ struct arch_uprobe { }; struct arch_uprobe_task { - unsigned long saved_trap_nr; #ifdef CONFIG_X86_64 unsigned long saved_scratch_register; #endif + unsigned int saved_trap_nr; unsigned int saved_tf; }; -- cgit v1.1 From 38cb5ef4473c6f510fae3a00bdac3acd550e3796 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 26 Jul 2012 18:00:27 -0400 Subject: X86: Improve GOP detection in the EFI boot stub We currently use the PCI IO protocol as a proxy for a functional GOP. This is less than ideal, since some platforms will put the GOP on output devices rather than the GPU itself. Move to using the conout protocol. This is not guaranteed per-spec, but is part of the consplitter implementation that causes this problem in the first place and so should be reliable. 
Signed-off-by: Matthew Garrett Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 29 ++++++++++++++++------------- arch/x86/boot/compressed/eboot.h | 4 ++++ 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index b3e0227..d5e4044 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -276,8 +276,9 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, nr_gops = size / sizeof(void *); for (i = 0; i < nr_gops; i++) { struct efi_graphics_output_mode_info *info; - efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; - void *pciio; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy; void *h = gop_handle[i]; status = efi_call_phys3(sys_table->boottime->handle_protocol, @@ -285,19 +286,21 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, if (status != EFI_SUCCESS) continue; - efi_call_phys3(sys_table->boottime->handle_protocol, - h, &pciio_proto, &pciio); + status = efi_call_phys3(sys_table->boottime->handle_protocol, + h, &conout_proto, &dummy); + + if (status == EFI_SUCCESS) + conout_found = true; status = efi_call_phys4(gop->query_mode, gop, gop->mode->mode, &size, &info); - if (status == EFI_SUCCESS && (!first_gop || pciio)) { + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { /* - * Apple provide GOPs that are not backed by - * real hardware (they're used to handle - * multiple displays). The workaround is to - * search for a GOP implementing the PCIIO - * protocol, and if one isn't found, to just - * fallback to the first GOP. + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. */ width = info->horizontal_resolution; height = info->vertical_resolution; @@ -308,10 +311,10 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, pixels_per_scan_line = info->pixels_per_scan_line; /* - * Once we've found a GOP supporting PCIIO, + * Once we've found a GOP supporting ConOut, * don't bother looking any further. */ - if (pciio) + if (conout_found) break; first_gop = gop; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index 3b6e156..e5b0a8f 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -14,6 +14,10 @@ #define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT) #define EFI_READ_CHUNK_SIZE (1024 * 1024) +#define EFI_CONSOLE_OUT_DEVICE_GUID \ + EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \ + 0x3f, 0xc1, 0x4d) + #define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 #define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 #define PIXEL_BIT_MASK 2 -- cgit v1.1 From 9dead5bbb825d7c25c0400e61de83075046322d0 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 26 Jul 2012 18:00:00 -0400 Subject: efi: Build EFI stub with EFI-appropriate options We can't assume the presence of the red zone while we're still in a boot services environment, so we should build with -fno-red-zone to avoid problems. Change the size of wchar at the same time to make string handling simpler. 
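A minimal illustration of why -fshort-wchar matters for the stub (an assumed example, not code from the patch): UEFI text output takes CHAR16 strings, so wide string literals need 16-bit elements before they can be handed to the firmware directly.
/* Compiles as intended only when built with -fshort-wchar, which makes
 * wchar_t 16 bits wide so that L"..." literals match UEFI's CHAR16. */
typedef unsigned short efi_char16_t;			/* UEFI CHAR16 */
static const efi_char16_t banner[] = L"EFI stub";	/* 2-byte elements */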
Signed-off-by: Matthew Garrett Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index e398bb5..8a84501 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -28,6 +28,9 @@ VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \ $(obj)/piggy.o +$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone +$(obj)/efi_stub_$(BITS).o: KBUILD_CLFAGS += -fshort-wchar -mno-red-zone + ifeq ($(CONFIG_EFI_STUB), y) VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o endif -- cgit v1.1 From d6cf86d8f23253225fe2a763d627ecf7dfee9dae Mon Sep 17 00:00:00 2001 From: Seiji Aguchi Date: Tue, 24 Jul 2012 13:27:23 +0000 Subject: efi: initialize efi.runtime_version to make query_variable_info/update_capsule workable A value of efi.runtime_version is checked before calling update_capsule()/query_variable_info() as follows. But it isn't initialized anywhere. static efi_status_t virt_efi_query_variable_info(u32 attr, u64 *storage_space, u64 *remaining_space, u64 *max_variable_size) { if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; This patch initializes a value of efi.runtime_version at boot time. Signed-off-by: Seiji Aguchi Acked-by: Matthew Garrett Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 92660eda..f55a4ce 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -890,6 +890,7 @@ void __init efi_enter_virtual_mode(void) * * Call EFI services through wrapper functions. */ + efi.runtime_version = efi_systab.fw_revision; efi.get_time = virt_efi_get_time; efi.set_time = virt_efi_set_time; efi.get_wakeup_time = virt_efi_get_wakeup_time; -- cgit v1.1 From f462ed939de67c20528bc08f11d2fc4f2d59c0d5 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 27 Jul 2012 12:58:53 -0400 Subject: efifb: Skip DMI checks if the bootloader knows what it's doing The majority of the DMI checks in efifb are for cases where the bootloader has provided invalid information. However, on some machines the overrides may do more harm than good due to configuration differences between machines with the same machine identifier. It turns out that it's possible for the bootloader to get the correct information on GOP-based systems, but we can't guarantee that the kernel's being booted with one that's been updated to do so. Add support for a capabilities flag that can be set by the bootloader, and skip the DMI checks in that case. Additionally, set this flag in the UEFI stub code. 
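The consumer of this flag is efifb, which lives outside the arch/x86 portion shown in this series; a rough sketch of the intended check follows (the helper names are hypothetical and exist only to illustrate the two paths):
#include <linux/screen_info.h>	/* screen_info, VIDEO_CAPABILITY_SKIP_QUIRKS */
/* Hypothetical helpers, named only for this sketch. */
static void use_reported_framebuffer(struct screen_info *si);
static void apply_dmi_overrides(struct screen_info *si);
static void efifb_choose_fb_info(void)
{
	if (screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS)
		use_reported_framebuffer(&screen_info);	/* bootloader data is trusted */
	else
		apply_dmi_overrides(&screen_info);	/* legacy quirks for broken loaders */
}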
Signed-off-by: Matthew Garrett Acked-by: Peter Jones Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index d5e4044..bbd83b9 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -379,6 +379,8 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, si->rsvd_pos = 0; } + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; + free_handle: efi_call_phys1(sys_table->boottime->free_pool, gop_handle); return status; -- cgit v1.1 From e9b10953edbccd3744e039ffc060ab2692f17856 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 27 Jul 2012 17:20:49 -0400 Subject: x86, EFI: Calculate the EFI framebuffer size instead of trusting the firmware Seth Forshee reported that his system was reporting that the EFI framebuffer stretched from 0x90010000-0xb0010000 despite the GPU's BAR only covering 0x90000000-0x9ffffff. It's safer to calculate this value from the pixel stride and screen height (values we already depend on) rather than face potential problems with resource allocation later on. Signed-off-by: Matthew Garrett Tested-by: Seth Forshee Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index bbd83b9..c760e07 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -331,7 +331,6 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, si->lfb_width = width; si->lfb_height = height; si->lfb_base = fb_base; - si->lfb_size = fb_size; si->pages = 1; if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { @@ -379,6 +378,8 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, si->rsvd_pos = 0; } + si->lfb_size = si->lfb_linelength * si->lfb_height; + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; free_handle: -- cgit v1.1 From 83287ea420ced7242a704488aab0fcdcf2ced9ab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:10:57 +0300 Subject: KVM: VMX: Make lto-friendly LTO (link-time optimization) doesn't like local labels to be referred to from a different function, since the two functions may be built in separate compilation units. Use an external variable instead. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d62b413..5faf12a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -127,6 +127,8 @@ module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); +extern const ulong vmx_return; + #define NR_AUTOLOAD_MSRS 8 #define VMCS02_POOL_SIZE 1 @@ -3724,8 +3726,7 @@ static void vmx_set_constant_host_state(void) native_store_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); - vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */ + vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); @@ -6276,11 +6277,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ /* Enter guest mode */ - "jne .Llaunched \n\t" + "jne 1f \n\t" __ex(ASM_VMX_VMLAUNCH) "\n\t" - "jmp .Lkvm_vmx_return \n\t" - ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" - ".Lkvm_vmx_return: " + "jmp 2f \n\t" + "1: " __ex(ASM_VMX_VMRESUME) "\n\t" + "2: " /* Save guest registers, load host registers, keep flags */ "mov %0, %c[wordsize](%%"R"sp) \n\t" "pop %0 \n\t" @@ -6306,6 +6307,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" + ".pushsection .rodata \n\t" + ".global vmx_return \n\t" + "vmx_return: " _ASM_PTR " 2b \n\t" + ".popsection" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), -- cgit v1.1 From b188c81f2e1a188ddda6a3d353e5b546c30a9b90 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:10:58 +0300 Subject: KVM: VMX: Make use of asm.h Use macros for bitness-insensitive register names, instead of rolling our own. 
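For readers unfamiliar with <asm/asm.h>: the _ASM_* macros expand to the register name of the right width for the build, so a single asm string serves both bitnesses. A simplified illustration follows (the _demo macro is an assumption for this sketch, not the literal asm.h definition):
#ifdef CONFIG_X86_64
# define _ASM_AX_demo "rax"	/* 64-bit build */
#else
# define _ASM_AX_demo "eax"	/* 32-bit build */
#endif
/* "mov %%" _ASM_AX ", %0" therefore assembles as "mov %eax, ..." on
 * 32-bit kernels and as "mov %rax, ..." on 64-bit kernels. */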
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 69 ++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5faf12a..30bcb95 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6184,14 +6184,6 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } -#ifdef CONFIG_X86_64 -#define R "r" -#define Q "q" -#else -#define R "e" -#define Q "l" -#endif - static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6240,30 +6232,30 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ - "push %%"R"dx; push %%"R"bp;" - "push %%"R"cx \n\t" /* placeholder for guest rcx */ - "push %%"R"cx \n\t" - "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "push %%" _ASM_DX "; push %%" _ASM_BP ";" + "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ + "push %%" _ASM_CX " \n\t" + "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" "je 1f \n\t" - "mov %%"R"sp, %c[host_rsp](%0) \n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" "1: \n\t" /* Reload cr2 if changed */ - "mov %c[cr2](%0), %%"R"ax \n\t" - "mov %%cr2, %%"R"dx \n\t" - "cmp %%"R"ax, %%"R"dx \n\t" + "mov %c[cr2](%0), %%" _ASM_AX " \n\t" + "mov %%cr2, %%" _ASM_DX " \n\t" + "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" "je 2f \n\t" - "mov %%"R"ax, %%cr2 \n\t" + "mov %%" _ASM_AX", %%cr2 \n\t" "2: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ - "mov %c[rax](%0), %%"R"ax \n\t" - "mov %c[rbx](%0), %%"R"bx \n\t" - "mov %c[rdx](%0), %%"R"dx \n\t" - "mov %c[rsi](%0), %%"R"si \n\t" - "mov %c[rdi](%0), %%"R"di \n\t" - "mov %c[rbp](%0), %%"R"bp \n\t" + "mov %c[rax](%0), %%" _ASM_AX " \n\t" + "mov %c[rbx](%0), %%" _ASM_BX " \n\t" + "mov %c[rdx](%0), %%" _ASM_DX " \n\t" + "mov %c[rsi](%0), %%" _ASM_SI " \n\t" + "mov %c[rdi](%0), %%" _ASM_DI " \n\t" + "mov %c[rbp](%0), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 "mov %c[r8](%0), %%r8 \n\t" "mov %c[r9](%0), %%r9 \n\t" @@ -6274,7 +6266,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %c[r14](%0), %%r14 \n\t" "mov %c[r15](%0), %%r15 \n\t" #endif - "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ /* Enter guest mode */ "jne 1f \n\t" @@ -6283,15 +6275,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "1: " __ex(ASM_VMX_VMRESUME) "\n\t" "2: " /* Save guest registers, load host registers, keep flags */ - "mov %0, %c[wordsize](%%"R"sp) \n\t" + "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" "pop %0 \n\t" - "mov %%"R"ax, %c[rax](%0) \n\t" - "mov %%"R"bx, %c[rbx](%0) \n\t" - "pop"Q" %c[rcx](%0) \n\t" - "mov %%"R"dx, %c[rdx](%0) \n\t" - "mov %%"R"si, %c[rsi](%0) \n\t" - "mov %%"R"di, %c[rdi](%0) \n\t" - "mov %%"R"bp, %c[rbp](%0) \n\t" + "mov %%" _ASM_AX ", %c[rax](%0) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" + __ASM_SIZE(pop) " %c[rcx](%0) \n\t" + "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" + "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" + "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" + "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" #ifdef CONFIG_X86_64 "mov %%r8, %c[r8](%0) \n\t" "mov %%r9, %c[r9](%0) \n\t" @@ -6302,10 +6294,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" 
#endif - "mov %%cr2, %%"R"ax \n\t" - "mov %%"R"ax, %c[cr2](%0) \n\t" + "mov %%cr2, %%" _ASM_AX " \n\t" + "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" - "pop %%"R"bp; pop %%"R"dx \n\t" + "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" "setbe %c[fail](%0) \n\t" ".pushsection .rodata \n\t" ".global vmx_return \n\t" @@ -6335,9 +6327,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" - , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 + , "rax", "rbx", "rdi", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#else + , "eax", "ebx", "edi", "esi" #endif ); @@ -6389,9 +6383,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } -#undef R -#undef Q - static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); -- cgit v1.1 From 7454766f7bead388251aedee35a478356a7f4e72 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:10:59 +0300 Subject: KVM: SVM: Make use of asm.h Use macros for bitness-insensitive register names, instead of rolling our own. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 611c728..818fceb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3782,12 +3782,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } -#ifdef CONFIG_X86_64 -#define R "r" -#else -#define R "e" -#endif - static void svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3814,13 +3808,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); asm volatile ( - "push %%"R"bp; \n\t" - "mov %c[rbx](%[svm]), %%"R"bx \n\t" - "mov %c[rcx](%[svm]), %%"R"cx \n\t" - "mov %c[rdx](%[svm]), %%"R"dx \n\t" - "mov %c[rsi](%[svm]), %%"R"si \n\t" - "mov %c[rdi](%[svm]), %%"R"di \n\t" - "mov %c[rbp](%[svm]), %%"R"bp \n\t" + "push %%" _ASM_BP "; \n\t" + "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" + "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t" + "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t" + "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t" + "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t" + "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 "mov %c[r8](%[svm]), %%r8 \n\t" "mov %c[r9](%[svm]), %%r9 \n\t" @@ -3833,20 +3827,20 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif /* Enter guest mode */ - "push %%"R"ax \n\t" - "mov %c[vmcb](%[svm]), %%"R"ax \n\t" + "push %%" _ASM_AX " \n\t" + "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" __ex(SVM_VMLOAD) "\n\t" __ex(SVM_VMRUN) "\n\t" __ex(SVM_VMSAVE) "\n\t" - "pop %%"R"ax \n\t" + "pop %%" _ASM_AX " \n\t" /* Save guest registers, load host registers */ - "mov %%"R"bx, %c[rbx](%[svm]) \n\t" - "mov %%"R"cx, %c[rcx](%[svm]) \n\t" - "mov %%"R"dx, %c[rdx](%[svm]) \n\t" - "mov %%"R"si, %c[rsi](%[svm]) \n\t" - "mov %%"R"di, %c[rdi](%[svm]) \n\t" - "mov %%"R"bp, %c[rbp](%[svm]) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t" + "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t" + "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t" + "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t" + "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t" + "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t" #ifdef CONFIG_X86_64 "mov %%r8, %c[r8](%[svm]) \n\t" "mov %%r9, %c[r9](%[svm]) \n\t" @@ -3857,7 +3851,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r14, 
%c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" #endif - "pop %%"R"bp" + "pop %%" _ASM_BP : : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), @@ -3878,9 +3872,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) #endif : "cc", "memory" - , R"bx", R"cx", R"dx", R"si", R"di" #ifdef CONFIG_X86_64 + , "rbx", "rcx", "rdx", "rsi", "rdi" , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" +#else + , "ebx", "ecx", "edx", "esi", "edi" #endif ); @@ -3940,8 +3936,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) mark_all_clean(svm->vmcb); } -#undef R - static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); -- cgit v1.1 From 314d9f63f385096580e9e2a06eaa0745d92fe4ac Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 10 Sep 2012 15:53:49 +0800 Subject: perf/x86: Add cpumask for uncore pmu This patch adds a cpumask file to the uncore pmu sysfs directory. The cpumask file contains one active cpu for every socket. Signed-off-by: "Yan, Zheng" Acked-by: Peter Zijlstra Acked-by: Ingo Molnar Cc: Andi Kleen Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: "Yan, Zheng" Link: http://lkml.kernel.org/r/1347263631-23175-2-git-send-email-zheng.z.yan@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 28 ++++++++++++++++++++++++--- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 6 ++++-- 2 files changed, 29 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 0a55710..62ec3e6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2341,6 +2341,27 @@ int uncore_pmu_event_init(struct perf_event *event) return ret; } +static ssize_t uncore_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); + +static struct attribute *uncore_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group uncore_pmu_attr_group = { + .attrs = uncore_pmu_attrs, +}; + static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) { int ret; @@ -2378,8 +2399,8 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) free_percpu(type->pmus[i].box); kfree(type->pmus); type->pmus = NULL; - kfree(type->attr_groups[1]); - type->attr_groups[1] = NULL; + kfree(type->events_group); + type->events_group = NULL; } static void __init uncore_types_exit(struct intel_uncore_type **types) @@ -2431,9 +2452,10 @@ static int __init uncore_type_init(struct intel_uncore_type *type) for (j = 0; j < i; j++) attrs[j] = &type->event_descs[j].attr.attr; - type->attr_groups[1] = events_group; + type->events_group = events_group; } + type->pmu_group = &uncore_pmu_attr_group; type->pmus = pmus; return 0; fail: diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 5b81c18..e68a455 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -369,10 +369,12 @@ struct intel_uncore_type { struct intel_uncore_pmu *pmus; struct intel_uncore_ops *ops; struct uncore_event_desc *event_descs; - const struct attribute_group 
*attr_groups[3]; + const struct attribute_group *attr_groups[4]; }; -#define format_group attr_groups[0] +#define pmu_group attr_groups[0] +#define format_group attr_groups[1] +#define events_group attr_groups[2] struct intel_uncore_ops { void (*init_box)(struct intel_uncore_box *); -- cgit v1.1 From 9fc77441e5e1bf80b794cc546d2243ee9f4afb75 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 16 Sep 2012 11:50:30 +0300 Subject: KVM: make processes waiting on vcpu mutex killable vcpu mutex can be held for unlimited time so taking it with mutex_lock on an ioctl is wrong: one process could be passed a vcpu fd and call this ioctl on the vcpu used by another process, it will then be unkillable until the owner exits. Call mutex_lock_killable instead and return status. Note: mutex_lock_interruptible would be even nicer, but I am not sure all users are prepared to handle EINTR from these ioctls. They might misinterpret it as an error. Cleanup paths expect a vcpu that can't be used by any userspace so this will always succeed - catch bugs by calling BUG_ON. Catch callers that don't check return state by adding __must_check. Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c4d451e..19047ea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6016,7 +6016,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) int r; vcpu->arch.mtrr_state.have_fixed = 1; - vcpu_load(vcpu); + r = vcpu_load(vcpu); + if (r) + return r; r = kvm_arch_vcpu_reset(vcpu); if (r == 0) r = kvm_mmu_setup(vcpu); @@ -6027,9 +6029,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + int r; vcpu->arch.apf.msr_val = 0; - vcpu_load(vcpu); + r = vcpu_load(vcpu); + BUG_ON(r); kvm_mmu_unload(vcpu); vcpu_put(vcpu); @@ -6275,7 +6279,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); + int r; + r = vcpu_load(vcpu); + BUG_ON(r); kvm_mmu_unload(vcpu); vcpu_put(vcpu); } -- cgit v1.1 From b82776005369899c1c7ca2e4b2414bb64b538d2c Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 23 Aug 2012 14:36:15 -0400 Subject: xen/swiotlb: Use the swiotlb_late_init_with_tbl to init Xen-SWIOTLB late when PV PCI is used. With this patch we provide the functionality to initialize the Xen-SWIOTLB late in the bootup cycle - specifically for Xen PCI-frontend. We still will work if the user had supplied 'iommu=soft' on the Linux command line. Note: We cannot depend on after_bootmem to automatically determine whether this is early or not. This is because when PCI IOMMUs are initialized it is after after_bootmem but before a lot of "other" subsystems are initialized. 
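A minimal sketch of the intended calling convention, assuming a hypothetical late-probing PV PCI frontend path (only pci_xen_swiotlb_init_late() and its header are introduced by this patch; the caller name and surrounding error handling below are illustrative, not the actual xen-pcifront wiring):

    #include <asm/xen/swiotlb-xen.h>

    /* Hypothetical late probe step in a PV PCI frontend driver. */
    static int example_pcifront_late_probe(void)
    {
            int rc;

            /*
             * Returns 0 right away if Xen-SWIOTLB was already selected and
             * initialized on the early boot path (xen_swiotlb is set).
             * Otherwise it allocates the bounce buffer late, switches
             * dma_ops to the Xen-SWIOTLB ops and requests ACS, or returns
             * an error if the late allocation fails.
             */
            rc = pci_xen_swiotlb_init_late();
            if (rc)
                    return rc;

            /* DMA mappings for the passed-through device are safe from here. */
            return 0;
    }
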
CC: FUJITA Tomonori [v1: Fix smatch warnings] [v2: Added check for xen_swiotlb] [v3: Rebased with new xen-swiotlb changes] [v4: squashed xen/swiotlb: Depending on after_bootmem is not correct in] Reviewed-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/swiotlb-xen.h | 2 ++ arch/x86/xen/pci-swiotlb-xen.c | 24 ++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 1be1ab7..ee52fca 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -5,10 +5,12 @@ extern int xen_swiotlb; extern int __init pci_xen_swiotlb_detect(void); extern void __init pci_xen_swiotlb_init(void); +extern int pci_xen_swiotlb_init_late(void); #else #define xen_swiotlb (0) static inline int __init pci_xen_swiotlb_detect(void) { return 0; } static inline void __init pci_xen_swiotlb_init(void) { } +static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } #endif #endif /* _ASM_X86_SWIOTLB_XEN_H */ diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 1c17227..b152640 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -12,7 +12,7 @@ #include #include #endif - +#include int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { @@ -69,13 +69,33 @@ int __init pci_xen_swiotlb_detect(void) void __init pci_xen_swiotlb_init(void) { if (xen_swiotlb) { - xen_swiotlb_init(1); + xen_swiotlb_init(1, true /* early */); dma_ops = &xen_swiotlb_dma_ops; /* Make sure ACS will be enabled */ pci_request_acs(); } } + +int pci_xen_swiotlb_init_late(void) +{ + int rc; + + if (xen_swiotlb) + return 0; + + rc = xen_swiotlb_init(1, false /* late */); + if (rc) + return rc; + + dma_ops = &xen_swiotlb_dma_ops; + /* Make sure ACS will be enabled */ + pci_request_acs(); + + return 0; +} +EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); + IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, 0, pci_xen_swiotlb_init, -- cgit v1.1 From 2a3bce8f6afb9118a7ac3c360a5baf7cdaec87bc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 13 Aug 2012 11:00:08 -0400 Subject: xen/swiotlb: Fix compile warnings when using plain integer instead of NULL pointer. arch/x86/xen/pci-swiotlb-xen.c:96:1: warning: Using plain integer as NULL pointer arch/x86/xen/pci-swiotlb-xen.c:96:1: warning: Using plain integer as NULL pointer Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/pci-swiotlb-xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index b152640..1608244 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -97,6 +97,6 @@ int pci_xen_swiotlb_init_late(void) EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, - 0, + NULL, pci_xen_swiotlb_init, - 0); + NULL); -- cgit v1.1 From e57dbaf77f372ac461b5b0b353c65efae9739a00 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 13 Sep 2011 15:23:21 +0200 Subject: x86, mce: Enable MCA support by default MCA is the basic support for hardware error logging and reporting, and it is majorly unwise to run without it so enable machine check software support by default on x86. 
Signed-off-by: Borislav Petkov Acked-by: Tony Luck --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ec3a1a..3d2d2ef 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -871,6 +871,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS config X86_MCE bool "Machine Check / overheating reporting" + default y ---help--- Machine Check support allows the processor to notify the kernel if it detects a problem (e.g. overheating, data corruption). -- cgit v1.1 From 57639bedd2b8268daa791efcfd0367b9031b057d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 22 May 2012 18:47:38 +0200 Subject: x86, MCE: Remove unused defines Those were sitting there unused since the dawn of time, drop them. Signed-off-by: Borislav Petkov --- arch/x86/include/asm/mce.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index a3ac52b..ccaf7c5 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -116,19 +116,9 @@ struct mce_log { /* Software defined banks */ #define MCE_EXTENDED_BANK 128 #define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0 - -#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */ -#define K8_MCE_THRESHOLD_BANK_0 (MCE_THRESHOLD_BASE + 0 * 9) -#define K8_MCE_THRESHOLD_BANK_1 (MCE_THRESHOLD_BASE + 1 * 9) -#define K8_MCE_THRESHOLD_BANK_2 (MCE_THRESHOLD_BASE + 2 * 9) -#define K8_MCE_THRESHOLD_BANK_3 (MCE_THRESHOLD_BASE + 3 * 9) -#define K8_MCE_THRESHOLD_BANK_4 (MCE_THRESHOLD_BASE + 4 * 9) -#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) -#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) - +#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) #ifdef __KERNEL__ - extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); -- cgit v1.1 From 050902c011712ad4703038fa4489ec4edd87d396 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 24 Jul 2012 16:05:27 -0700 Subject: x86, signal: Cleanup ifdefs and is_ia32, is_x32 Use config_enabled() to cleanup the definitions of is_ia32/is_x32. Move the function prototypes to the header file to cleanup ifdefs, and move the x32_setup_rt_frame() code around. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1343171129-2747-2-git-send-email-suresh.b.siddha@intel.com Merged in compilation fix from, Link: http://lkml.kernel.org/r/1344544736.8326.17.camel@sbsiddha-desk.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/fpu-internal.h | 26 ++++- arch/x86/include/asm/signal.h | 4 + arch/x86/kernel/signal.c | 196 +++++++++++++++--------------------- 3 files changed, 110 insertions(+), 116 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 75f4c6d..6f59543 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -32,7 +33,6 @@ extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, xstateregs_set; - /* * xstateregs_active == fpregs_active. Please refer to the comment * at the definition of fpregs_active. 
@@ -55,6 +55,22 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft); static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} #endif +static inline int is_ia32_compat_frame(void) +{ + return config_enabled(CONFIG_IA32_EMULATION) && + test_thread_flag(TIF_IA32); +} + +static inline int is_ia32_frame(void) +{ + return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); +} + +static inline int is_x32_frame(void) +{ + return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); +} + #define X87_FSW_ES (1 << 7) /* Exception Summary */ static __always_inline __pure bool use_xsaveopt(void) @@ -180,6 +196,11 @@ static inline void fpu_fxsave(struct fpu *fpu) #endif } +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + compat_sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, + compat_sigset_t *set, struct pt_regs *regs); + #else /* CONFIG_X86_32 */ /* perform fxrstor iff the processor has extended states, otherwise frstor */ @@ -204,6 +225,9 @@ static inline void fpu_fxsave(struct fpu *fpu) : [fx] "=m" (fpu->state->fxsave)); } +#define ia32_setup_frame __setup_frame +#define ia32_setup_rt_frame __setup_rt_frame + #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 598457c..323973f 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -31,6 +31,10 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +#ifndef CONFIG_COMPAT +typedef sigset_t compat_sigset_t; +#endif + #else /* Here we must cater to libcs that poke about in kernel headers. */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b280908..bed431a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -209,24 +209,21 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long sp = regs->sp; int onsigstack = on_sig_stack(sp); -#ifdef CONFIG_X86_64 /* redzone */ - sp -= 128; -#endif /* CONFIG_X86_64 */ + if (config_enabled(CONFIG_X86_64)) + sp -= 128; if (!onsigstack) { /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { if (current->sas_ss_size) sp = current->sas_ss_sp + current->sas_ss_size; - } else { -#ifdef CONFIG_X86_32 - /* This is the legacy signal stack switching. */ - if ((regs->ss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) + } else if (config_enabled(CONFIG_X86_32) && + (regs->ss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + /* This is the legacy signal stack switching. */ sp = (unsigned long) ka->sa.sa_restorer; -#endif /* CONFIG_X86_32 */ } } @@ -474,6 +471,74 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, } #endif /* CONFIG_X86_32 */ +static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, + siginfo_t *info, compat_sigset_t *set, + struct pt_regs *regs) +{ +#ifdef CONFIG_X86_X32_ABI + struct rt_sigframe_x32 __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (ka->sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user32(&frame->info, info)) + return -EFAULT; + } + + put_user_try { + /* Create the ucontext. 
*/ + if (cpu_has_xsave) + put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); + else + put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(0, &frame->uc.uc_link); + put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + put_user_ex(sas_ss_flags(regs->sp), + &frame->uc.uc_stack.ss_flags); + put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + put_user_ex(0, &frame->uc.uc__pad0); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + if (ka->sa.sa_flags & SA_RESTORER) { + restorer = ka->sa.sa_restorer; + } else { + /* could use a vstub here */ + restorer = NULL; + err |= -EFAULT; + } + put_user_ex(restorer, &frame->pretcode); + } put_user_catch(err); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + + /* We use the x32 calling convention here... */ + regs->di = sig; + regs->si = (unsigned long) &frame->info; + regs->dx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + regs->cs = __USER_CS; + regs->ss = __USER_DS; +#endif /* CONFIG_X86_X32_ABI */ + + return 0; +} + #ifdef CONFIG_X86_32 /* * Atomically swap in the new signal mask, and wait for a signal. @@ -612,55 +677,22 @@ static int signr_convert(int sig) return sig; } -#ifdef CONFIG_X86_32 - -#define is_ia32 1 -#define ia32_setup_frame __setup_frame -#define ia32_setup_rt_frame __setup_rt_frame - -#else /* !CONFIG_X86_32 */ - -#ifdef CONFIG_IA32_EMULATION -#define is_ia32 test_thread_flag(TIF_IA32) -#else /* !CONFIG_IA32_EMULATION */ -#define is_ia32 0 -#endif /* CONFIG_IA32_EMULATION */ - -#ifdef CONFIG_X86_X32_ABI -#define is_x32 test_thread_flag(TIF_X32) - -static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, - siginfo_t *info, compat_sigset_t *set, - struct pt_regs *regs); -#else /* !CONFIG_X86_X32_ABI */ -#define is_x32 0 -#endif /* CONFIG_X86_X32_ABI */ - -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs *regs); - -#endif /* CONFIG_X86_32 */ - static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { int usig = signr_convert(sig); sigset_t *set = sigmask_to_save(); + compat_sigset_t *cset = (compat_sigset_t *) set; /* Set up the stack frame */ - if (is_ia32) { + if (is_ia32_frame()) { if (ka->sa.sa_flags & SA_SIGINFO) - return ia32_setup_rt_frame(usig, ka, info, set, regs); + return ia32_setup_rt_frame(usig, ka, info, cset, regs); else - return ia32_setup_frame(usig, ka, set, regs); -#ifdef CONFIG_X86_X32_ABI - } else if (is_x32) { - return x32_setup_rt_frame(usig, ka, info, - (compat_sigset_t *)set, regs); -#endif + return ia32_setup_frame(usig, ka, cset, regs); + } else if (is_x32_frame()) { + return x32_setup_rt_frame(usig, ka, info, cset, regs); } else { return __setup_rt_frame(sig, ka, info, set, regs); } @@ -824,72 +856,6 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) } #ifdef CONFIG_X86_X32_ABI -static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, - siginfo_t *info, compat_sigset_t *set, - struct pt_regs *regs) -{ - struct rt_sigframe_x32 __user *frame; - void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); - - if (!access_ok(VERIFY_WRITE, frame, 
sizeof(*frame))) - return -EFAULT; - - if (ka->sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user32(&frame->info, info)) - return -EFAULT; - } - - put_user_try { - /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - put_user_ex(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - put_user_ex(0, &frame->uc.uc__pad0); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (ka->sa.sa_flags & SA_RESTORER) { - restorer = ka->sa.sa_restorer; - } else { - /* could use a vstub here */ - restorer = NULL; - err |= -EFAULT; - } - put_user_ex(restorer, &frame->pretcode); - } put_user_catch(err); - - if (err) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->sp = (unsigned long) frame; - regs->ip = (unsigned long) ka->sa.sa_handler; - - /* We use the x32 calling convention here... */ - regs->di = sig; - regs->si = (unsigned long) &frame->info; - regs->dx = (unsigned long) &frame->uc; - - loadsegment(ds, __USER_DS); - loadsegment(es, __USER_DS); - - regs->cs = __USER_CS; - regs->ss = __USER_DS; - - return 0; -} - asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe_x32 __user *frame; -- cgit v1.1 From 0ca5bd0d886578ad0afeceaa83458c0f35cb3c6b Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 24 Jul 2012 16:05:28 -0700 Subject: x86, fpu: Consolidate inline asm routines for saving/restoring fpu state Consolidate x86, x86_64 inline asm routines saving/restoring fpu state using config_enabled(). Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1343171129-2747-3-git-send-email-suresh.b.siddha@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/fpu-internal.h | 184 +++++++++++++++--------------------- arch/x86/include/asm/xsave.h | 6 +- arch/x86/kernel/xsave.c | 4 +- 3 files changed, 81 insertions(+), 113 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 6f59543..016acb3 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -97,34 +97,24 @@ static inline void sanitize_i387_state(struct task_struct *tsk) __sanitize_i387_state(tsk); } -#ifdef CONFIG_X86_64 -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - int err; - - /* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ - asm volatile("1: fxrstorq %[fx]\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : [fx] "m" (*fx), "0" (0)); -#else - asm volatile("1: rex64/fxrstor (%[fx])\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : [fx] "R" (fx), "m" (*fx), "0" (0)); -#endif - return err; +#define check_insn(insn, output, input...) 
\ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +static inline int fsave_user(struct i387_fsave_struct __user *fx) +{ + return check_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } static inline int fxsave_user(struct i387_fxsave_struct __user *fx) @@ -140,90 +130,73 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) if (unlikely(err)) return -EFAULT; - /* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ - asm volatile("1: fxsaveq %[fx]\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err), [fx] "=m" (*fx) - : "0" (0)); -#else - asm volatile("1: rex64/fxsave (%[fx])\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err), "=m" (*fx) - : [fx] "R" (fx), "0" (0)); -#endif - if (unlikely(err) && - __clear_user(fx, sizeof(struct i387_fxsave_struct))) - err = -EFAULT; - /* No need to clear here because the caller clears USED_MATH */ - return err; + if (config_enabled(CONFIG_X86_32)) + return check_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return check_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + + /* See comment in fpu_fxsave() below. */ + return check_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); } -static inline void fpu_fxsave(struct fpu *fpu) +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { - /* Using "rex64; fxsave %0" is broken because, if the memory operand - uses any extended registers for addressing, a second REX prefix - will be generated (to the assembler, rex64 followed by semicolon - is a separate instruction), and hence the 64-bitness is lost. */ + if (config_enabled(CONFIG_X86_32)) + return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -#ifdef CONFIG_AS_FXSAVEQ - /* Using "fxsaveq %0" would be the ideal choice, but is only supported - starting with gas 2.16. */ - __asm__ __volatile__("fxsaveq %0" - : "=m" (fpu->state->fxsave)); -#else - /* Using, as a workaround, the properly prefixed form below isn't - accepted by any binutils version so far released, complaining that - the same type of prefix is used twice if an extended register is - needed for addressing (fix submitted to mainline 2005-11-21). - asm volatile("rex64/fxsave %0" - : "=m" (fpu->state->fxsave)); - This, however, we can work around by forcing the compiler to select - an addressing mode that doesn't require extended registers. */ - asm volatile("rex64/fxsave (%[fx])" - : "=m" (fpu->state->fxsave) - : [fx] "R" (&fpu->state->fxsave)); -#endif + /* See comment in fpu_fxsave() below. 
*/ + return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), + "m" (*fx)); } -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - compat_sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - compat_sigset_t *set, struct pt_regs *regs); - -#else /* CONFIG_X86_32 */ - -/* perform fxrstor iff the processor has extended states, otherwise frstor */ -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) +static inline int frstor_checking(struct i387_fsave_struct *fx) { - /* - * The "nop" is needed to make the instructions the same - * length. - */ - alternative_input( - "nop ; frstor %1", - "fxrstor %1", - X86_FEATURE_FXSR, - "m" (*fx)); - - return 0; + return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void fpu_fxsave(struct fpu *fpu) { - asm volatile("fxsave %[fx]" - : [fx] "=m" (fpu->state->fxsave)); + if (config_enabled(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + asm volatile("fxsaveq %0" : "=m" (fpu->state->fxsave)); + else { + /* Using "rex64; fxsave %0" is broken because, if the memory + * operand uses any extended registers for addressing, a second + * REX prefix will be generated (to the assembler, rex64 + * followed by semicolon is a separate instruction), and hence + * the 64-bitness is lost. + * + * Using "fxsaveq %0" would be the ideal choice, but is only + * supported starting with gas 2.16. + * + * Using, as a workaround, the properly prefixed form below + * isn't accepted by any binutils version so far released, + * complaining that the same type of prefix is used twice if + * an extended register is needed for addressing (fix submitted + * to mainline 2005-11-21). + * + * asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave)); + * + * This, however, we can work around by forcing the compiler to + * select an addressing mode that doesn't require extended + * registers. 
+ */ + asm volatile( "rex64/fxsave (%[fx])" + : "=m" (fpu->state->fxsave) + : [fx] "R" (&fpu->state->fxsave)); + } } +#ifdef CONFIG_X86_64 + +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + compat_sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, + compat_sigset_t *set, struct pt_regs *regs); + +#else /* CONFIG_X86_32 */ #define ia32_setup_frame __setup_frame #define ia32_setup_rt_frame __setup_rt_frame @@ -272,17 +245,14 @@ static inline int __save_init_fpu(struct task_struct *tsk) return fpu_save_init(&tsk->thread.fpu); } -static inline int fpu_fxrstor_checking(struct fpu *fpu) -{ - return fxrstor_checking(&fpu->state->fxsave); -} - static inline int fpu_restore_checking(struct fpu *fpu) { if (use_xsave()) - return fpu_xrstor_checking(fpu); + return fpu_xrstor_checking(&fpu->state->xsave); + else if (use_fxsr()) + return fxrstor_checking(&fpu->state->fxsave); else - return fpu_fxrstor_checking(fpu); + return frstor_checking(&fpu->state->fsave); } static inline int restore_fpu_checking(struct task_struct *tsk) diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 8a1b6f9..aaac810 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -42,9 +42,8 @@ extern int check_for_xstate(struct i387_fxsave_struct __user *buf, void __user *fpstate, struct _fpx_sw_bytes *sw); -static inline int fpu_xrstor_checking(struct fpu *fpu) +static inline int fpu_xrstor_checking(struct xsave_struct *fx) { - struct xsave_struct *fx = &fpu->state->xsave; int err; asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" @@ -84,9 +83,6 @@ static inline int xsave_user(struct xsave_struct __user *buf) : [err] "=r" (err) : "D" (buf), "a" (-1), "d" (-1), "0" (0) : "memory"); - if (unlikely(err) && __clear_user(buf, xstate_size)) - err = -EFAULT; - /* No need to clear here because the caller clears USED_MATH */ return err; } diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 9e1a8a7..7a3d4df 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -176,8 +176,10 @@ int save_i387_xstate(void __user *buf) else err = fxsave_user(buf); - if (err) + if (unlikely(err)) { + __clear_user(buf, xstate_size); return err; + } user_fpu_end(); } else { sanitize_i387_state(tsk); -- cgit v1.1 From 72a671ced66db6d1c2bfff1c930a101ac8d08204 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 24 Jul 2012 16:05:29 -0700 Subject: x86, fpu: Unify signal handling code paths for x86 and x86_64 kernels Currently for x86 and x86_32 binaries, fpstate in the user sigframe is copied to/from the fpstate in the task struct. And in the case of signal delivery for x86_64 binaries, if the fpstate is live in the CPU registers, then the live state is copied directly to the user sigframe. Otherwise fpstate in the task struct is copied to the user sigframe. During restore, fpstate in the user sigframe is restored directly to the live CPU registers. Historically, different code paths led to different bugs. For example, x86_64 code path was not preemption safe till recently. Also there is lot of code duplication for support of new features like xsave etc. Unify signal handling code paths for x86 and x86_64 kernels. New strategy is as follows: Signal delivery: Both for 32/64-bit frames, align the core math frame area to 64bytes as needed by xsave (this where the main fpu/extended state gets copied to and excludes the legacy compatibility fsave header for the 32-bit [f]xsave frames). 
If the state is live, copy the register state directly to the user frame. If not live, copy the state in the thread struct to the user frame. And for 32-bit [f]xsave frames, construct the fsave header separately before the actual [f]xsave area. Signal return: As the 32-bit frames with [f]xstate has an additional 'fsave' header, copy everything back from the user sigframe to the fpstate in the task structure and reconstruct the fxstate from the 'fsave' header (Also user passed pointers may not be correctly aligned for any attempt to directly restore any partial state). At the next fpstate usage, everything will be restored to the live CPU registers. For all the 64-bit frames and the 32-bit fsave frame, restore the state from the user sigframe directly to the live CPU registers. 64-bit signals always restored the math frame directly, so we can expect the math frame pointer to be correctly aligned. For 32-bit fsave frames, there are no alignment requirements, so we can restore the state directly. "lat_sig catch" microbenchmark numbers (for x86, x86_64, x86_32 binaries) are with in the noise range with this change. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1343171129-2747-4-git-send-email-suresh.b.siddha@intel.com [ Merged in compilation fix ] Link: http://lkml.kernel.org/r/1344544736.8326.17.camel@sbsiddha-desk.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/ia32/ia32_signal.c | 9 +- arch/x86/include/asm/fpu-internal.h | 111 +++++---- arch/x86/include/asm/xsave.h | 6 +- arch/x86/kernel/i387.c | 246 +------------------- arch/x86/kernel/process.c | 10 - arch/x86/kernel/ptrace.c | 3 - arch/x86/kernel/signal.c | 15 +- arch/x86/kernel/xsave.c | 432 +++++++++++++++++++++--------------- 8 files changed, 348 insertions(+), 484 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 452d4dd..8c77c64 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -251,7 +251,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, get_user_ex(tmp, &sc->fpstate); buf = compat_ptr(tmp); - err |= restore_i387_xstate_ia32(buf); + err |= restore_xstate_sig(buf, 1); get_user_ex(*pax, &sc->ax); } get_user_catch(err); @@ -382,9 +382,12 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, sp = (unsigned long) ka->sa.sa_restorer; if (used_math()) { - sp = sp - sig_xstate_ia32_size; + unsigned long fx_aligned, math_size; + + sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size); *fpstate = (struct _fpstate_ia32 __user *) sp; - if (save_i387_xstate_ia32(*fpstate) < 0) + if (save_xstate_sig(*fpstate, (void __user *)fx_aligned, + math_size) < 0) return (void __user *) -1L; } diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 016acb3..4fbb419 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -22,11 +22,30 @@ #include #include -extern unsigned int sig_xstate_size; +#ifdef CONFIG_X86_64 +# include +# include +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + compat_sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, + compat_sigset_t *set, struct pt_regs *regs); +#else +# define user_i387_ia32_struct user_i387_struct +# define user32_fxsr_struct user_fxsr_struct +# define ia32_setup_frame __setup_frame +# define ia32_setup_rt_frame __setup_rt_frame +#endif + +extern unsigned int mxcsr_feature_mask; extern void fpu_init(void); 
DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); +extern void convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk); +extern void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env); + extern user_regset_active_fn fpregs_active, xfpregs_active; extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, xstateregs_get; @@ -39,19 +58,11 @@ extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, */ #define xstateregs_active fpregs_active -extern struct _fpx_sw_bytes fx_sw_reserved; -#ifdef CONFIG_IA32_EMULATION -extern unsigned int sig_xstate_ia32_size; -extern struct _fpx_sw_bytes fx_sw_reserved_ia32; -struct _fpstate_ia32; -struct _xstate_ia32; -extern int save_i387_xstate_ia32(void __user *buf); -extern int restore_i387_xstate_ia32(void __user *buf); -#endif - #ifdef CONFIG_MATH_EMULATION +# define HAVE_HWFP (boot_cpu_data.hard_math) extern void finit_soft_fpu(struct i387_soft_struct *soft); #else +# define HAVE_HWFP 1 static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} #endif @@ -119,17 +130,6 @@ static inline int fsave_user(struct i387_fsave_struct __user *fx) static inline int fxsave_user(struct i387_fxsave_struct __user *fx) { - int err; - - /* - * Clear the bytes not touched by the fxsave and reserved - * for the SW usage. - */ - err = __clear_user(&fx->sw_reserved, - sizeof(struct _fpx_sw_bytes)); - if (unlikely(err)) - return -EFAULT; - if (config_enabled(CONFIG_X86_32)) return check_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); else if (config_enabled(CONFIG_AS_FXSAVEQ)) @@ -189,19 +189,6 @@ static inline void fpu_fxsave(struct fpu *fpu) : [fx] "R" (&fpu->state->fxsave)); } } -#ifdef CONFIG_X86_64 - -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - compat_sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - compat_sigset_t *set, struct pt_regs *regs); - -#else /* CONFIG_X86_32 */ - -#define ia32_setup_frame __setup_frame -#define ia32_setup_rt_frame __setup_rt_frame - -#endif /* CONFIG_X86_64 */ /* * These must be called with preempt disabled. Returns @@ -392,10 +379,28 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) /* * Signal frame handlers... */ -extern int save_i387_xstate(void __user *buf); -extern int restore_i387_xstate(void __user *buf); +extern int save_xstate_sig(void __user *buf, void __user *fx, int size); +extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size); + +static inline int xstate_sigframe_size(void) +{ + return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; +} + +static inline int restore_xstate_sig(void __user *buf, int ia32_frame) +{ + void __user *buf_fx = buf; + int size = xstate_sigframe_size(); + + if (ia32_frame && use_fxsr()) { + buf_fx = buf + sizeof(struct i387_fsave_struct); + size += sizeof(struct i387_fsave_struct); + } + + return __restore_xstate_sig(buf, buf_fx, size); +} -static inline void __clear_fpu(struct task_struct *tsk) +static inline void __drop_fpu(struct task_struct *tsk) { if (__thread_has_fpu(tsk)) { /* Ignore delayed exceptions from user space */ @@ -443,11 +448,21 @@ static inline void save_init_fpu(struct task_struct *tsk) preempt_enable(); } -static inline void clear_fpu(struct task_struct *tsk) +static inline void stop_fpu_preload(struct task_struct *tsk) { + tsk->fpu_counter = 0; +} + +static inline void drop_fpu(struct task_struct *tsk) +{ + /* + * Forget coprocessor state.. 
+ */ + stop_fpu_preload(tsk); preempt_disable(); - __clear_fpu(tsk); + __drop_fpu(tsk); preempt_enable(); + clear_used_math(); } /* @@ -511,4 +526,20 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src) extern void fpu_finit(struct fpu *fpu); +static inline unsigned long +alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, + unsigned long *size) +{ + unsigned long frame_size = xstate_sigframe_size(); + + *buf_fx = sp = round_down(sp - frame_size, 64); + if (ia32_frame && use_fxsr()) { + frame_size += sizeof(struct i387_fsave_struct); + sp -= sizeof(struct i387_fsave_struct); + } + + *size = frame_size; + return sp; +} + #endif diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index aaac810..c1d989a 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -38,9 +38,6 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void xsave_init(void); extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); -extern int check_for_xstate(struct i387_fxsave_struct __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *sw); static inline int fpu_xrstor_checking(struct xsave_struct *fx) { @@ -68,8 +65,7 @@ static inline int xsave_user(struct xsave_struct __user *buf) * Clear the xsave header first, so that reserved fields are * initialized to zero. */ - err = __clear_user(&buf->xsave_hdr, - sizeof(struct xsave_hdr_struct)); + err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr)); if (unlikely(err)) return -EFAULT; diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f250431..ab6a2e8 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -19,20 +19,6 @@ #include #include -#ifdef CONFIG_X86_64 -# include -# include -#else -# define save_i387_xstate_ia32 save_i387_xstate -# define restore_i387_xstate_ia32 restore_i387_xstate -# define _fpstate_ia32 _fpstate -# define _xstate_ia32 _xstate -# define sig_xstate_ia32_size sig_xstate_size -# define fx_sw_reserved_ia32 fx_sw_reserved -# define user_i387_ia32_struct user_i387_struct -# define user32_fxsr_struct user_fxsr_struct -#endif - /* * Were we in an interrupt that interrupted kernel mode? * @@ -113,16 +99,9 @@ void unlazy_fpu(struct task_struct *tsk) } EXPORT_SYMBOL(unlazy_fpu); -#ifdef CONFIG_MATH_EMULATION -# define HAVE_HWFP (boot_cpu_data.hard_math) -#else -# define HAVE_HWFP 1 -#endif - -static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); -unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; static void __cpuinit mxcsr_feature_mask_init(void) @@ -454,7 +433,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) * FXSR floating point environment conversions. 
*/ -static void +void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; @@ -491,8 +470,8 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) memcpy(&to[i], &from[i], sizeof(to[0])); } -static void convert_to_fxsr(struct task_struct *tsk, - const struct user_i387_ia32_struct *env) +void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env) { struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; @@ -589,223 +568,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, } /* - * Signal frame handlers. - */ - -static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) -{ - struct task_struct *tsk = current; - struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave; - - fp->status = fp->swd; - if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) - return -1; - return 1; -} - -static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) -{ - struct task_struct *tsk = current; - struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; - struct user_i387_ia32_struct env; - int err = 0; - - convert_from_fxsr(&env, tsk); - if (__copy_to_user(buf, &env, sizeof(env))) - return -1; - - err |= __put_user(fx->swd, &buf->status); - err |= __put_user(X86_FXSR_MAGIC, &buf->magic); - if (err) - return -1; - - if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size)) - return -1; - return 1; -} - -static int save_i387_xsave(void __user *buf) -{ - struct task_struct *tsk = current; - struct _fpstate_ia32 __user *fx = buf; - int err = 0; - - - sanitize_i387_state(tsk); - - /* - * For legacy compatible, we always set FP/SSE bits in the bit - * vector while saving the state to the user context. - * This will enable us capturing any changes(during sigreturn) to - * the FP/SSE bits by the legacy applications which don't touch - * xstate_bv in the xsave header. - * - * xsave aware applications can change the xstate_bv in the xsave - * header as well as change any contents in the memory layout. - * xrestore as part of sigreturn will capture all the changes. - */ - tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; - - if (save_i387_fxsave(fx) < 0) - return -1; - - err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32, - sizeof(struct _fpx_sw_bytes)); - err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *) (buf + sig_xstate_ia32_size - - FP_XSTATE_MAGIC2_SIZE)); - if (err) - return -1; - - return 1; -} - -int save_i387_xstate_ia32(void __user *buf) -{ - struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; - struct task_struct *tsk = current; - - if (!used_math()) - return 0; - - if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size)) - return -EACCES; - /* - * This will cause a "finit" to be triggered by the next - * attempted FPU operation by the 'current' process. - */ - clear_used_math(); - - if (!HAVE_HWFP) { - return fpregs_soft_get(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, fp) ? 
-1 : 1; - } - - unlazy_fpu(tsk); - - if (cpu_has_xsave) - return save_i387_xsave(fp); - if (cpu_has_fxsr) - return save_i387_fxsave(fp); - else - return save_i387_fsave(fp); -} - -static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) -{ - struct task_struct *tsk = current; - - return __copy_from_user(&tsk->thread.fpu.state->fsave, buf, - sizeof(struct i387_fsave_struct)); -} - -static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf, - unsigned int size) -{ - struct task_struct *tsk = current; - struct user_i387_ia32_struct env; - int err; - - err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0], - size); - /* mxcsr reserved bits must be masked to zero for security reasons */ - tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; - if (err || __copy_from_user(&env, buf, sizeof(env))) - return 1; - convert_to_fxsr(tsk, &env); - - return 0; -} - -static int restore_i387_xsave(void __user *buf) -{ - struct _fpx_sw_bytes fx_sw_user; - struct _fpstate_ia32 __user *fx_user = - ((struct _fpstate_ia32 __user *) buf); - struct i387_fxsave_struct __user *fx = - (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; - struct xsave_hdr_struct *xsave_hdr = - ¤t->thread.fpu.state->xsave.xsave_hdr; - u64 mask; - int err; - - if (check_for_xstate(fx, buf, &fx_sw_user)) - goto fx_only; - - mask = fx_sw_user.xstate_bv; - - err = restore_i387_fxsave(buf, fx_sw_user.xstate_size); - - xsave_hdr->xstate_bv &= pcntxt_mask; - /* - * These bits must be zero. - */ - xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; - - /* - * Init the state that is not present in the memory layout - * and enabled by the OS. - */ - mask = ~(pcntxt_mask & ~mask); - xsave_hdr->xstate_bv &= mask; - - return err; -fx_only: - /* - * Couldn't find the extended state information in the memory - * layout. Restore the FP/SSE and init the other extended state - * enabled by the OS. - */ - xsave_hdr->xstate_bv = XSTATE_FPSSE; - return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct)); -} - -int restore_i387_xstate_ia32(void __user *buf) -{ - int err; - struct task_struct *tsk = current; - struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; - - if (HAVE_HWFP) - clear_fpu(tsk); - - if (!buf) { - if (used_math()) { - clear_fpu(tsk); - clear_used_math(); - } - - return 0; - } else - if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size)) - return -EACCES; - - if (!used_math()) { - err = init_fpu(tsk); - if (err) - return err; - } - - if (HAVE_HWFP) { - if (cpu_has_xsave) - err = restore_i387_xsave(buf); - else if (cpu_has_fxsr) - err = restore_i387_fxsave(fp, sizeof(struct - i387_fxsave_struct)); - else - err = restore_i387_fsave(fp); - } else { - err = fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, fp) != 0; - } - set_used_math(); - - return err; -} - -/* * FPU state for core dumps. * This is only used for a.out dumps now. * It is declared generically using elf_fpregset_t (which is diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ef6a845..30069d1 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -97,16 +97,6 @@ void arch_task_cache_init(void) SLAB_PANIC | SLAB_NOTRACK, NULL); } -static inline void drop_fpu(struct task_struct *tsk) -{ - /* - * Forget coprocessor state.. - */ - tsk->fpu_counter = 0; - clear_fpu(tsk); - clear_used_math(); -} - /* * Free current thread data structures etc.. 
*/ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index c4c6a5c..861a9d1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1332,9 +1332,6 @@ static const struct user_regset_view user_x86_64_view = { #define genregs32_get genregs_get #define genregs32_set genregs_set -#define user_i387_ia32_struct user_i387_struct -#define user32_fxsr_struct user_fxsr_struct - #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index bed431a..e10f96a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -114,7 +114,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, regs->orig_ax = -1; /* disable syscall checks */ get_user_ex(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); + err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); get_user_ex(*pax, &sc->ax); } get_user_catch(err); @@ -206,7 +206,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { /* Default to using normal stack */ + unsigned long math_size = 0; unsigned long sp = regs->sp; + unsigned long buf_fx = 0; int onsigstack = on_sig_stack(sp); /* redzone */ @@ -228,10 +230,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } if (used_math()) { - sp -= sig_xstate_size; -#ifdef CONFIG_X86_64 - sp = round_down(sp, 64); -#endif /* CONFIG_X86_64 */ + sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32), + &buf_fx, &math_size); *fpstate = (void __user *)sp; } @@ -244,8 +244,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, if (onsigstack && !likely(on_sig_stack(sp))) return (void __user *)-1L; - /* save i387 state */ - if (used_math() && save_i387_xstate(*fpstate) < 0) + /* save i387 and extended state */ + if (used_math() && + save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; return (void __user *)sp; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 7a3d4df..0923d27 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -10,9 +10,7 @@ #include #include #include -#ifdef CONFIG_IA32_EMULATION -#include -#endif +#include #include /* @@ -25,11 +23,7 @@ u64 pcntxt_mask; */ static struct xsave_struct *init_xstate_buf; -struct _fpx_sw_bytes fx_sw_reserved; -#ifdef CONFIG_IA32_EMULATION -struct _fpx_sw_bytes fx_sw_reserved_ia32; -#endif - +static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; /* @@ -44,9 +38,9 @@ static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; */ void __sanitize_i387_state(struct task_struct *tsk) { - u64 xstate_bv; - int feature_bit = 0x2; struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; + int feature_bit = 0x2; + u64 xstate_bv; if (!fx) return; @@ -104,215 +98,314 @@ void __sanitize_i387_state(struct task_struct *tsk) * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. 
*/ -int check_for_xstate(struct i387_fxsave_struct __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *fx_sw_user) +static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct i387_fxsave_struct) + sizeof(struct xsave_hdr_struct); unsigned int magic2; - int err; - err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], - sizeof(struct _fpx_sw_bytes)); - if (err) - return -EFAULT; + if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) + return -1; - /* - * First Magic check failed. - */ - if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) - return -EINVAL; + /* Check for the first magic field and other error scenarios. */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || + fx_sw->xstate_size < min_xstate_size || + fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fx_sw->extended_size) + return -1; /* - * Check for error scenarios. - */ - if (fx_sw_user->xstate_size < min_xstate_size || - fx_sw_user->xstate_size > xstate_size || - fx_sw_user->xstate_size > fx_sw_user->extended_size) - return -EINVAL; - - err = __get_user(magic2, (__u32 __user *) (fpstate + - fx_sw_user->extended_size - - FP_XSTATE_MAGIC2_SIZE)); - if (err) - return err; - /* * Check for the presence of second magic word at the end of memory * layout. This detects the case where the user just copied the legacy * fpstate layout with out copying the extended state information * in the memory layout. */ - if (magic2 != FP_XSTATE_MAGIC2) - return -EFAULT; + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) + || magic2 != FP_XSTATE_MAGIC2) + return -1; return 0; } -#ifdef CONFIG_X86_64 /* * Signal frame handlers. */ +static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +{ + if (use_fxsr()) { + struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct user_i387_ia32_struct env; + struct _fpstate_ia32 __user *fp = buf; -int save_i387_xstate(void __user *buf) + convert_from_fxsr(&env, tsk); + + if (__copy_to_user(buf, &env, sizeof(env)) || + __put_user(xsave->i387.swd, &fp->status) || + __put_user(X86_FXSR_MAGIC, &fp->magic)) + return -1; + } else { + struct i387_fsave_struct __user *fp = buf; + u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) + return -1; + } + + return 0; +} + +static inline int save_xstate_epilog(void __user *buf, int ia32_frame) { - struct task_struct *tsk = current; - int err = 0; + struct xsave_struct __user *x = buf; + struct _fpx_sw_bytes *sw_bytes; + u32 xstate_bv; + int err; - if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size)) - return -EACCES; + /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ + sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; + err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); - BUG_ON(sig_xstate_size < xstate_size); + if (!use_xsave()) + return err; - if ((unsigned long)buf % 64) - pr_err("%s: bad fpstate %p\n", __func__, buf); + err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); - if (!used_math()) - return 0; + /* + * Read the xstate_bv which we copied (directly from the cpu or + * from the state in task struct) to the user buffers. 
+ */ + err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); - if (user_has_fpu()) { - if (use_xsave()) - err = xsave_user(buf); - else - err = fxsave_user(buf); + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave aware apps can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xstate_bv |= XSTATE_FPSSE; - if (unlikely(err)) { - __clear_user(buf, xstate_size); - return err; - } + err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); + + return err; +} + +static inline int save_user_xstate(struct xsave_struct __user *buf) +{ + int err; + + if (use_xsave()) + err = xsave_user(buf); + else if (use_fxsr()) + err = fxsave_user((struct i387_fxsave_struct __user *) buf); + else + err = fsave_user((struct i387_fsave_struct __user *) buf); + + if (unlikely(err) && __clear_user(buf, xstate_size)) + err = -EFAULT; + return err; +} + +/* + * Save the fpu, extended register state to the user signal frame. + * + * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save + * state is copied. + * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. + * + * buf == buf_fx for 64-bit frames and 32-bit fsave frame. + * buf != buf_fx for 32-bit frames with fxstate. + * + * If the fpu, extended register state is live, save the state directly + * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, + * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * + * If this is a 32-bit frame with fxstate, put a fsave header before + * the aligned state at 'buf_fx'. + * + * For [f]xsave state, update the SW reserved fields in the [f]xsave frame + * indicating the absence/presence of the extended state to the user. + */ +int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) +{ + struct xsave_struct *xsave = ¤t->thread.fpu.state->xsave; + struct task_struct *tsk = current; + int ia32_fxstate = (buf != buf_fx); + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!access_ok(VERIFY_WRITE, buf, size)) + return -EACCES; + + if (!HAVE_HWFP) + return fpregs_soft_get(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), NULL, + (struct _fpstate_ia32 __user *) buf) ? -1 : 1; + + if (user_has_fpu()) { + /* Save the live register state to the user directly. */ + if (save_user_xstate(buf_fx)) + return -1; + /* Update the thread's fxstate to save the fsave header. */ + if (ia32_fxstate) + fpu_fxsave(&tsk->thread.fpu); user_fpu_end(); } else { sanitize_i387_state(tsk); - if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, - xstate_size)) + if (__copy_to_user(buf_fx, xsave, xstate_size)) return -1; } - clear_used_math(); /* trigger finit */ + /* Save the fsave header for the 32-bit frames. 
*/ + if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) + return -1; - if (use_xsave()) { - struct _fpstate __user *fx = buf; - struct _xstate __user *x = buf; - u64 xstate_bv; + if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) + return -1; + + drop_fpu(tsk); /* trigger finit */ + + return 0; +} - err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved, - sizeof(struct _fpx_sw_bytes)); +static inline void +sanitize_restored_xstate(struct task_struct *tsk, + struct user_i387_ia32_struct *ia32_env, + u64 xstate_bv, int fx_only) +{ + struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr; - err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *) (buf + sig_xstate_size - - FP_XSTATE_MAGIC2_SIZE)); + if (use_xsave()) { + /* These bits must be zero. */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; /* - * Read the xstate_bv which we copied (directly from the cpu or - * from the state in task struct) to the user buffers and - * set the FP/SSE bits. + * Init the state that is not present in the memory + * layout and not enabled by the OS. */ - err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv); + if (fx_only) + xsave_hdr->xstate_bv = XSTATE_FPSSE; + else + xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv); + } + if (use_fxsr()) { /* - * For legacy compatible, we always set FP/SSE bits in the bit - * vector while saving the state to the user context. This will - * enable us capturing any changes(during sigreturn) to - * the FP/SSE bits by the legacy applications which don't touch - * xstate_bv in the xsave header. - * - * xsave aware apps can change the xstate_bv in the xsave - * header as well as change any contents in the memory layout. - * xrestore as part of sigreturn will capture all the changes. + * mscsr reserved bits must be masked to zero for security + * reasons. */ - xstate_bv |= XSTATE_FPSSE; - - err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv); + xsave->i387.mxcsr &= mxcsr_feature_mask; - if (err) - return err; + convert_to_fxsr(tsk, ia32_env); } - - return 1; } /* - * Restore the extended state if present. Otherwise, restore the FP/SSE - * state. + * Restore the extended state if present. Otherwise, restore the FP/SSE state. */ -static int restore_user_xstate(void __user *buf) +static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) { - struct _fpx_sw_bytes fx_sw_user; - u64 mask; - int err; - - if (((unsigned long)buf % 64) || - check_for_xstate(buf, buf, &fx_sw_user)) - goto fx_only; - - mask = fx_sw_user.xstate_bv; - - /* - * restore the state passed by the user. - */ - err = xrestore_user(buf, mask); - if (err) - return err; - - /* - * init the state skipped by the user. - */ - mask = pcntxt_mask & ~mask; - if (unlikely(mask)) - xrstor_state(init_xstate_buf, mask); - - return 0; - -fx_only: - /* - * couldn't find the extended state information in the - * memory layout. Restore just the FP/SSE and init all - * the other extended state. 
- */ - xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE); - return fxrstor_checking((__force struct i387_fxsave_struct *)buf); + if (use_xsave()) { + if ((unsigned long)buf % 64 || fx_only) { + u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE; + xrstor_state(init_xstate_buf, init_bv); + return fxrstor_checking((__force void *) buf); + } else { + u64 init_bv = pcntxt_mask & ~xbv; + if (unlikely(init_bv)) + xrstor_state(init_xstate_buf, init_bv); + return xrestore_user(buf, xbv); + } + } else if (use_fxsr()) { + return fxrstor_checking((__force void *) buf); + } else + return frstor_checking((__force void *) buf); } -/* - * This restores directly out of user space. Exceptions are handled. - */ -int restore_i387_xstate(void __user *buf) +int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) { + int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; - int err = 0; + int state_size = xstate_size; + u64 xstate_bv = 0; + int fx_only = 0; + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); if (!buf) { - if (used_math()) - goto clear; + drop_fpu(tsk); return 0; - } else - if (!access_ok(VERIFY_READ, buf, sig_xstate_size)) - return -EACCES; + } + + if (!access_ok(VERIFY_READ, buf, size)) + return -EACCES; + + if (!used_math() && init_fpu(tsk)) + return -1; - if (!used_math()) { - err = init_fpu(tsk); - if (err) - return err; + if (!HAVE_HWFP) { + return fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; } - user_fpu_begin(); - if (use_xsave()) - err = restore_user_xstate(buf); - else - err = fxrstor_checking((__force struct i387_fxsave_struct *) - buf); - if (unlikely(err)) { + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; + if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { + /* + * Couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. + */ + state_size = sizeof(struct i387_fxsave_struct); + fx_only = 1; + } else { + state_size = fx_sw_user.xstate_size; + xstate_bv = fx_sw_user.xstate_bv; + } + } + + if (ia32_fxstate) { + /* + * For 32-bit frames with fxstate, copy the user state to the + * thread's fpu state, reconstruct fxstate from the fsave + * header. Sanitize the copied state etc. + */ + struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct user_i387_ia32_struct env; + + stop_fpu_preload(tsk); + unlazy_fpu(tsk); + + if (__copy_from_user(xsave, buf_fx, state_size) || + __copy_from_user(&env, buf, sizeof(env))) { + drop_fpu(tsk); + return -1; + } + + sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); + } else { /* - * Encountered an error while doing the restore from the - * user buffer, clear the fpu state. + * For 64-bit frames and 32-bit fsave frames, restore the user + * state to the registers directly (with exceptions handled). 
*/ -clear: - clear_fpu(tsk); - clear_used_math(); + user_fpu_begin(); + if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { + drop_fpu(tsk); + return -1; + } } - return err; + + return 0; } -#endif /* * Prepare the SW reserved portion of the fxsave memory layout, indicating @@ -323,31 +416,22 @@ clear: */ static void prepare_fx_sw_frame(void) { - int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) + - FP_XSTATE_MAGIC2_SIZE; - - sig_xstate_size = sizeof(struct _fpstate) + size_extended; - -#ifdef CONFIG_IA32_EMULATION - sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended; -#endif + int fsave_header_size = sizeof(struct i387_fsave_struct); + int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved)); + if (config_enabled(CONFIG_X86_32)) + size += fsave_header_size; fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = sig_xstate_size; + fx_sw_reserved.extended_size = size; fx_sw_reserved.xstate_bv = pcntxt_mask; fx_sw_reserved.xstate_size = xstate_size; -#ifdef CONFIG_IA32_EMULATION - memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved, - sizeof(struct _fpx_sw_bytes)); - fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size; -#endif -} -#ifdef CONFIG_X86_64 -unsigned int sig_xstate_size = sizeof(struct _fpstate); -#endif + if (config_enabled(CONFIG_IA32_EMULATION)) { + fx_sw_reserved_ia32 = fx_sw_reserved; + fx_sw_reserved_ia32.extended_size += fsave_header_size; + } +} /* * Enable the extended processor state save/restore feature -- cgit v1.1 From e962591749dfd4df9fea2c530ed7a3cfed50e5aa Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 24 Aug 2012 14:12:57 -0700 Subject: x86, fpu: drop_fpu() before restoring new state from sigframe No need to save the state with unlazy_fpu(), that is about to get overwritten by the state from the signal frame. Instead use drop_fpu() and continue to restore the new state. Also fold the stop_fpu_preload() into drop_fpu(). Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1345842782-24175-2-git-send-email-suresh.b.siddha@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/fpu-internal.h | 7 +------ arch/x86/kernel/xsave.c | 8 +++----- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 4fbb419..78169d1 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -448,17 +448,12 @@ static inline void save_init_fpu(struct task_struct *tsk) preempt_enable(); } -static inline void stop_fpu_preload(struct task_struct *tsk) -{ - tsk->fpu_counter = 0; -} - static inline void drop_fpu(struct task_struct *tsk) { /* * Forget coprocessor state.. 
*/ - stop_fpu_preload(tsk); + tsk->fpu_counter = 0; preempt_disable(); __drop_fpu(tsk); preempt_enable(); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 0923d27..07ddc87 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -382,16 +382,14 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; struct user_i387_ia32_struct env; - stop_fpu_preload(tsk); - unlazy_fpu(tsk); + drop_fpu(tsk); if (__copy_from_user(xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) { - drop_fpu(tsk); + __copy_from_user(&env, buf, sizeof(env))) return -1; - } sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); + set_used_math(); } else { /* * For 64-bit frames and 32-bit fsave frames, restore the user -- cgit v1.1 From 377ffbcc536a5a6666dc077395163ab149c02610 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 24 Aug 2012 14:12:58 -0700 Subject: x86, fpu: remove unnecessary user_fpu_end() in save_xstate_sig() Few lines below we do drop_fpu() which is more safer. Remove the unnecessary user_fpu_end() in save_xstate_sig(), which allows the drop_fpu() to ignore any pending exceptions from the user-space and drop the current fpu. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1345842782-24175-3-git-send-email-suresh.b.siddha@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/fpu-internal.h | 17 +++-------------- arch/x86/kernel/xsave.c | 1 - 2 files changed, 3 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 78169d1..52202a6 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -412,22 +412,11 @@ static inline void __drop_fpu(struct task_struct *tsk) } /* - * The actual user_fpu_begin/end() functions - * need to be preemption-safe. + * Need to be preemption-safe. * - * NOTE! user_fpu_end() must be used only after you - * have saved the FP state, and user_fpu_begin() must - * be used only immediately before restoring it. - * These functions do not do any save/restore on - * their own. + * NOTE! user_fpu_begin() must be used only immediately before restoring + * it. This function does not do any save/restore on their own. */ -static inline void user_fpu_end(void) -{ - preempt_disable(); - __thread_fpu_end(current); - preempt_enable(); -} - static inline void user_fpu_begin(void) { preempt_disable(); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 07ddc87..4ac5f2e 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -255,7 +255,6 @@ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) /* Update the thread's fxstate to save the fsave header. */ if (ia32_fxstate) fpu_fxsave(&tsk->thread.fpu); - user_fpu_end(); } else { sanitize_i387_state(tsk); if (__copy_to_user(buf_fx, xsave, xstate_size)) -- cgit v1.1 From 9c1c3fac53378c9782c18f80107965578d7b7167 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 24 Aug 2012 14:12:59 -0700 Subject: x86, kvm: use kernel_fpu_begin/end() in kvm_load/put_guest_fpu() kvm's guest fpu save/restore should be wrapped around kernel_fpu_begin/end(). This will avoid for example taking a DNA in kvm_load_guest_fpu() when it tries to load the fpu immediately after doing unlazy_fpu() on the host side. More importantly this will prevent the host process fpu from being corrupted. 
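A minimal sketch of the bracket this change adopts, assuming only that kernel_fpu_begin()/kernel_fpu_end() are available as declared in asm/i387.h in this tree; the helper name below is invented for illustration and is not part of the patch:

#include <asm/i387.h>

/*
 * Illustrative only: any in-kernel use of FPU/SSE/AVX registers must sit
 * between kernel_fpu_begin() and kernel_fpu_end().  Preemption stays
 * disabled for the whole region, so the region should be short.
 */
static void example_simd_section(void)
{
        kernel_fpu_begin();
        /* ... touch XMM/YMM state, or restore/save a guest FPU image ... */
        kernel_fpu_end();
}

The kvm_load_guest_fpu()/kvm_put_guest_fpu() change in the hunk that follows is exactly this bracket, with the guest fpu restore placed after kernel_fpu_begin() and the save placed before kernel_fpu_end().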
Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1345842782-24175-4-git-send-email-suresh.b.siddha@intel.com Cc: Avi Kivity Signed-off-by: H. Peter Anvin --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 148ed66..cf637f5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5972,7 +5972,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) */ kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; - unlazy_fpu(current); + kernel_fpu_begin(); fpu_restore_checking(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } @@ -5986,6 +5986,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 0; fpu_save_init(&vcpu->arch.guest_fpu); + kernel_fpu_end(); ++vcpu->stat.fpu_reload; kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); trace_kvm_fpu(0); -- cgit v1.1 From 841e3604d35aa70d399146abdc526d8c89a2c2f5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 24 Aug 2012 14:13:00 -0700 Subject: x86, fpu: always use kernel_fpu_begin/end() for in-kernel FPU usage use kernel_fpu_begin/end() instead of unconditionally accessing cr0 and saving/restoring just the few used xmm/ymm registers. This has some advantages like: * If the task's FPU state is already active, then kernel_fpu_begin() will just save the user-state and avoiding the read/write of cr0. In general, cr0 accesses are much slower. * Manual save/restore of xmm/ymm registers will affect the 'modified' and the 'init' optimizations brought in the by xsaveopt/xrstor infrastructure. * Foward compatibility with future vector register extensions will be a problem if the xmm/ymm registers are manually saved and restored (corrupting the extended state of those vector registers). With this patch, there was no significant difference in the xor throughput using AVX, measured during boot. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1345842782-24175-5-git-send-email-suresh.b.siddha@intel.com Cc: Jim Kukunas Cc: NeilBrown Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/xor_32.h | 56 ++++++-------------------------------- arch/x86/include/asm/xor_64.h | 61 +++++++----------------------------------- arch/x86/include/asm/xor_avx.h | 54 +++++++++---------------------------- 3 files changed, 29 insertions(+), 142 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 4545708..aabd585 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -534,38 +534,6 @@ static struct xor_block_template xor_block_p5_mmx = { * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) */ -#define XMMS_SAVE \ -do { \ - preempt_disable(); \ - cr0 = read_cr0(); \ - clts(); \ - asm volatile( \ - "movups %%xmm0,(%0) ;\n\t" \ - "movups %%xmm1,0x10(%0) ;\n\t" \ - "movups %%xmm2,0x20(%0) ;\n\t" \ - "movups %%xmm3,0x30(%0) ;\n\t" \ - : \ - : "r" (xmm_save) \ - : "memory"); \ -} while (0) - -#define XMMS_RESTORE \ -do { \ - asm volatile( \ - "sfence ;\n\t" \ - "movups (%0),%%xmm0 ;\n\t" \ - "movups 0x10(%0),%%xmm1 ;\n\t" \ - "movups 0x20(%0),%%xmm2 ;\n\t" \ - "movups 0x30(%0),%%xmm3 ;\n\t" \ - : \ - : "r" (xmm_save) \ - : "memory"); \ - write_cr0(cr0); \ - preempt_enable(); \ -} while (0) - -#define ALIGN16 __attribute__((aligned(16))) - #define OFFS(x) "16*("#x")" #define PF_OFFS(x) "256+16*("#x")" #define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n" @@ -587,10 +555,8 @@ static void xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) { unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -633,7 +599,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) : : "memory"); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -641,10 +607,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3) { unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -694,7 +658,7 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, : : "memory" ); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -702,10 +666,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3, unsigned long *p4) { unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -762,7 +724,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, : : "memory" ); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -770,10 +732,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3, unsigned long *p4, unsigned long *p5) { unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - XMMS_SAVE; + kernel_fpu_begin(); /* Make sure GCC forgets anything it knows about p4 or p5, such that it won't pass to the asm volatile below a @@ -850,7 +810,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, like assuming they have some legal value. */ asm("" : "=r" (p4), "=r" (p5)); - XMMS_RESTORE; + kernel_fpu_end(); } static struct xor_block_template xor_block_pIII_sse = { diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h index b9b2323..5fc06d0 100644 --- a/arch/x86/include/asm/xor_64.h +++ b/arch/x86/include/asm/xor_64.h @@ -34,41 +34,7 @@ * no advantages to be gotten from x86-64 here anyways. 
*/ -typedef struct { - unsigned long a, b; -} __attribute__((aligned(16))) xmm_store_t; - -/* Doesn't use gcc to save the XMM registers, because there is no easy way to - tell it to do a clts before the register saving. */ -#define XMMS_SAVE \ -do { \ - preempt_disable(); \ - asm volatile( \ - "movq %%cr0,%0 ;\n\t" \ - "clts ;\n\t" \ - "movups %%xmm0,(%1) ;\n\t" \ - "movups %%xmm1,0x10(%1) ;\n\t" \ - "movups %%xmm2,0x20(%1) ;\n\t" \ - "movups %%xmm3,0x30(%1) ;\n\t" \ - : "=&r" (cr0) \ - : "r" (xmm_save) \ - : "memory"); \ -} while (0) - -#define XMMS_RESTORE \ -do { \ - asm volatile( \ - "sfence ;\n\t" \ - "movups (%1),%%xmm0 ;\n\t" \ - "movups 0x10(%1),%%xmm1 ;\n\t" \ - "movups 0x20(%1),%%xmm2 ;\n\t" \ - "movups 0x30(%1),%%xmm3 ;\n\t" \ - "movq %0,%%cr0 ;\n\t" \ - : \ - : "r" (cr0), "r" (xmm_save) \ - : "memory"); \ - preempt_enable(); \ -} while (0) +#include #define OFFS(x) "16*("#x")" #define PF_OFFS(x) "256+16*("#x")" @@ -91,10 +57,8 @@ static void xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) { unsigned int lines = bytes >> 8; - unsigned long cr0; - xmm_store_t xmm_save[4]; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -135,7 +99,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) : [inc] "r" (256UL) : "memory"); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -143,11 +107,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3) { unsigned int lines = bytes >> 8; - xmm_store_t xmm_save[4]; - unsigned long cr0; - - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -194,7 +155,7 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) : [inc] "r" (256UL) : "memory"); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -202,10 +163,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3, unsigned long *p4) { unsigned int lines = bytes >> 8; - xmm_store_t xmm_save[4]; - unsigned long cr0; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -261,7 +220,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, : [inc] "r" (256UL) : "memory" ); - XMMS_RESTORE; + kernel_fpu_end(); } static void @@ -269,10 +228,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, unsigned long *p3, unsigned long *p4, unsigned long *p5) { unsigned int lines = bytes >> 8; - xmm_store_t xmm_save[4]; - unsigned long cr0; - XMMS_SAVE; + kernel_fpu_begin(); asm volatile( #undef BLOCK @@ -336,7 +293,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, : [inc] "r" (256UL) : "memory"); - XMMS_RESTORE; + kernel_fpu_end(); } static struct xor_block_template xor_block_sse = { diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 2510d35..7ea79c5 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -20,32 +20,6 @@ #include #include -#define ALIGN32 __aligned(32) - -#define YMM_SAVED_REGS 4 - -#define YMMS_SAVE \ -do { \ - preempt_disable(); \ - cr0 = read_cr0(); \ - clts(); \ - asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \ - asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \ - asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \ - asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \ -} while (0); - -#define YMMS_RESTORE \ -do { \ - asm volatile("sfence" : : : "memory"); \ - 
asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \ - asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \ - asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \ - asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \ - write_cr0(cr0); \ - preempt_enable(); \ -} while (0); - #define BLOCK4(i) \ BLOCK(32 * i, 0) \ BLOCK(32 * (i + 1), 1) \ @@ -60,10 +34,9 @@ do { \ static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1) { - unsigned long cr0, lines = bytes >> 9; - char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + unsigned long lines = bytes >> 9; - YMMS_SAVE + kernel_fpu_begin(); while (lines--) { #undef BLOCK @@ -82,16 +55,15 @@ do { \ p1 = (unsigned long *)((uintptr_t)p1 + 512); } - YMMS_RESTORE + kernel_fpu_end(); } static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1, unsigned long *p2) { - unsigned long cr0, lines = bytes >> 9; - char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + unsigned long lines = bytes >> 9; - YMMS_SAVE + kernel_fpu_begin(); while (lines--) { #undef BLOCK @@ -113,16 +85,15 @@ do { \ p2 = (unsigned long *)((uintptr_t)p2 + 512); } - YMMS_RESTORE + kernel_fpu_end(); } static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1, unsigned long *p2, unsigned long *p3) { - unsigned long cr0, lines = bytes >> 9; - char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + unsigned long lines = bytes >> 9; - YMMS_SAVE + kernel_fpu_begin(); while (lines--) { #undef BLOCK @@ -147,16 +118,15 @@ do { \ p3 = (unsigned long *)((uintptr_t)p3 + 512); } - YMMS_RESTORE + kernel_fpu_end(); } static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1, unsigned long *p2, unsigned long *p3, unsigned long *p4) { - unsigned long cr0, lines = bytes >> 9; - char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + unsigned long lines = bytes >> 9; - YMMS_SAVE + kernel_fpu_begin(); while (lines--) { #undef BLOCK @@ -184,7 +154,7 @@ do { \ p4 = (unsigned long *)((uintptr_t)p4 + 512); } - YMMS_RESTORE + kernel_fpu_end(); } static struct xor_block_template xor_block_avx = { -- cgit v1.1 From 304bceda6a18ae0b0240b8aac9a6bdf8ce2d2469 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 24 Aug 2012 14:13:02 -0700 Subject: x86, fpu: use non-lazy fpu restore for processors supporting xsave Fundamental model of the current Linux kernel is to lazily init and restore FPU instead of restoring the task state during context switch. This changes that fundamental lazy model to the non-lazy model for the processors supporting xsave feature. Reasons driving this model change are: i. Newer processors support optimized state save/restore using xsaveopt and xrstor by tracking the INIT state and MODIFIED state during context-switch. This is faster than modifying the cr0.TS bit which has serializing semantics. ii. Newer glibc versions use SSE for some of the optimized copy/clear routines. With certain workloads (like boot, kernel-compilation etc), application completes its work with in the first 5 task switches, thus taking upto 5 #DNA traps with the kernel not getting a chance to apply the above mentioned pre-load heuristic. iii. Some xstate features (like AMD's LWP feature) don't honor the cr0.TS bit and thus will not work correctly in the presence of lazy restore. Non-lazy state restore is needed for enabling such features. Some data on a two socket SNB system: * Saved 20K DNA exceptions during boot on a two socket SNB system. * Saved 50K DNA exceptions during kernel-compilation workload. 
* Improved throughput of the AVX based checksumming function inside the kernel by ~15% as xsave/xrstor is faster than the serializing clts/stts pair. Also now kernel_fpu_begin/end() relies on the patched alternative instructions. So move check_fpu() which uses the kernel_fpu_begin/end() after alternative_instructions(). Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1345842782-24175-7-git-send-email-suresh.b.siddha@intel.com Merge 32-bit boot fix from, Link: http://lkml.kernel.org/r/1347300665-6209-4-git-send-email-suresh.b.siddha@intel.com Cc: Jim Kukunas Cc: NeilBrown Cc: Avi Kivity Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/fpu-internal.h | 96 ++++++++++++++++++++++++------------- arch/x86/include/asm/i387.h | 1 + arch/x86/include/asm/xsave.h | 1 + arch/x86/kernel/cpu/bugs.c | 7 ++- arch/x86/kernel/i387.c | 20 ++++++-- arch/x86/kernel/process.c | 12 +++-- arch/x86/kernel/process_32.c | 4 -- arch/x86/kernel/process_64.c | 4 -- arch/x86/kernel/traps.c | 5 +- arch/x86/kernel/xsave.c | 57 +++++++++++++++++----- 10 files changed, 146 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 52202a6..8ca0f9f 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -291,15 +291,48 @@ static inline void __thread_set_has_fpu(struct task_struct *tsk) static inline void __thread_fpu_end(struct task_struct *tsk) { __thread_clear_has_fpu(tsk); - stts(); + if (!use_xsave()) + stts(); } static inline void __thread_fpu_begin(struct task_struct *tsk) { - clts(); + if (!use_xsave()) + clts(); __thread_set_has_fpu(tsk); } +static inline void __drop_fpu(struct task_struct *tsk) +{ + if (__thread_has_fpu(tsk)) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + __thread_fpu_end(tsk); + } +} + +static inline void drop_fpu(struct task_struct *tsk) +{ + /* + * Forget coprocessor state.. + */ + preempt_disable(); + tsk->fpu_counter = 0; + __drop_fpu(tsk); + clear_used_math(); + preempt_enable(); +} + +static inline void drop_init_fpu(struct task_struct *tsk) +{ + if (!use_xsave()) + drop_fpu(tsk); + else + xrstor_state(init_xstate_buf, -1); +} + /* * FPU state switching for scheduling. * @@ -333,7 +366,12 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta { fpu_switch_t fpu; - fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; + /* + * If the task has used the math, pre-load the FPU on xsave processors + * or if the past 5 consecutive context-switches used math. 
+ */ + fpu.preload = tsk_used_math(new) && (use_xsave() || + new->fpu_counter > 5); if (__thread_has_fpu(old)) { if (!__save_init_fpu(old)) cpu = ~0; @@ -345,14 +383,14 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta new->fpu_counter++; __thread_set_has_fpu(new); prefetch(new->thread.fpu.state); - } else + } else if (!use_xsave()) stts(); } else { old->fpu_counter = 0; old->thread.fpu.last_cpu = ~0; if (fpu.preload) { new->fpu_counter++; - if (fpu_lazy_restore(new, cpu)) + if (!use_xsave() && fpu_lazy_restore(new, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); @@ -372,7 +410,7 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { if (fpu.preload) { if (unlikely(restore_fpu_checking(new))) - __thread_fpu_end(new); + drop_init_fpu(new); } } @@ -400,17 +438,6 @@ static inline int restore_xstate_sig(void __user *buf, int ia32_frame) return __restore_xstate_sig(buf, buf_fx, size); } -static inline void __drop_fpu(struct task_struct *tsk) -{ - if (__thread_has_fpu(tsk)) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(tsk); - } -} - /* * Need to be preemption-safe. * @@ -431,24 +458,18 @@ static inline void user_fpu_begin(void) static inline void save_init_fpu(struct task_struct *tsk) { WARN_ON_ONCE(!__thread_has_fpu(tsk)); + + if (use_xsave()) { + xsave_state(&tsk->thread.fpu.state->xsave, -1); + return; + } + preempt_disable(); __save_init_fpu(tsk); __thread_fpu_end(tsk); preempt_enable(); } -static inline void drop_fpu(struct task_struct *tsk) -{ - /* - * Forget coprocessor state.. - */ - tsk->fpu_counter = 0; - preempt_disable(); - __drop_fpu(tsk); - preempt_enable(); - clear_used_math(); -} - /* * i387 state interaction */ @@ -503,12 +524,21 @@ static inline void fpu_free(struct fpu *fpu) } } -static inline void fpu_copy(struct fpu *dst, struct fpu *src) +static inline void fpu_copy(struct task_struct *dst, struct task_struct *src) { - memcpy(dst->state, src->state, xstate_size); -} + if (use_xsave()) { + struct xsave_struct *xsave = &dst->thread.fpu.state->xsave; -extern void fpu_finit(struct fpu *fpu); + memset(&xsave->xsave_hdr, 0, sizeof(struct xsave_hdr_struct)); + xsave_state(xsave, -1); + } else { + struct fpu *dfpu = &dst->thread.fpu; + struct fpu *sfpu = &src->thread.fpu; + + unlazy_fpu(src); + memcpy(dfpu->state, sfpu->state, xstate_size); + } +} static inline unsigned long alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 257d9cc..6c3bd37 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -19,6 +19,7 @@ struct pt_regs; struct user_i387_struct; extern int init_fpu(struct task_struct *child); +extern void fpu_finit(struct fpu *fpu); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern void math_state_restore(void); diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index c1d989a..2ddee1b8 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -34,6 +34,7 @@ extern unsigned int xstate_size; extern u64 pcntxt_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; +extern struct xsave_struct *init_xstate_buf; extern void xsave_init(void); extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c97bb7b..d0e910d 100644 --- 
a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -165,10 +165,15 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); #endif check_config(); - check_fpu(); check_hlt(); check_popad(); init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); + + /* + * kernel_fpu_begin/end() in check_fpu() relies on the patched + * alternative instructions. + */ + check_fpu(); } diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index ab6a2e8..5285574 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -22,7 +22,15 @@ /* * Were we in an interrupt that interrupted kernel mode? * - * We can do a kernel_fpu_begin/end() pair *ONLY* if that + * For now, on xsave platforms we will return interrupted + * kernel FPU as not-idle. TBD: As we use non-lazy FPU restore + * for xsave platforms, ideally we can change the return value + * to something like __thread_has_fpu(current). But we need to + * be careful of doing __thread_clear_has_fpu() before saving + * the FPU etc for supporting nested uses etc. For now, take + * the simple route! + * + * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that * pair does nothing at all: the thread must not have fpu (so * that we don't try to save the FPU state), and TS must * be set (so that the clts/stts pair does nothing that is @@ -30,6 +38,9 @@ */ static inline bool interrupted_kernel_fpu_idle(void) { + if (use_xsave()) + return 0; + return !__thread_has_fpu(current) && (read_cr0() & X86_CR0_TS); } @@ -73,7 +84,7 @@ void kernel_fpu_begin(void) __save_init_fpu(me); __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ - } else { + } else if (!use_xsave()) { this_cpu_write(fpu_owner_task, NULL); clts(); } @@ -82,7 +93,10 @@ EXPORT_SYMBOL(kernel_fpu_begin); void kernel_fpu_end(void) { - stts(); + if (use_xsave()) + math_state_restore(); + else + stts(); preempt_enable(); } EXPORT_SYMBOL(kernel_fpu_end); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 30069d1..c21e30f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -66,15 +66,13 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { int ret; - unlazy_fpu(src); - *dst = *src; if (fpu_allocated(&src->thread.fpu)) { memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); ret = fpu_alloc(&dst->thread.fpu); if (ret) return ret; - fpu_copy(&dst->thread.fpu, &src->thread.fpu); + fpu_copy(dst, src); } return 0; } @@ -153,7 +151,13 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - drop_fpu(tsk); + drop_init_fpu(tsk); + /* + * Free the FPU state for non xsave platforms. They get reallocated + * lazily at the first use. 
+ */ + if (!use_xsave()) + free_thread_xstate(tsk); } static void hard_disable_TSC(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 516fa18..b9ff83c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -190,10 +190,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->cs = __USER_CS; regs->ip = new_ip; regs->sp = new_sp; - /* - * Free the old FP and other extended state - */ - free_thread_xstate(current); } EXPORT_SYMBOL_GPL(start_thread); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0a980c9..8a6d20c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -232,10 +232,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; - /* - * Free the old FP and other extended state - */ - free_thread_xstate(current); } void diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b481341..ac7d527 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -613,11 +613,12 @@ void math_state_restore(void) } __thread_fpu_begin(tsk); + /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ if (unlikely(restore_fpu_checking(tsk))) { - __thread_fpu_end(tsk); + drop_init_fpu(tsk); force_sig(SIGSEGV, tsk); return; } @@ -629,6 +630,8 @@ EXPORT_SYMBOL_GPL(math_state_restore); dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { + BUG_ON(use_xsave()); + #ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 4ac5f2e..e7752bd 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -21,7 +21,7 @@ u64 pcntxt_mask; /* * Represents init state for the supported extended state. */ -static struct xsave_struct *init_xstate_buf; +struct xsave_struct *init_xstate_buf; static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; @@ -268,7 +268,7 @@ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) return -1; - drop_fpu(tsk); /* trigger finit */ + drop_init_fpu(tsk); /* trigger finit */ return 0; } @@ -340,7 +340,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) config_enabled(CONFIG_IA32_EMULATION)); if (!buf) { - drop_fpu(tsk); + drop_init_fpu(tsk); return 0; } @@ -380,15 +380,30 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) */ struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; struct user_i387_ia32_struct env; + int err = 0; + /* + * Drop the current fpu which clears used_math(). This ensures + * that any context-switch during the copy of the new state, + * avoids the intermediate state from getting restored/saved. + * Thus avoiding the new restored state from getting corrupted. + * We will be ready to restore/save the state only after + * set_used_math() is again set. 
+ */ drop_fpu(tsk); if (__copy_from_user(xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) - return -1; + __copy_from_user(&env, buf, sizeof(env))) { + err = -1; + } else { + sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); + set_used_math(); + } - sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); - set_used_math(); + if (use_xsave()) + math_state_restore(); + + return err; } else { /* * For 64-bit frames and 32-bit fsave frames, restore the user @@ -396,7 +411,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) */ user_fpu_begin(); if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { - drop_fpu(tsk); + drop_init_fpu(tsk); return -1; } } @@ -435,11 +450,29 @@ static void prepare_fx_sw_frame(void) */ static inline void xstate_enable(void) { + clts(); set_in_cr4(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } /* + * This is same as math_state_restore(). But use_xsave() is not yet + * patched to use math_state_restore(). + */ +static inline void init_restore_xstate(void) +{ + init_fpu(current); + __thread_fpu_begin(current); + xrstor_state(init_xstate_buf, -1); +} + +static inline void xstate_enable_ap(void) +{ + xstate_enable(); + init_restore_xstate(); +} + +/* * Record the offsets and sizes of different state managed by the xsave * memory layout. */ @@ -479,7 +512,6 @@ static void __init setup_xstate_init(void) __alignof__(struct xsave_struct)); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; - clts(); /* * Init all the features state with header_bv being 0x0 */ @@ -489,7 +521,6 @@ static void __init setup_xstate_init(void) * of any feature which is not represented by all zero's. */ xsave_state(init_xstate_buf, -1); - stts(); } /* @@ -533,6 +564,10 @@ static void __init xstate_enable_boot_cpu(void) pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", pcntxt_mask, xstate_size); + + current->thread.fpu.state = + alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); + init_restore_xstate(); } /* @@ -551,6 +586,6 @@ void __cpuinit xsave_init(void) return; this_func = next_func; - next_func = xstate_enable; + next_func = xstate_enable_ap; this_func(); } -- cgit v1.1 From 5d2bd7009f306c82afddd1ca4d9763ad8473c216 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 6 Sep 2012 14:58:52 -0700 Subject: x86, fpu: decouple non-lazy/eager fpu restore from xsave Decouple non-lazy/eager fpu restore policy from the existence of the xsave feature. Introduce a synthetic CPUID flag to represent the eagerfpu policy. "eagerfpu=on" boot paramter will enable the policy. Requested-by: H. Peter Anvin Requested-by: Linus Torvalds Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1347300665-6209-2-git-send-email-suresh.b.siddha@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 + arch/x86/include/asm/fpu-internal.h | 54 ++++++++++++++++------- arch/x86/kernel/cpu/common.c | 2 - arch/x86/kernel/i387.c | 25 ++++------- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/traps.c | 2 +- arch/x86/kernel/xsave.c | 87 ++++++++++++++++++++++++------------- 7 files changed, 108 insertions(+), 66 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 6b7ee5f..5dd2b47 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -97,6 +97,7 @@ #define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */ +#define X86_FEATURE_EAGER_FPU (3*32+29) /* "eagerfpu" Non lazy FPU restore */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ @@ -305,6 +306,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) #define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8) #define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) +#define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 8ca0f9f..0ca72f0 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -38,6 +38,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, extern unsigned int mxcsr_feature_mask; extern void fpu_init(void); +extern void eager_fpu_init(void); DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); @@ -84,6 +85,11 @@ static inline int is_x32_frame(void) #define X87_FSW_ES (1 << 7) /* Exception Summary */ +static __always_inline __pure bool use_eager_fpu(void) +{ + return static_cpu_has(X86_FEATURE_EAGER_FPU); +} + static __always_inline __pure bool use_xsaveopt(void) { return static_cpu_has(X86_FEATURE_XSAVEOPT); @@ -99,6 +105,14 @@ static __always_inline __pure bool use_fxsr(void) return static_cpu_has(X86_FEATURE_FXSR); } +static inline void fx_finit(struct i387_fxsave_struct *fx) +{ + memset(fx, 0, xstate_size); + fx->cwd = 0x37f; + if (cpu_has_xmm) + fx->mxcsr = MXCSR_DEFAULT; +} + extern void __sanitize_i387_state(struct task_struct *); static inline void sanitize_i387_state(struct task_struct *tsk) @@ -291,13 +305,13 @@ static inline void __thread_set_has_fpu(struct task_struct *tsk) static inline void __thread_fpu_end(struct task_struct *tsk) { __thread_clear_has_fpu(tsk); - if (!use_xsave()) + if (!use_eager_fpu()) stts(); } static inline void __thread_fpu_begin(struct task_struct *tsk) { - if (!use_xsave()) + if (!use_eager_fpu()) clts(); __thread_set_has_fpu(tsk); } @@ -327,10 +341,14 @@ static inline void drop_fpu(struct task_struct *tsk) static inline void drop_init_fpu(struct task_struct *tsk) { - if (!use_xsave()) + if (!use_eager_fpu()) drop_fpu(tsk); - else - xrstor_state(init_xstate_buf, -1); + else { + if (use_xsave()) + xrstor_state(init_xstate_buf, -1); + else + fxrstor_checking(&init_xstate_buf->i387); + } } /* @@ -370,7 +388,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta * If the task has used the math, pre-load the FPU on xsave processors * or if the past 5 consecutive context-switches used math. 
*/ - fpu.preload = tsk_used_math(new) && (use_xsave() || + fpu.preload = tsk_used_math(new) && (use_eager_fpu() || new->fpu_counter > 5); if (__thread_has_fpu(old)) { if (!__save_init_fpu(old)) @@ -383,14 +401,14 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta new->fpu_counter++; __thread_set_has_fpu(new); prefetch(new->thread.fpu.state); - } else if (!use_xsave()) + } else if (!use_eager_fpu()) stts(); } else { old->fpu_counter = 0; old->thread.fpu.last_cpu = ~0; if (fpu.preload) { new->fpu_counter++; - if (!use_xsave() && fpu_lazy_restore(new, cpu)) + if (!use_eager_fpu() && fpu_lazy_restore(new, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); @@ -452,6 +470,14 @@ static inline void user_fpu_begin(void) preempt_enable(); } +static inline void __save_fpu(struct task_struct *tsk) +{ + if (use_xsave()) + xsave_state(&tsk->thread.fpu.state->xsave, -1); + else + fpu_fxsave(&tsk->thread.fpu); +} + /* * These disable preemption on their own and are safe */ @@ -459,8 +485,8 @@ static inline void save_init_fpu(struct task_struct *tsk) { WARN_ON_ONCE(!__thread_has_fpu(tsk)); - if (use_xsave()) { - xsave_state(&tsk->thread.fpu.state->xsave, -1); + if (use_eager_fpu()) { + __save_fpu(tsk); return; } @@ -526,11 +552,9 @@ static inline void fpu_free(struct fpu *fpu) static inline void fpu_copy(struct task_struct *dst, struct task_struct *src) { - if (use_xsave()) { - struct xsave_struct *xsave = &dst->thread.fpu.state->xsave; - - memset(&xsave->xsave_hdr, 0, sizeof(struct xsave_hdr_struct)); - xsave_state(xsave, -1); + if (use_eager_fpu()) { + memset(&dst->thread.fpu.state->xsave, 0, xstate_size); + __save_fpu(dst); } else { struct fpu *dfpu = &dst->thread.fpu; struct fpu *sfpu = &src->thread.fpu; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a5fbc3c..b0fe078 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1297,7 +1297,6 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); - xsave_init(); raw_local_save_flags(kernel_eflags); @@ -1352,6 +1351,5 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); - xsave_init(); } #endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 5285574..6782e39 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -22,9 +22,8 @@ /* * Were we in an interrupt that interrupted kernel mode? * - * For now, on xsave platforms we will return interrupted - * kernel FPU as not-idle. TBD: As we use non-lazy FPU restore - * for xsave platforms, ideally we can change the return value + * For now, with eagerfpu we will return interrupted kernel FPU + * state as not-idle. TBD: Ideally we can change the return value * to something like __thread_has_fpu(current). But we need to * be careful of doing __thread_clear_has_fpu() before saving * the FPU etc for supporting nested uses etc. 
For now, take @@ -38,7 +37,7 @@ */ static inline bool interrupted_kernel_fpu_idle(void) { - if (use_xsave()) + if (use_eager_fpu()) return 0; return !__thread_has_fpu(current) && @@ -84,7 +83,7 @@ void kernel_fpu_begin(void) __save_init_fpu(me); __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ - } else if (!use_xsave()) { + } else if (!use_eager_fpu()) { this_cpu_write(fpu_owner_task, NULL); clts(); } @@ -93,7 +92,7 @@ EXPORT_SYMBOL(kernel_fpu_begin); void kernel_fpu_end(void) { - if (use_xsave()) + if (use_eager_fpu()) math_state_restore(); else stts(); @@ -122,7 +121,6 @@ static void __cpuinit mxcsr_feature_mask_init(void) { unsigned long mask = 0; - clts(); if (cpu_has_fxsr) { memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); asm volatile("fxsave %0" : : "m" (fx_scratch)); @@ -131,7 +129,6 @@ static void __cpuinit mxcsr_feature_mask_init(void) mask = 0x0000ffbf; } mxcsr_feature_mask &= mask; - stts(); } static void __cpuinit init_thread_xstate(void) @@ -185,9 +182,8 @@ void __cpuinit fpu_init(void) init_thread_xstate(); mxcsr_feature_mask_init(); - /* clean state in init */ - current_thread_info()->status = 0; - clear_used_math(); + xsave_init(); + eager_fpu_init(); } void fpu_finit(struct fpu *fpu) @@ -198,12 +194,7 @@ void fpu_finit(struct fpu *fpu) } if (cpu_has_fxsr) { - struct i387_fxsave_struct *fx = &fpu->state->fxsave; - - memset(fx, 0, xstate_size); - fx->cwd = 0x37f; - if (cpu_has_xmm) - fx->mxcsr = MXCSR_DEFAULT; + fx_finit(&fpu->state->fxsave); } else { struct i387_fsave_struct *fp = &fpu->state->fsave; memset(fp, 0, xstate_size); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c21e30f..dc3567e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -156,7 +156,7 @@ void flush_thread(void) * Free the FPU state for non xsave platforms. They get reallocated * lazily at the first use. */ - if (!use_xsave()) + if (!use_eager_fpu()) free_thread_xstate(tsk); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ac7d527..4f4aba0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -630,7 +630,7 @@ EXPORT_SYMBOL_GPL(math_state_restore); dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { - BUG_ON(use_xsave()); + BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index e7752bd..c0afd2c 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -400,7 +400,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) set_used_math(); } - if (use_xsave()) + if (use_eager_fpu()) math_state_restore(); return err; @@ -450,29 +450,11 @@ static void prepare_fx_sw_frame(void) */ static inline void xstate_enable(void) { - clts(); set_in_cr4(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } /* - * This is same as math_state_restore(). But use_xsave() is not yet - * patched to use math_state_restore(). - */ -static inline void init_restore_xstate(void) -{ - init_fpu(current); - __thread_fpu_begin(current); - xrstor_state(init_xstate_buf, -1); -} - -static inline void xstate_enable_ap(void) -{ - xstate_enable(); - init_restore_xstate(); -} - -/* * Record the offsets and sizes of different state managed by the xsave * memory layout. 
*/ @@ -500,17 +482,20 @@ static void __init setup_xstate_features(void) /* * setup the xstate image representing the init state */ -static void __init setup_xstate_init(void) +static void __init setup_init_fpu_buf(void) { - setup_xstate_features(); - /* * Setup init_xstate_buf to represent the init state of * all the features managed by the xsave */ init_xstate_buf = alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); - init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; + fx_finit(&init_xstate_buf->i387); + + if (!cpu_has_xsave) + return; + + setup_xstate_features(); /* * Init all the features state with header_bv being 0x0 @@ -523,6 +508,17 @@ static void __init setup_xstate_init(void) xsave_state(init_xstate_buf, -1); } +static int disable_eagerfpu; +static int __init eager_fpu_setup(char *s) +{ + if (!strcmp(s, "on")) + setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + else if (!strcmp(s, "off")) + disable_eagerfpu = 1; + return 1; +} +__setup("eagerfpu=", eager_fpu_setup); + /* * Enable and initialize the xsave feature. */ @@ -559,15 +555,10 @@ static void __init xstate_enable_boot_cpu(void) update_regset_xstate_info(xstate_size, pcntxt_mask); prepare_fx_sw_frame(); - - setup_xstate_init(); + setup_init_fpu_buf(); pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", pcntxt_mask, xstate_size); - - current->thread.fpu.state = - alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); - init_restore_xstate(); } /* @@ -586,6 +577,42 @@ void __cpuinit xsave_init(void) return; this_func = next_func; - next_func = xstate_enable_ap; + next_func = xstate_enable; this_func(); } + +static inline void __init eager_fpu_init_bp(void) +{ + current->thread.fpu.state = + alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); + if (!init_xstate_buf) + setup_init_fpu_buf(); +} + +void __cpuinit eager_fpu_init(void) +{ + static __refdata void (*boot_func)(void) = eager_fpu_init_bp; + + clear_used_math(); + current_thread_info()->status = 0; + if (!cpu_has_eager_fpu) { + stts(); + return; + } + + if (boot_func) { + boot_func(); + boot_func = NULL; + } + + /* + * This is same as math_state_restore(). But use_xsave() is + * not yet patched to use math_state_restore(). + */ + init_fpu(current); + __thread_fpu_begin(current); + if (cpu_has_xsave) + xrstor_state(init_xstate_buf, -1); + else + fxrstor_checking(&init_xstate_buf->i387); +} -- cgit v1.1 From 212b02125f3725127148475059a70031bd031bdb Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 6 Sep 2012 15:05:18 -0700 Subject: x86, fpu: enable eagerfpu by default for xsaveopt xsaveopt/xrstor support optimized state save/restore by tracking the INIT state and MODIFIED state during context-switch. Enable eagerfpu by default for processors supporting xsaveopt. Can be disabled by passing "eagerfpu=off" boot parameter. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1347300665-6209-3-git-send-email-suresh.b.siddha@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/xsave.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 5dd2b47..0debdb5 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -300,6 +300,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT) #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index c0afd2c..e99f754 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -557,6 +557,9 @@ static void __init xstate_enable_boot_cpu(void) prepare_fx_sw_frame(); setup_init_fpu_buf(); + if (cpu_has_xsaveopt && !disable_eagerfpu) + setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", pcntxt_mask, xstate_size); } -- cgit v1.1 From e00229819f306b1f86134095347e9187dc346bd1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 10 Sep 2012 10:32:32 -0700 Subject: x86, fpu: make eagerfpu= boot param tri-state Add the "eagerfpu=auto" (that selects the default scheme in enabling eagerfpu) which can override compiled-in boot parameters like "eagerfpu=on/off" (that force enable/disable eagerfpu). Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1347300665-6209-5-git-send-email-suresh.b.siddha@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/xsave.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index e99f754..4e89b3d 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -508,13 +508,15 @@ static void __init setup_init_fpu_buf(void) xsave_state(init_xstate_buf, -1); } -static int disable_eagerfpu; +static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; static int __init eager_fpu_setup(char *s) { if (!strcmp(s, "on")) - setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + eagerfpu = ENABLE; else if (!strcmp(s, "off")) - disable_eagerfpu = 1; + eagerfpu = DISABLE; + else if (!strcmp(s, "auto")) + eagerfpu = AUTO; return 1; } __setup("eagerfpu=", eager_fpu_setup); @@ -557,8 +559,9 @@ static void __init xstate_enable_boot_cpu(void) prepare_fx_sw_frame(); setup_init_fpu_buf(); - if (cpu_has_xsaveopt && !disable_eagerfpu) - setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + /* Auto enable eagerfpu for xsaveopt */ + if (cpu_has_xsaveopt && eagerfpu != DISABLE) + eagerfpu = ENABLE; pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", pcntxt_mask, xstate_size); @@ -598,6 +601,10 @@ void __cpuinit eager_fpu_init(void) clear_used_math(); current_thread_info()->status = 0; + + if (eagerfpu == ENABLE) + setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + if (!cpu_has_eager_fpu) { stts(); return; -- cgit v1.1 From a8615af4bc3621cb01096541dafa6f68352ec2d9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 10 Sep 2012 10:40:08 -0700 Subject: x86, fpu: remove cpu_has_xmm check in the fx_finit() CPUs with FXSAVE but no XMM/MXCSR (Pentium II from Intel, Crusoe/TM-3xxx/5xxx from Transmeta, and presumably some of the K6 generation from AMD) ever looked at the mxcsr field during fxrstor/fxsave. So remove the cpu_has_xmm check in the fx_finit() Reported-by: Al Viro Acked-by: H. Peter Anvin Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1347300665-6209-6-git-send-email-suresh.b.siddha@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/fpu-internal.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 0ca72f0..92f3c6e 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -109,8 +109,7 @@ static inline void fx_finit(struct i387_fxsave_struct *fx) { memset(fx, 0, xstate_size); fx->cwd = 0x37f; - if (cpu_has_xmm) - fx->mxcsr = MXCSR_DEFAULT; + fx->mxcsr = MXCSR_DEFAULT; } extern void __sanitize_i387_state(struct task_struct *); -- cgit v1.1 From 3ddbebf878ac8d958bb34e87a742a6b3adc283a3 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 17 Sep 2012 13:22:53 +0200 Subject: PCI: Discard __init annotations for pci_fixup_irqs() and related functions Remove the __init annotations in order to keep pci_fixup_irqs() around after init (e.g. for hotplug). This requires the same change for the implementation of pcibios_update_irq() on all architectures. While at it, all __devinit annotations are removed as well, since they will be useless now that HOTPLUG is always on. 
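To make the motivation concrete: __init code is placed in .init.text and freed once boot completes, so any function that a hotplug path may still call must not carry the annotation. A minimal sketch under that assumption, with function names invented for illustration:

#include <linux/init.h>

static int __init boot_time_only_setup(void)
{
        /* Placed in .init.text; this memory is released after boot. */
        return 0;
}

int hotplug_reachable_fixup(void)
{
        /* Must stay resident: calling freed init code after boot would fault. */
        return 0;
}

Dropping __init from pci_fixup_irqs() and from the pcibios_update_irq() implementations keeps them in the second category.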
Signed-off-by: Thierry Reding Signed-off-by: Bjorn Helgaas Acked-by: Greg Kroah-Hartman --- arch/x86/pci/visws.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index 6f2f8ee..9d736e7 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -62,7 +62,7 @@ out: return irq; } -void __init pcibios_update_irq(struct pci_dev *dev, int irq) +void pcibios_update_irq(struct pci_dev *dev, int irq) { pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } -- cgit v1.1 From 8885b7b637fa9aca7e1b00581a0173c6956966d3 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 17 Sep 2012 13:22:54 +0200 Subject: PCI: Provide a default pcibios_update_irq() Most architectures implement this in exactly the same way. Instead of having each architecture duplicate this function, provide a single implementation in the core and make it a weak symbol so that it can be overridden on architectures where it is required. Signed-off-by: Thierry Reding Signed-off-by: Bjorn Helgaas --- arch/x86/pci/visws.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index 9d736e7..3e6d2a6 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -62,11 +62,6 @@ out: return irq; } -void pcibios_update_irq(struct pci_dev *dev, int irq) -{ - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); -} - int __init pci_visws_init(void) { pcibios_enable_irq = &pci_visws_enable_irq; -- cgit v1.1 From 1e6dd8adc78d4a153db253d051fd4ef6c49c9019 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Sep 2012 15:31:26 +0300 Subject: perf: Fix off by one test in perf_reg_value() The test should be >= ARRAY_SIZE() instead of > ARRAY_SIZE(). Signed-off-by: Dan Carpenter Acked-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20120905123126.GC6128@elgon.mountain Signed-off-by: Ingo Molnar --- arch/x86/kernel/perf_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index c5a3e5c..e309cc5 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -57,7 +57,7 @@ static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { u64 perf_reg_value(struct pt_regs *regs, int idx) { - if (WARN_ON_ONCE(idx > ARRAY_SIZE(pt_regs_offset))) + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) return 0; return regs_get_register(regs, pt_regs_offset[idx]); -- cgit v1.1 From 924e101a7ab6f884047f4344e5f1154a4bcd63a6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 14 Sep 2012 18:37:46 +0200 Subject: x86/debug: Dump family, model, stepping of the boot CPU When acting on a user bug report, we find ourselves constantly asking for /proc/cpuinfo in order to know the exact family, model, stepping of the CPU in question. Instead of having to ask this, add this to dmesg so that it is visible and no ambiguities can ensue from looking at the official name string of the CPU coming from CPUID and trying to map it to f/m/s. Output then looks like this: [ 0.146041] smpboot: CPU0: AMD FX(tm)-8100 Eight-Core Processor (fam: 15, model: 01, stepping: 02) Signed-off-by: Borislav Petkov Cc: Andreas Herrmann Link: http://lkml.kernel.org/r/1347640666-13638-1-git-send-email-bp@amd64.org [ tweaked it minimally to add commas. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a5fbc3c..1cc48ff 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1023,14 +1023,16 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) printk(KERN_CONT "%s ", vendor); if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); + printk(KERN_CONT "%s", strim(c->x86_model_id)); else printk(KERN_CONT "%d86", c->x86); + printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model); + if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT " stepping %02x\n", c->x86_mask); + printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask); else - printk(KERN_CONT "\n"); + printk(KERN_CONT ")\n"); print_cpu_msr(c); } -- cgit v1.1 From e26a44a2d618a491d5c6a2a8aaf66ee03a94739f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 18 Sep 2012 12:16:14 +0100 Subject: x86: Use REP BSF unconditionally Make "REP BSF" unconditional, as per the suggestion of hpa and Linus, this removes the insane BSF_PREFIX conditional and simplifies the logic. Suggested-by: "H. Peter Anvin" Suggested-by: Linus Torvalds Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/5058741E020000780009C014@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bitops.h | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index b2af664..6dfd019 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -347,19 +347,6 @@ static int test_bit(int nr, const volatile unsigned long *addr); ? constant_test_bit((nr), (addr)) \ : variable_test_bit((nr), (addr))) -#if (defined(CONFIG_X86_GENERIC) || defined(CONFIG_GENERIC_CPU)) \ - && !defined(CONFIG_CC_OPTIMIZE_FOR_SIZE) -/* - * Since BSF and TZCNT have sufficiently similar semantics for the purposes - * for which we use them here, BMI-capable hardware will decode the prefixed - * variant as 'tzcnt ...' and may execute that faster than 'bsf ...', while - * older hardware will ignore the REP prefix and decode it as 'bsf ...'. - */ -# define BSF_PREFIX "rep;" -#else -# define BSF_PREFIX -#endif - /** * __ffs - find first set bit in word * @word: The word to search @@ -368,7 +355,7 @@ static int test_bit(int nr, const volatile unsigned long *addr); */ static inline unsigned long __ffs(unsigned long word) { - asm(BSF_PREFIX "bsf %1,%0" + asm("rep; bsf %1,%0" : "=r" (word) : "rm" (word)); return word; @@ -382,14 +369,12 @@ static inline unsigned long __ffs(unsigned long word) */ static inline unsigned long ffz(unsigned long word) { - asm(BSF_PREFIX "bsf %1,%0" + asm("rep; bsf %1,%0" : "=r" (word) : "r" (~word)); return word; } -#undef BSF_PREFIX - /* * __fls: find last set bit in word * @word: The word to search -- cgit v1.1 From 20a36e39d59757252edbbdcf9574ae2998733ce9 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 11 Sep 2012 01:07:01 +0200 Subject: perf/x86: Fix Intel Ivy Bridge support This patch updates the existing Intel IvyBridge (model 58) support with proper PEBS event constraints. It cannot reuse the same as SandyBridge because some events (0xd3) are specific to IvyBridge. Also there is no UOPS_DISPATCHED.THREAD on IVB, so do not populate the PERF_COUNT_HW_STALLED_CYCLES_BACKEND mapping. 
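Returning to the "REP BSF" change above: a minimal x86-only user-space sketch (the helper name is hypothetical, this is not the kernel's __ffs) of the encoding trick the commit relies on. BMI-capable CPUs decode the F3 (rep) prefix as TZCNT, older CPUs simply ignore the prefix and execute plain BSF, and for any non-zero input both return the index of the lowest set bit.

#include <stdio.h>

static unsigned long demo_ffs(unsigned long word)
{
        /* same instruction sequence the kernel now emits unconditionally */
        __asm__("rep; bsf %1,%0" : "=r" (word) : "rm" (word));
        return word;
}

int main(void)
{
        printf("lowest set bit of 0x40 is %lu\n", demo_ffs(0x40UL)); /* prints 6 */
        return 0;
}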
Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Link: http://lkml.kernel.org/r/20120910230701.GA5898@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 2 ++ arch/x86/kernel/cpu/perf_event_intel.c | 24 +++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event_intel_ds.c | 14 ++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6605a81..8b6defe 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -586,6 +586,8 @@ extern struct event_constraint intel_westmere_pebs_event_constraints[]; extern struct event_constraint intel_snb_pebs_event_constraints[]; +extern struct event_constraint intel_ivb_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_enable(struct perf_event *event); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 0d3d63a..6bca492 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2048,7 +2048,6 @@ __init int intel_pmu_init(void) case 42: /* SandyBridge */ case 45: /* SandyBridge, "Romely-EP" */ x86_add_quirk(intel_sandybridge_quirk); - case 58: /* IvyBridge */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, @@ -2073,6 +2072,29 @@ __init int intel_pmu_init(void) pr_cont("SandyBridge events, "); break; + case 58: /* IvyBridge */ + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + + pr_cont("IvyBridge events, "); + break; + default: switch (x86_pmu.version) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index e38d97b..826054a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -407,6 +407,20 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_ivb_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + struct event_constraint 
*intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; -- cgit v1.1 From 4b8073e467e6a66b6a5a8e799d28bc3b243c0d78 Mon Sep 17 00:00:00 2001 From: Peter Senna Tschudin Date: Tue, 18 Sep 2012 18:36:14 +0200 Subject: arch/x86: Remove unecessary semicolons Found by http://coccinelle.lip6.fr/ Signed-off-by: Peter Senna Tschudin Cc: avi@redhat.com Cc: mtosatti@redhat.com Cc: a.p.zijlstra@chello.nl Cc: rusty@rustcorp.com.au Cc: masami.hiramatsu.pt@hitachi.com Cc: suresh.b.siddha@intel.com Cc: joerg.roedel@amd.com Cc: agordeev@redhat.com Cc: yinghai@kernel.org Cc: bhelgaas@google.com Cc: liuj97@gmail.com Link: http://lkml.kernel.org/r/1347986174-30287-7-git-send-email-peter.senna@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 4 ++-- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/pci/mmconfig-shared.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ced4534..3318b1e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -317,7 +317,7 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end, /* turn DS segment override prefix into lock prefix */ if (*ptr == 0x3e) text_poke(ptr, ((unsigned char []){0xf0}), 1); - }; + } mutex_unlock(&text_mutex); } @@ -338,7 +338,7 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, /* turn lock prefix into DS segment override prefix */ if (*ptr == 0xf0) text_poke(ptr, ((unsigned char []){0x3E}), 1); - }; + } mutex_unlock(&text_mutex); } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 24deb30..b17416e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1934,7 +1934,7 @@ void smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); i++; v1 >>= 1; - }; + } apic_printk(APIC_DEBUG, KERN_CONT "\n"); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b1eb202..b06737d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4543,7 +4543,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } - }; + } break; case 2: /* clts */ handle_clts(vcpu); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 937bcec..704b9ec 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -585,7 +585,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header) while (i >= sizeof(struct acpi_mcfg_allocation)) { entries++; i -= sizeof(struct acpi_mcfg_allocation); - }; + } if (entries == 0) { pr_err(PREFIX "MMCONFIG has no entries\n"); return -ENODEV; -- cgit v1.1 From 2d297480037e1d9100ca504737820c1bf65db6c0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Sep 2012 15:30:42 +0300 Subject: x86, microcode, AMD: Fix use after free in free_cache() list_for_each_entry_reverse() dereferences the iterator, but we already freed it. I don't see a reason that this has to be done in reverse order so change it to use list_for_each_entry_safe(). 
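A user-space analogue (plain next pointers rather than the kernel's struct list_head, names hypothetical) of the safe-iteration pattern the fix below switches to: the successor is cached before the current node is released, so the walk never dereferences freed memory.

#include <stdlib.h>

struct patch {
        struct patch *next;
        void *data;
};

static void free_cache_demo(struct patch *head)
{
        struct patch *p = head, *tmp;

        while (p) {
                tmp = p->next;   /* remember the successor first ...            */
                free(p->data);
                free(p);         /* ... so freeing p cannot poison the iterator */
                p = tmp;
        }
}

int main(void)
{
        struct patch *a = calloc(1, sizeof(*a));
        struct patch *b = calloc(1, sizeof(*b));

        a->next = b;
        a->data = malloc(16);
        b->data = malloc(16);
        free_cache_demo(a);
        return 0;
}

list_for_each_entry_reverse(), by contrast, computes the next iteration point from the node that was just freed, which is exactly the use-after-free the patch removes.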
Signed-off-by: Dan Carpenter Signed-off-by: Borislav Petkov --- arch/x86/kernel/microcode_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 5511216..7720ff5 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -150,9 +150,9 @@ static void update_cache(struct ucode_patch *new_patch) static void free_cache(void) { - struct ucode_patch *p; + struct ucode_patch *p, *tmp; - list_for_each_entry_reverse(p, &pcache, plist) { + list_for_each_entry_safe(p, tmp, &pcache, plist) { __list_del(p->plist.prev, p->plist.next); kfree(p->data); kfree(p); -- cgit v1.1 From bd49940a35ec7d488ae63bd625639893b3385b97 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 19 Sep 2012 08:30:55 -0400 Subject: xen/boot: Disable BIOS SMP MP table search. As the initial domain we are able to search/map certain regions of memory to harvest configuration data. For all low-level we use ACPI tables - for interrupts we use exclusively ACPI _PRT (so DSDT) and MADT for INT_SRC_OVR. The SMP MP table is not used at all. As a matter of fact we do not even support machines that only have SMP MP but no ACPI tables. Lets follow how Moorestown does it and just disable searching for BIOS SMP tables. This also fixes an issue on HP Proliant BL680c G5 and DL380 G6: 9f->100 for 1:1 PTE Freeing 9f-100 pfn range: 97 pages freed 1-1 mapping on 9f->100 .. snip.. e820: BIOS-provided physical RAM map: Xen: [mem 0x0000000000000000-0x000000000009efff] usable Xen: [mem 0x000000000009f400-0x00000000000fffff] reserved Xen: [mem 0x0000000000100000-0x00000000cfd1dfff] usable .. snip.. Scan for SMP in [mem 0x00000000-0x000003ff] Scan for SMP in [mem 0x0009fc00-0x0009ffff] Scan for SMP in [mem 0x000f0000-0x000fffff] found SMP MP-table at [mem 0x000f4fa0-0x000f4faf] mapped at [ffff8800000f4fa0] (XEN) mm.c:908:d0 Error getting mfn 100 (pfn 5555555555555555) from L1 entry 0000000000100461 for l1e_owner=0, pg_owner=0 (XEN) mm.c:4995:d0 ptwr_emulate: could not get_page_from_l1e() BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] xen_set_pte_init+0x66/0x71 . snip.. Pid: 0, comm: swapper Not tainted 3.6.0-rc6upstream-00188-gb6fb969-dirty #2 HP ProLiant BL680c G5 .. snip.. Call Trace: [] __early_ioremap+0x18a/0x248 [] ? printk+0x48/0x4a [] early_ioremap+0x13/0x15 [] get_mpc_size+0x2f/0x67 [] smp_scan_config+0x10c/0x136 [] default_find_smp_config+0x36/0x5a [] setup_arch+0x5b3/0xb5b [] ? printk+0x48/0x4a [] start_kernel+0x90/0x390 [] x86_64_start_reservations+0x131/0x136 [] xen_start_kernel+0x65f/0x661 (XEN) Domain 0 crashed: 'noreboot' set - not rebooting. which is that ioremap would end up mapping 0xff using _PAGE_IOMAP (which is what early_ioremap sticks as a flag) - which meant we would get MFN 0xFF (pte ff461, which is OK), and then it would also map 0x100 (b/c ioremap tries to get page aligned request, and it was trying to map 0xf4fa0 + PAGE_SIZE - so it mapped the next page) as _PAGE_IOMAP. Since 0x100 is actually a RAM page, and the _PAGE_IOMAP bypasses the P2M lookup we would happily set the PTE to 1000461. Xen would deny the request since we do not have access to the Machine Frame Number (MFN) of 0x100. The P2M[0x100] is for example 0x80140. 
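A minimal sketch (hypothetical names, no kernel infrastructure) of the x86_init-style hook-table pattern the fix relies on: platform setup code owns a structure of function pointers and a paravirtualized guest can stub out the MP-table scan so it never touches the memory ranges in question.

#include <stdio.h>

struct mpparse_ops {
        void (*find_smp_config)(void);
        void (*get_smp_config)(unsigned int early);
};

static void default_find_smp_config(void)
{
        puts("scanning low memory for an MP table");
}

static void default_get_smp_config(unsigned int early)
{
        (void)early;
        puts("parsing the MP table");
}

static void noop(void) { }
static void uint_noop(unsigned int x) { (void)x; }

static struct mpparse_ops mpparse = {
        .find_smp_config = default_find_smp_config,
        .get_smp_config  = default_get_smp_config,
};

int main(void)
{
        /* a PV guest overrides the hooks, so the BIOS scan is never attempted */
        mpparse.find_smp_config = noop;
        mpparse.get_smp_config  = uint_noop;

        mpparse.find_smp_config();
        mpparse.get_smp_config(1);
        return 0;
}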
CC: stable@vger.kernel.org Fixes-Oracle-Bugzilla: https://bugzilla.oracle.com/bugzilla/show_bug.cgi?id=13665 Acked-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9642d4a..1fbe75a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1452,6 +1452,10 @@ asmlinkage void __init xen_start_kernel(void) pci_request_acs(); xen_acpi_sleep_register(); + + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; } #ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. */ -- cgit v1.1 From 8ea667f259e3767fd3ee85a885c14e417835695e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 13:44:53 +0300 Subject: KVM: MMU: Push clean gpte write protection out of gpte_access() gpte_access() computes the access permissions of a guest pte and also write-protects clean gptes. This is wrong when we are servicing a write fault (since we'll be setting the dirty bit momentarily) but correct when instantiating a speculative spte, or when servicing a read fault (since we'll want to trap a following write in order to set the dirty bit). It doesn't seem to hurt in practice, but in order to make the code readable, push the write protection out of gpte_access() and into a new protect_clean_gpte() which is called explicitly when needed. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 12 ++++++++++++ arch/x86/kvm/mmu.h | 3 ++- arch/x86/kvm/paging_tmpl.h | 24 ++++++++++++------------ 3 files changed, 26 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa0b469..54c9cb4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3408,6 +3408,18 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; } +static inline void protect_clean_gpte(unsigned *access, unsigned gpte) +{ + unsigned mask; + + BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); + + mask = (unsigned)~ACC_WRITE_MASK; + /* Allow write access to dirty gptes */ + mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK; + *access &= mask; +} + static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e374db9..2832081 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -18,7 +18,8 @@ #define PT_PCD_MASK (1ULL << 4) #define PT_ACCESSED_SHIFT 5 #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) -#define PT_DIRTY_MASK (1ULL << 6) +#define PT_DIRTY_SHIFT 6 +#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT) #define PT_PAGE_SIZE_MASK (1ULL << 7) #define PT_PAT_MASK (1ULL << 7) #define PT_GLOBAL_MASK (1ULL << 8) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf8c42b..bf7b4ff 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -101,14 +101,11 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, - bool last) +static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) { unsigned access; access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - if (last && !is_dirty_gpte(gpte)) - access &= 
~ACC_WRITE_MASK; #if PTTYPE == 64 if (vcpu->arch.mmu.nx) @@ -222,8 +219,7 @@ retry_walk: last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); if (last_gpte) { - pte_access = pt_access & - FNAME(gpte_access)(vcpu, pte, true); + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); /* check if the kernel is fetching from user page */ if (unlikely(pte_access & PT_USER_MASK) && kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) @@ -274,7 +270,7 @@ retry_walk: break; } - pt_access &= FNAME(gpte_access)(vcpu, pte, false); + pt_access &= FNAME(gpte_access)(vcpu, pte); --walker->level; } @@ -283,7 +279,9 @@ retry_walk: goto error; } - if (write_fault && unlikely(!is_dirty_gpte(pte))) { + if (!write_fault) + protect_clean_gpte(&pte_access, pte); + else if (unlikely(!is_dirty_gpte(pte))) { int ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); @@ -368,7 +366,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + protect_clean_gpte(&pte_access, gpte); pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); if (mmu_invalid_pfn(pfn)) return; @@ -441,8 +440,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) continue; - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, - true); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + protect_clean_gpte(&pte_access, gpte); gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, pte_access & ACC_WRITE_MASK); @@ -794,7 +793,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte, true); + pte_access &= FNAME(gpte_access)(vcpu, gpte); + protect_clean_gpte(&pte_access, gpte); if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) continue; -- cgit v1.1 From edc2ae84eb40a3c062210fe01af1cae1633cc810 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 13:53:08 +0300 Subject: KVM: MMU: Optimize gpte_access() slightly If nx is disabled, then is gpte[63] is set we will hit a reserved bit set fault before checking permissions; so we can ignore the setting of efer.nxe. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf7b4ff..064bcb3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -106,10 +106,8 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) unsigned access; access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - #if PTTYPE == 64 - if (vcpu->arch.mmu.nx) - access &= ~(gpte >> PT64_NX_SHIFT); + access &= ~(gpte >> PT64_NX_SHIFT); #endif return access; } -- cgit v1.1 From 3d34adec7081621ff51c195be045b87d75c0c49d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 14:03:28 +0300 Subject: KVM: MMU: Move gpte_access() out of paging_tmpl.h We no longer rely on paging_tmpl.h defines; so we can move the function to mmu.c. Rely on zero extension to 64 bits to get the correct nx behaviour. 
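A small numeric sketch of why the zero extension is sufficient (constants assumed to mirror KVM's ACC_* layout, with execute permission in bit 0; the helper name is hypothetical): a 32-bit gpte promoted to u64 always has bit 63 clear, so shifting it down and masking never strips execute access, while a 64-bit pte with NX set does.

#include <stdint.h>
#include <stdio.h>

#define ACC_EXEC_MASK   1u      /* assumed: execute permission in bit 0 */
#define ACC_WRITE_MASK  2u      /* == PT_WRITABLE_MASK                  */
#define ACC_USER_MASK   4u      /* == PT_USER_MASK                      */
#define PT64_NX_SHIFT   63

static unsigned gpte_access_demo(uint64_t gpte)
{
        unsigned access = (unsigned)(gpte & (ACC_WRITE_MASK | ACC_USER_MASK))
                          | ACC_EXEC_MASK;

        access &= ~(unsigned)(gpte >> PT64_NX_SHIFT);   /* clears exec only when NX is set */
        return access;
}

int main(void)
{
        printf("32-bit style pte 0x67  -> access %x\n",
               gpte_access_demo(0x67ull));                    /* 7: rwx kept   */
        printf("64-bit pte with NX set -> access %x\n",
               gpte_access_demo(0x8000000000000067ull));      /* 6: exec gone  */
        return 0;
}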
Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 ++++++++++ arch/x86/kvm/paging_tmpl.h | 21 +++++---------------- 2 files changed, 15 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 54c9cb4..f297a2c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3437,6 +3437,16 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, return false; } +static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned access; + + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + access &= ~(gpte >> PT64_NX_SHIFT); + + return access; +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 064bcb3..1cbf576 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -101,17 +101,6 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) -{ - unsigned access; - - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; -#if PTTYPE == 64 - access &= ~(gpte >> PT64_NX_SHIFT); -#endif - return access; -} - static bool FNAME(is_last_gpte)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t gpte) @@ -217,7 +206,7 @@ retry_walk: last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); if (last_gpte) { - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); + pte_access = pt_access & gpte_access(vcpu, pte); /* check if the kernel is fetching from user page */ if (unlikely(pte_access & PT_USER_MASK) && kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) @@ -268,7 +257,7 @@ retry_walk: break; } - pt_access &= FNAME(gpte_access)(vcpu, pte); + pt_access &= gpte_access(vcpu, pte); --walker->level; } @@ -364,7 +353,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); if (mmu_invalid_pfn(pfn)) @@ -438,7 +427,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) continue; - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, @@ -791,7 +780,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte); + pte_access &= gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) -- cgit v1.1 From 8cbc70696f149e44753b0fe60162b4ff96c2dd2b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 14:18:51 +0300 Subject: KVM: MMU: Update accessed and dirty bits after guest pagetable walk While unspecified, the behaviour of Intel processors is to first perform the page table walk, then, if the walk was successful, to atomically update the accessed and dirty bits of walked paging elements. 
While we are not required to follow this exactly, doing so will allow us to perform the access permissions check after the walk is complete, rather than after each walk step. (the tricky case is SMEP: a zero in any pte's U bit makes the referenced page a supervisor page, so we can't fault on a one bit during the walk itself). Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 76 ++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1cbf576..35a05dd 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -63,10 +63,12 @@ */ struct guest_walker { int level; + unsigned max_level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; + pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; gfn_t gfn; @@ -119,6 +121,43 @@ static bool FNAME(is_last_gpte)(struct guest_walker *walker, return false; } +static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, + struct guest_walker *walker, + int write_fault) +{ + unsigned level, index; + pt_element_t pte, orig_pte; + pt_element_t __user *ptep_user; + gfn_t table_gfn; + int ret; + + for (level = walker->max_level; level >= walker->level; --level) { + pte = orig_pte = walker->ptes[level - 1]; + table_gfn = walker->table_gfn[level - 1]; + ptep_user = walker->ptep_user[level - 1]; + index = offset_in_page(ptep_user) / sizeof(pt_element_t); + if (!(pte & PT_ACCESSED_MASK)) { + trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); + pte |= PT_ACCESSED_MASK; + } + if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { + trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); + pte |= PT_DIRTY_MASK; + } + if (pte == orig_pte) + continue; + + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); + if (ret) + return ret; + + mark_page_dirty(vcpu->kvm, table_gfn); + walker->ptes[level] = pte; + } + return 0; +} + /* * Fetch a guest pte for a guest virtual address */ @@ -126,6 +165,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gva_t addr, u32 access) { + int ret; pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; @@ -153,6 +193,7 @@ retry_walk: --walker->level; } #endif + walker->max_level = walker->level; ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); @@ -183,6 +224,7 @@ retry_walk: ptep_user = (pt_element_t __user *)((void *)host_addr + offset); if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) goto error; + walker->ptep_user[walker->level - 1] = ptep_user; trace_kvm_mmu_paging_element(pte, walker->level); @@ -214,21 +256,6 @@ retry_walk: eperm = true; } - if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { - int ret; - trace_kvm_mmu_set_accessed_bit(table_gfn, index, - sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, - pte, pte|PT_ACCESSED_MASK); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; - - mark_page_dirty(vcpu->kvm, table_gfn); - pte |= PT_ACCESSED_MASK; - } - walker->ptes[walker->level - 1] = pte; if (last_gpte) { @@ -268,21 +295,12 @@ retry_walk: if (!write_fault) protect_clean_gpte(&pte_access, pte); - else if 
(unlikely(!is_dirty_gpte(pte))) { - int ret; - trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, - pte, pte|PT_DIRTY_MASK); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; - - mark_page_dirty(vcpu->kvm, table_gfn); - pte |= PT_DIRTY_MASK; - walker->ptes[walker->level - 1] = pte; - } + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); + if (unlikely(ret < 0)) + goto error; + else if (ret) + goto retry_walk; walker->pt_access = pt_access; walker->pte_access = pte_access; -- cgit v1.1 From 97d64b788114be1c4dc4bfe7a8ba2bf9643fe6af Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 14:52:00 +0300 Subject: KVM: MMU: Optimize pte permission checks walk_addr_generic() permission checks are a maze of branchy code, which is performed four times per lookup. It depends on the type of access, efer.nxe, cr0.wp, cr4.smep, and in the near future, cr4.smap. Optimize this away by precalculating all variants and storing them in a bitmap. The bitmap is recalculated when rarely-changing variables change (cr0, cr4) and is indexed by the often-changing variables (page fault error code, pte access permissions). The permission check is moved to the end of the loop, otherwise an SMEP fault could be reported as a false positive, when PDE.U=1 but PTE.U=0. Noted by Xiao Guangrong. The result is short, branch-free code. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/mmu.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu.h | 19 ++++++++----------- arch/x86/kvm/paging_tmpl.h | 22 ++++------------------ arch/x86/kvm/x86.c | 11 ++++------- 5 files changed, 61 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 64adb61..3318bde 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -287,6 +287,13 @@ struct kvm_mmu { union kvm_mmu_page_role base_role; bool direct_map; + /* + * Bitmap; bit set = permission fault + * Byte index: page fault error code [4:1] + * Bit index: pte permissions in ACC_* format + */ + u8 permissions[16]; + u64 *pae_root; u64 *lm_root; u64 rsvd_bits_mask[2][4]; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f297a2c..9c61889 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3516,6 +3516,38 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, } } +static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +{ + unsigned bit, byte, pfec; + u8 map; + bool fault, x, w, u, wf, uf, ff, smep; + + smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { + pfec = byte << 1; + map = 0; + wf = pfec & PFERR_WRITE_MASK; + uf = pfec & PFERR_USER_MASK; + ff = pfec & PFERR_FETCH_MASK; + for (bit = 0; bit < 8; ++bit) { + x = bit & ACC_EXEC_MASK; + w = bit & ACC_WRITE_MASK; + u = bit & ACC_USER_MASK; + + /* Not really needed: !nx will cause pte.nx to fault */ + x |= !mmu->nx; + /* Allow supervisor writes if !cr0.wp */ + w |= !is_write_protection(vcpu) && !uf; + /* Disallow supervisor fetches of user code if cr4.smep */ + x &= !(smep && u && !uf); + + fault = (ff && !x) || (uf && !u) || (wf && !w); + map |= fault << bit; + } + mmu->permissions[byte] = map; + } +} + static int paging64_init_context_common(struct kvm_vcpu *vcpu, struct kvm_mmu *context, int level) @@ -3524,6 +3556,7 @@ static int 
paging64_init_context_common(struct kvm_vcpu *vcpu, context->root_level = level; reset_rsvds_bits_mask(vcpu, context); + update_permission_bitmask(vcpu, context); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -3552,6 +3585,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->root_level = PT32_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, context); + update_permission_bitmask(vcpu, context); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; @@ -3612,6 +3646,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; } + update_permission_bitmask(vcpu, context); + return 0; } @@ -3687,6 +3723,8 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } + update_permission_bitmask(vcpu, g_context); + return 0; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 2832081..5846607 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -89,17 +89,14 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } -static inline bool check_write_user_access(struct kvm_vcpu *vcpu, - bool write_fault, bool user_fault, - unsigned long pte) +/* + * Will a fault with a given page-fault error code (pfec) cause a permission + * fault with the given access (in ACC_* format)? + */ +static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, + unsigned pfec) { - if (unlikely(write_fault && !is_writable_pte(pte) - && (user_fault || is_write_protection(vcpu)))) - return false; - - if (unlikely(user_fault && !(pte & PT_USER_MASK))) - return false; - - return true; + return (mmu->permissions[pfec >> 1] >> pte_access) & 1; } + #endif diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 35a05dd..8f6c59f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -169,7 +169,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, uninitialized_var(pte_access); + unsigned index, pt_access, pte_access; gpa_t pte_gpa; bool eperm, last_gpte; int offset; @@ -237,24 +237,9 @@ retry_walk: goto error; } - if (!check_write_user_access(vcpu, write_fault, user_fault, - pte)) - eperm = true; - -#if PTTYPE == 64 - if (unlikely(fetch_fault && (pte & PT64_NX_MASK))) - eperm = true; -#endif + pte_access = pt_access & gpte_access(vcpu, pte); last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); - if (last_gpte) { - pte_access = pt_access & gpte_access(vcpu, pte); - /* check if the kernel is fetching from user page */ - if (unlikely(pte_access & PT_USER_MASK) && - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - if (fetch_fault && !user_fault) - eperm = true; - } walker->ptes[walker->level - 1] = pte; @@ -284,10 +269,11 @@ retry_walk: break; } - pt_access &= gpte_access(vcpu, pte); + pt_access &= pte_access; --walker->level; } + eperm |= permission_fault(mmu, pte_access, access); if (unlikely(eperm)) { errcode |= PFERR_PRESENT_MASK; goto error; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19047ea..497226e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3672,20 +3672,17 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0) + | (write ? PFERR_WRITE_MASK : 0); - if (vcpu_match_mmio_gva(vcpu, gva) && - check_write_user_access(vcpu, write, access, - vcpu->arch.access)) { + if (vcpu_match_mmio_gva(vcpu, gva) + && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); return 1; } - if (write) - access |= PFERR_WRITE_MASK; - *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); if (*gpa == UNMAPPED_GVA) -- cgit v1.1 From 13d22b6aebb000aeaf137862c6c0e0c4d138d798 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 15:12:09 +0300 Subject: KVM: MMU: Simplify walk_addr_generic() loop The page table walk is coded as an infinite loop, with a special case on the last pte. Code it as an ordinary loop with a termination condition on the last pte (large page or walk length exhausted), and put the last pte handling code after the loop where it belongs. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 60 +++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8f6c59f..1b4c14d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -171,12 +171,15 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, pte_access; gpa_t pte_gpa; - bool eperm, last_gpte; + bool eperm; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; const int fetch_fault = access & PFERR_FETCH_MASK; u16 errcode = 0; + gpa_t real_gpa; + gfn_t gfn; + u32 ac; trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: @@ -197,12 +200,16 @@ retry_walk: ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); - pt_access = ACC_ALL; + pt_access = pte_access = ACC_ALL; + ++walker->level; - for (;;) { + do { gfn_t real_gfn; unsigned long host_addr; + pt_access &= pte_access; + --walker->level; + index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); @@ -239,39 +246,8 @@ retry_walk: pte_access = pt_access & gpte_access(vcpu, pte); - last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); - walker->ptes[walker->level - 1] = pte; - - if (last_gpte) { - int lvl = walker->level; - gpa_t real_gpa; - gfn_t gfn; - u32 ac; - - gfn = gpte_to_gfn_lvl(pte, lvl); - gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; - - if (PTTYPE == 32 && - walker->level == PT_DIRECTORY_LEVEL && - is_cpuid_PSE36()) - gfn += pse36_gfn_delta(pte); - - ac = write_fault | fetch_fault | user_fault; - - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), - ac); - if (real_gpa == UNMAPPED_GVA) - return 0; - - walker->gfn = real_gpa >> PAGE_SHIFT; - - break; - } - - pt_access &= pte_access; - --walker->level; - } + } while (!FNAME(is_last_gpte)(walker, vcpu, mmu, pte)); eperm |= permission_fault(mmu, pte_access, access); if (unlikely(eperm)) { @@ -279,6 +255,20 @@ retry_walk: goto error; } + gfn = gpte_to_gfn_lvl(pte, walker->level); + gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; + + if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) + gfn += pse36_gfn_delta(pte); + + ac = write_fault | fetch_fault | user_fault; + + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), ac); + if (real_gpa == UNMAPPED_GVA) + return 
0; + + walker->gfn = real_gpa >> PAGE_SHIFT; + if (!write_fault) protect_clean_gpte(&pte_access, pte); -- cgit v1.1 From 6fd01b711bee96ce3356f7b6f370ab708e37504b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 20:46:56 +0300 Subject: KVM: MMU: Optimize is_last_gpte() Instead of branchy code depending on level, gpte.ps, and mmu configuration, prepare everything in a bitmap during mode changes and look it up during runtime. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/mmu.c | 31 +++++++++++++++++++++++++++++++ arch/x86/kvm/mmu.h | 3 ++- arch/x86/kvm/paging_tmpl.h | 20 +------------------- 4 files changed, 41 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3318bde..43aeb94 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -298,6 +298,13 @@ struct kvm_mmu { u64 *lm_root; u64 rsvd_bits_mask[2][4]; + /* + * Bitmap: bit set = last pte in walk + * index[0:1]: level (zero-based) + * index[2]: pte.ps + */ + u8 last_pte_bitmap; + bool nx; u64 pdptrs[4]; /* pae */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9c61889..d289fee 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3447,6 +3447,15 @@ static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) return access; } +static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) +{ + unsigned index; + + index = level - 1; + index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2); + return mmu->last_pte_bitmap & (1 << index); +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -3548,6 +3557,24 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu } } +static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +{ + u8 map; + unsigned level, root_level = mmu->root_level; + const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */ + + if (root_level == PT32E_ROOT_LEVEL) + --root_level; + /* PT_PAGE_TABLE_LEVEL always terminates */ + map = 1 | (1 << ps_set_index); + for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) { + if (level <= PT_PDPE_LEVEL + && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu))) + map |= 1 << (ps_set_index | (level - 1)); + } + mmu->last_pte_bitmap = map; +} + static int paging64_init_context_common(struct kvm_vcpu *vcpu, struct kvm_mmu *context, int level) @@ -3557,6 +3584,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context); + update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -3586,6 +3614,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context); + update_last_pte_bitmap(vcpu, context); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; @@ -3647,6 +3676,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context); + update_last_pte_bitmap(vcpu, context); return 0; } @@ -3724,6 +3754,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context); + update_last_pte_bitmap(vcpu, g_context); return 0; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5846607..6987108 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -20,7 +20,8 @@ 
#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) #define PT_DIRTY_SHIFT 6 #define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT) -#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAGE_SIZE_SHIFT 7 +#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT) #define PT_PAT_MASK (1ULL << 7) #define PT_GLOBAL_MASK (1ULL << 8) #define PT64_NX_SHIFT 63 diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1b4c14d..134ea7b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -103,24 +103,6 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } -static bool FNAME(is_last_gpte)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - pt_element_t gpte) -{ - if (walker->level == PT_PAGE_TABLE_LEVEL) - return true; - - if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && - (PTTYPE == 64 || is_pse(vcpu))) - return true; - - if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && - (mmu->root_level == PT64_ROOT_LEVEL)) - return true; - - return false; -} - static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -247,7 +229,7 @@ retry_walk: pte_access = pt_access & gpte_access(vcpu, pte); walker->ptes[walker->level - 1] = pte; - } while (!FNAME(is_last_gpte)(walker, vcpu, mmu, pte)); + } while (!is_last_gpte(mmu, walker->level, pte)); eperm |= permission_fault(mmu, pte_access, access); if (unlikely(eperm)) { -- cgit v1.1 From 71331a1da1e3a66d14bb3864f99e32d84ab5a76f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 14:49:15 +0300 Subject: KVM: MMU: Eliminate eperm temporary 'eperm' is no longer used in the walker loop, so we can eliminate it. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 134ea7b..95a64d1 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -153,7 +153,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, pte_access; gpa_t pte_gpa; - bool eperm; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; @@ -165,7 +164,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: - eperm = false; walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); @@ -231,8 +229,7 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - eperm |= permission_fault(mmu, pte_access, access); - if (unlikely(eperm)) { + if (unlikely(permission_fault(mmu, pte_access, access))) { errcode |= PFERR_PRESENT_MASK; goto error; } -- cgit v1.1 From b514c30f7729b66af481fde3c1225e832e339d5b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:03:02 +0300 Subject: KVM: MMU: Avoid access/dirty update loop if all is well Keep track of accessed/dirty bits; if they are all set, do not enter the accessed/dirty update loop. 
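A numeric sketch of the fold used in the diff below (standard x86 PTE bit positions, accessed = bit 5 and dirty = bit 6; the helper name and argument layout are hypothetical): on a write fault the last pte's dirty bit is shifted down one place so a single test against the accessed mask decides whether the update loop can be skipped.

#include <stdint.h>
#include <stdio.h>

#define PT_ACCESSED_SHIFT 5
#define PT_DIRTY_SHIFT    6
#define PT_ACCESSED_MASK  (1u << PT_ACCESSED_SHIFT)
#define PFERR_WRITE_MASK  2u

/* walked_and: AND of every pte seen on the walk; last_pte: the leaf pte */
static int need_ad_update(uint64_t walked_and, uint64_t last_pte, unsigned error_code)
{
        unsigned write_fault = error_code & PFERR_WRITE_MASK;
        unsigned shift = (write_fault >> 1) * (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
        uint64_t accessed_dirty = walked_and & PT_ACCESSED_MASK;

        accessed_dirty &= last_pte >> shift;    /* dirty bit folded onto the accessed bit */
        return accessed_dirty == 0;             /* 1: some A/D bit still clear, run the loop */
}

int main(void)
{
        uint64_t pte_ad = (1u << PT_ACCESSED_SHIFT) | (1u << PT_DIRTY_SHIFT);
        uint64_t pte_a  = (1u << PT_ACCESSED_SHIFT);

        printf("write fault, A+D set : %d (0 = skip update loop)\n",
               need_ad_update(pte_ad, pte_ad, PFERR_WRITE_MASK));
        printf("write fault, D clear : %d (1 = must set dirty)\n",
               need_ad_update(pte_a, pte_a, PFERR_WRITE_MASK));
        return 0;
}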
Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 95a64d1..810c1da 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -151,7 +151,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access; + unsigned index, pt_access, pte_access, accessed_dirty, shift; gpa_t pte_gpa; int offset; const int write_fault = access & PFERR_WRITE_MASK; @@ -180,6 +180,7 @@ retry_walk: ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); + accessed_dirty = PT_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; ++walker->level; @@ -224,6 +225,7 @@ retry_walk: goto error; } + accessed_dirty &= pte; pte_access = pt_access & gpte_access(vcpu, pte); walker->ptes[walker->level - 1] = pte; @@ -251,11 +253,23 @@ retry_walk: if (!write_fault) protect_clean_gpte(&pte_access, pte); - ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; + /* + * On a write fault, fold the dirty bit into accessed_dirty by shifting it one + * place right. + * + * On a read fault, do nothing. + */ + shift = write_fault >> ilog2(PFERR_WRITE_MASK); + shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT; + accessed_dirty &= pte >> shift; + + if (unlikely(!accessed_dirty)) { + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); + if (unlikely(ret < 0)) + goto error; + else if (ret) + goto retry_walk; + } walker->pt_access = pt_access; walker->pte_access = pte_access; -- cgit v1.1 From c5421519f30bd5ed77857a78de6dc8414385e602 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 19 Sep 2012 19:33:48 +0300 Subject: KVM: MMU: Eliminate pointless temporary 'ac' 'ac' essentially reconstructs the 'access' variable we already have, except for the PFERR_PRESENT_MASK and PFERR_RSVD_MASK. As these are not used by callees, just use 'access' directly. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 810c1da..714e2c0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -160,7 +160,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, u16 errcode = 0; gpa_t real_gpa; gfn_t gfn; - u32 ac; trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: @@ -242,9 +241,7 @@ retry_walk: if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - ac = write_fault | fetch_fault | user_fault; - - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), ac); + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access); if (real_gpa == UNMAPPED_GVA) return 0; -- cgit v1.1 From 1e08ec4a130e2745d96df169e67c58df98a07311 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 13 Sep 2012 17:19:24 +0300 Subject: KVM: optimize apic interrupt delivery Most interrupt are delivered to only one vcpu. Use pre-build tables to find interrupt destination instead of looping through all vcpus. In case of logical mode loop only through vcpus in a logical cluster irq is sent to. 
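A simplified sketch of the table-lookup idea (hypothetical types, physical mode only, no RCU or locking): the map is rebuilt whenever an APIC ID changes, and delivering a fixed-destination interrupt then costs one array index instead of a scan over every vcpu.

#include <stdio.h>
#include <string.h>

struct vcpu {
        unsigned char apic_id;
};

struct apic_map {
        struct vcpu *phys_map[256];     /* indexed by physical APIC ID */
};

static void recalculate_map(struct apic_map *map, struct vcpu *vcpus, int n)
{
        memset(map, 0, sizeof(*map));
        for (int i = 0; i < n; i++)
                map->phys_map[vcpus[i].apic_id] = &vcpus[i];
}

static struct vcpu *lookup_dest(struct apic_map *map, unsigned char dest_id)
{
        return map->phys_map[dest_id];  /* O(1) instead of looping over all vcpus */
}

int main(void)
{
        struct vcpu vcpus[4] = { {0}, {1}, {2}, {3} };
        struct apic_map map;

        recalculate_map(&map, vcpus, 4);
        printf("deliver to vcpu with APIC ID %d\n", lookup_dest(&map, 2)->apic_id);
        return 0;
}

The real implementation additionally keeps per-cluster logical-ID tables and publishes the rebuilt map with RCU so readers in the interrupt delivery path never take the rebuild lock.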
Signed-off-by: Gleb Natapov Acked-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 12 +++ arch/x86/kvm/lapic.c | 188 +++++++++++++++++++++++++++++++++++++--- arch/x86/kvm/lapic.h | 3 + arch/x86/kvm/x86.c | 2 + 4 files changed, 193 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 43aeb94..0b902c9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -525,6 +525,16 @@ struct kvm_arch_memory_slot { struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; +struct kvm_apic_map { + struct rcu_head rcu; + u8 ldr_bits; + /* fields bellow are used to decode ldr values in different modes */ + u32 cid_shift, cid_mask, lid_mask; + struct kvm_lapic *phys_map[256]; + /* first index is cluster id second is cpu id in a cluster */ + struct kvm_lapic *logical_map[16][16]; +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -542,6 +552,8 @@ struct kvm_arch { struct kvm_ioapic *vioapic; struct kvm_pit *vpit; int vapics_in_nmi_mode; + struct mutex apic_map_lock; + struct kvm_apic_map *apic_map; unsigned int tss_addr; struct page *apic_access_page; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6f9fd63..c6e6b72 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -140,11 +140,110 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) +static inline int apic_x2apic_mode(struct kvm_lapic *apic) +{ + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; +} + static inline int kvm_apic_id(struct kvm_lapic *apic) { return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } +static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) +{ + u16 cid; + ldr >>= 32 - map->ldr_bits; + cid = (ldr >> map->cid_shift) & map->cid_mask; + + BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); + + return cid; +} + +static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) +{ + ldr >>= (32 - map->ldr_bits); + return ldr & map->lid_mask; +} + +static void recalculate_apic_map(struct kvm *kvm) +{ + struct kvm_apic_map *new, *old = NULL; + struct kvm_vcpu *vcpu; + int i; + + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL); + + mutex_lock(&kvm->arch.apic_map_lock); + + if (!new) + goto out; + + new->ldr_bits = 8; + /* flat mode is default */ + new->cid_shift = 8; + new->cid_mask = 0; + new->lid_mask = 0xff; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvm_lapic *apic = vcpu->arch.apic; + u16 cid, lid; + u32 ldr; + + if (!kvm_apic_present(vcpu)) + continue; + + /* + * All APICs have to be configured in the same mode by an OS. + * We take advatage of this while building logical id loockup + * table. After reset APICs are in xapic/flat mode, so if we + * find apic with different setting we assume this is the mode + * OS wants all apics to be in; build lookup table accordingly. 
+ */ + if (apic_x2apic_mode(apic)) { + new->ldr_bits = 32; + new->cid_shift = 16; + new->cid_mask = new->lid_mask = 0xffff; + } else if (kvm_apic_sw_enabled(apic) && + !new->cid_mask /* flat mode */ && + kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) { + new->cid_shift = 4; + new->cid_mask = 0xf; + new->lid_mask = 0xf; + } + + new->phys_map[kvm_apic_id(apic)] = apic; + + ldr = kvm_apic_get_reg(apic, APIC_LDR); + cid = apic_cluster_id(new, ldr); + lid = apic_logical_id(new, ldr); + + if (lid) + new->logical_map[cid][ffs(lid) - 1] = apic; + } +out: + old = rcu_dereference_protected(kvm->arch.apic_map, + lockdep_is_held(&kvm->arch.apic_map_lock)); + rcu_assign_pointer(kvm->arch.apic_map, new); + mutex_unlock(&kvm->arch.apic_map_lock); + + if (old) + kfree_rcu(old, rcu); +} + +static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) +{ + apic_set_reg(apic, APIC_ID, id << 24); + recalculate_apic_map(apic->vcpu->kvm); +} + +static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) +{ + apic_set_reg(apic, APIC_LDR, id); + recalculate_apic_map(apic->vcpu->kvm); +} + static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); @@ -194,11 +293,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_LVR, v); } -static inline int apic_x2apic_mode(struct kvm_lapic *apic) -{ - return apic->vcpu->arch.apic_base & X2APIC_ENABLE; -} - static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ @@ -483,6 +577,72 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, return result; } +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, int *r) +{ + struct kvm_apic_map *map; + unsigned long bitmap = 1; + struct kvm_lapic **dst; + int i; + bool ret = false; + + *r = -1; + + if (irq->shorthand == APIC_DEST_SELF) { + *r = kvm_apic_set_irq(src->vcpu, irq); + return true; + } + + if (irq->shorthand) + return false; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (!map) + goto out; + + if (irq->dest_mode == 0) { /* physical mode */ + if (irq->delivery_mode == APIC_DM_LOWEST || + irq->dest_id == 0xff) + goto out; + dst = &map->phys_map[irq->dest_id & 0xff]; + } else { + u32 mda = irq->dest_id << (32 - map->ldr_bits); + + dst = map->logical_map[apic_cluster_id(map, mda)]; + + bitmap = apic_logical_id(map, mda); + + if (irq->delivery_mode == APIC_DM_LOWEST) { + int l = -1; + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (l < 0) + l = i; + else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0) + l = i; + } + + bitmap = (l >= 0) ? 1 << l : 0; + } + } + + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (*r < 0) + *r = 0; + *r += kvm_apic_set_irq(dst[i]->vcpu, irq); + } + + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. 
@@ -886,7 +1046,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) - apic_set_reg(apic, APIC_ID, val); + kvm_apic_set_id(apic, val >> 24); else ret = 1; break; @@ -902,15 +1062,16 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LDR: if (!apic_x2apic_mode(apic)) - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + kvm_apic_set_ldr(apic, val & APIC_LDR_MASK); else ret = 1; break; case APIC_DFR: - if (!apic_x2apic_mode(apic)) + if (!apic_x2apic_mode(apic)) { apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - else + recalculate_apic_map(apic->vcpu->kvm); + } else ret = 1; break; @@ -1141,6 +1302,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) static_key_slow_dec_deferred(&apic_hw_disabled); else static_key_slow_inc(&apic_hw_disabled.key); + recalculate_apic_map(vcpu->kvm); } if (!kvm_vcpu_is_bsp(apic->vcpu)) @@ -1150,7 +1312,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (apic_x2apic_mode(apic)) { u32 id = kvm_apic_id(apic); u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); - apic_set_reg(apic, APIC_LDR, ldr); + kvm_apic_set_ldr(apic, ldr); } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -1175,7 +1337,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); + kvm_apic_set_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); for (i = 0; i < APIC_LVT_NUM; i++) @@ -1186,7 +1348,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_DFR, 0xffffffffU); apic_set_spiv(apic, 0xff); apic_set_reg(apic, APIC_TASKPRI, 0); - apic_set_reg(apic, APIC_LDR, 0); + kvm_apic_set_ldr(apic, 0); apic_set_reg(apic, APIC_ESR, 0); apic_set_reg(apic, APIC_ICR, 0); apic_set_reg(apic, APIC_ICR2, 0); @@ -1404,6 +1566,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); + /* call kvm_apic_set_id() to put apic into apic_map */ + kvm_apic_set_id(apic, kvm_apic_id(apic)); kvm_apic_set_version(vcpu); apic_update_ppr(apic); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 615a8b0..e5ebf9f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -52,6 +52,9 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, int *r); + u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 497226e..fc2a0a1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6270,6 +6270,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); raw_spin_lock_init(&kvm->arch.tsc_write_lock); + mutex_init(&kvm->arch.apic_map_lock); return 0; } @@ -6322,6 +6323,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) 
put_page(kvm->arch.ept_identity_pagetable); + kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); } void kvm_arch_free_memslot(struct kvm_memory_slot *free, -- cgit v1.1 From 50a011f6409e888d5f41343024d24885281f048c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 20 Sep 2012 14:43:54 +0200 Subject: kprobes/x86: Move skip_singlestep up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I get this warning: arch/x86/kernel/kprobes.c:544:23: warning: ‘skip_singlestep’ declared ‘static’ but never defined on tip/auto-latest. Put the skip_singlestep function declaration up, in KPROBES_CAN_USE_FTRACE and drop the superfluous forward declaration. Signed-off-by: Borislav Petkov Acked-by: Masami Hiramatsu Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1348145034-16603-1-git-send-email-bp@amd64.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index b7c2a85..57916c0 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -541,8 +541,23 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb return 1; } +#ifdef KPROBES_CAN_USE_FTRACE static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb); + struct kprobe_ctlblk *kcb) +{ + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + __this_cpu_write(current_kprobe, NULL); +} +#endif + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. @@ -1061,21 +1076,6 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } #ifdef KPROBES_CAN_USE_FTRACE -static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; - if (unlikely(p->post_handler)) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - __this_cpu_write(current_kprobe, NULL); -} - /* Ftrace callback handler for kprobes */ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) -- cgit v1.1 From 24cc7fb69a5b5edfdff1d38c6a213d6a33648829 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 20 Sep 2012 10:28:45 -0400 Subject: x86/kbuild: archscripts depends on scripts_basic While building the SUSE kernel packages, which build the scripts, make clean, and then build everything, we have been running into spurious build failures. We tracked them down to a simple dependency issue: $ make mrproper CLEAN arch/x86/tools CLEAN scripts/basic $ cp patches/config/x86_64/desktop .config $ make archscripts HOSTCC arch/x86/tools/relocs /bin/sh: scripts/basic/fixdep: No such file or directory make[3]: *** [arch/x86/tools/relocs] Error 1 make[2]: *** [archscripts] Error 2 make[1]: *** [sub-make] Error 2 make: *** [all] Error 2 This was introduced by commit 6520fe55 (x86, realmode: 16-bit real-mode code support for relocs), which added the archscripts dependency to archprepare. 
This patch adds the scripts_basic dependency to the x86 archscripts. Signed-off-by: Jeff Mahoney Signed-off-by: Michal Marek --- arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b0c5276..c098ca4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -138,7 +138,7 @@ KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) -archscripts: +archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs ### -- cgit v1.1 From 26bf264e871a4b9a8ac09c21a2b518e7f23830d5 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 17 Sep 2012 16:31:13 +0800 Subject: KVM: x86: Export svm/vmx exit code and vector code to userspace Exporting KVM exit information to userspace to be consumed by perf. Signed-off-by: Dong Hao [ Dong Hao : rebase it on acme's git tree ] Signed-off-by: Xiao Guangrong Acked-by: Marcelo Tosatti Cc: Avi Kivity Cc: David Ahern Cc: Ingo Molnar Cc: Marcelo Tosatti Cc: kvm@vger.kernel.org Cc: Runzhen Wang Link: http://lkml.kernel.org/r/1347870675-31495-2-git-send-email-haodong@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/include/asm/kvm.h | 16 ++++ arch/x86/include/asm/kvm_host.h | 16 ---- arch/x86/include/asm/svm.h | 205 +++++++++++++++++++++++++--------------- arch/x86/include/asm/vmx.h | 127 ++++++++++++++++--------- arch/x86/kvm/trace.h | 89 ----------------- 5 files changed, 230 insertions(+), 223 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 246617e..41e08cb 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -9,6 +9,22 @@ #include #include +#define DE_VECTOR 0 +#define DB_VECTOR 1 +#define BP_VECTOR 3 +#define OF_VECTOR 4 +#define BR_VECTOR 5 +#define UD_VECTOR 6 +#define NM_VECTOR 7 +#define DF_VECTOR 8 +#define TS_VECTOR 10 +#define NP_VECTOR 11 +#define SS_VECTOR 12 +#define GP_VECTOR 13 +#define PF_VECTOR 14 +#define MF_VECTOR 16 +#define MC_VECTOR 18 + /* Select x86 specific features in */ #define __KVM_HAVE_PIT #define __KVM_HAVE_IOAPIC diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 09155d6..1eaa6b0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -75,22 +75,6 @@ #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) -#define DE_VECTOR 0 -#define DB_VECTOR 1 -#define BP_VECTOR 3 -#define OF_VECTOR 4 -#define BR_VECTOR 5 -#define UD_VECTOR 6 -#define NM_VECTOR 7 -#define DF_VECTOR 8 -#define TS_VECTOR 10 -#define NP_VECTOR 11 -#define SS_VECTOR 12 -#define GP_VECTOR 13 -#define PF_VECTOR 14 -#define MF_VECTOR 16 -#define MC_VECTOR 18 - #define SELECTOR_TI_MASK (1 << 2) #define SELECTOR_RPL_MASK 0x03 diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f2b83bc..cdf5674 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -1,6 +1,135 @@ #ifndef __SVM_H #define __SVM_H +#define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR3 0x003 +#define SVM_EXIT_READ_CR4 0x004 +#define SVM_EXIT_READ_CR8 0x008 +#define SVM_EXIT_WRITE_CR0 0x010 +#define SVM_EXIT_WRITE_CR3 0x013 +#define SVM_EXIT_WRITE_CR4 0x014 +#define SVM_EXIT_WRITE_CR8 0x018 +#define SVM_EXIT_READ_DR0 0x020 +#define SVM_EXIT_READ_DR1 0x021 +#define SVM_EXIT_READ_DR2 0x022 +#define SVM_EXIT_READ_DR3 0x023 +#define SVM_EXIT_READ_DR4 0x024 +#define SVM_EXIT_READ_DR5 0x025 
+#define SVM_EXIT_READ_DR6 0x026 +#define SVM_EXIT_READ_DR7 0x027 +#define SVM_EXIT_WRITE_DR0 0x030 +#define SVM_EXIT_WRITE_DR1 0x031 +#define SVM_EXIT_WRITE_DR2 0x032 +#define SVM_EXIT_WRITE_DR3 0x033 +#define SVM_EXIT_WRITE_DR4 0x034 +#define SVM_EXIT_WRITE_DR5 0x035 +#define SVM_EXIT_WRITE_DR6 0x036 +#define SVM_EXIT_WRITE_DR7 0x037 +#define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_INTR 0x060 +#define SVM_EXIT_NMI 0x061 +#define SVM_EXIT_SMI 0x062 +#define SVM_EXIT_INIT 0x063 +#define SVM_EXIT_VINTR 0x064 +#define SVM_EXIT_CR0_SEL_WRITE 0x065 +#define SVM_EXIT_IDTR_READ 0x066 +#define SVM_EXIT_GDTR_READ 0x067 +#define SVM_EXIT_LDTR_READ 0x068 +#define SVM_EXIT_TR_READ 0x069 +#define SVM_EXIT_IDTR_WRITE 0x06a +#define SVM_EXIT_GDTR_WRITE 0x06b +#define SVM_EXIT_LDTR_WRITE 0x06c +#define SVM_EXIT_TR_WRITE 0x06d +#define SVM_EXIT_RDTSC 0x06e +#define SVM_EXIT_RDPMC 0x06f +#define SVM_EXIT_PUSHF 0x070 +#define SVM_EXIT_POPF 0x071 +#define SVM_EXIT_CPUID 0x072 +#define SVM_EXIT_RSM 0x073 +#define SVM_EXIT_IRET 0x074 +#define SVM_EXIT_SWINT 0x075 +#define SVM_EXIT_INVD 0x076 +#define SVM_EXIT_PAUSE 0x077 +#define SVM_EXIT_HLT 0x078 +#define SVM_EXIT_INVLPG 0x079 +#define SVM_EXIT_INVLPGA 0x07a +#define SVM_EXIT_IOIO 0x07b +#define SVM_EXIT_MSR 0x07c +#define SVM_EXIT_TASK_SWITCH 0x07d +#define SVM_EXIT_FERR_FREEZE 0x07e +#define SVM_EXIT_SHUTDOWN 0x07f +#define SVM_EXIT_VMRUN 0x080 +#define SVM_EXIT_VMMCALL 0x081 +#define SVM_EXIT_VMLOAD 0x082 +#define SVM_EXIT_VMSAVE 0x083 +#define SVM_EXIT_STGI 0x084 +#define SVM_EXIT_CLGI 0x085 +#define SVM_EXIT_SKINIT 0x086 +#define SVM_EXIT_RDTSCP 0x087 +#define SVM_EXIT_ICEBP 0x088 +#define SVM_EXIT_WBINVD 0x089 +#define SVM_EXIT_MONITOR 0x08a +#define SVM_EXIT_MWAIT 0x08b +#define SVM_EXIT_MWAIT_COND 0x08c +#define SVM_EXIT_XSETBV 0x08d +#define SVM_EXIT_NPF 0x400 + +#define SVM_EXIT_ERR -1 + +#define SVM_EXIT_REASONS \ + { SVM_EXIT_READ_CR0, "read_cr0" }, \ + { SVM_EXIT_READ_CR3, "read_cr3" }, \ + { SVM_EXIT_READ_CR4, "read_cr4" }, \ + { SVM_EXIT_READ_CR8, "read_cr8" }, \ + { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ + { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ + { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ + { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ + { SVM_EXIT_READ_DR0, "read_dr0" }, \ + { SVM_EXIT_READ_DR1, "read_dr1" }, \ + { SVM_EXIT_READ_DR2, "read_dr2" }, \ + { SVM_EXIT_READ_DR3, "read_dr3" }, \ + { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ + { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ + { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ + { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ + { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ + { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ + { SVM_EXIT_INTR, "interrupt" }, \ + { SVM_EXIT_NMI, "nmi" }, \ + { SVM_EXIT_SMI, "smi" }, \ + { SVM_EXIT_INIT, "init" }, \ + { SVM_EXIT_VINTR, "vintr" }, \ + { SVM_EXIT_CPUID, "cpuid" }, \ + { SVM_EXIT_INVD, "invd" }, \ + { SVM_EXIT_HLT, "hlt" }, \ + { SVM_EXIT_INVLPG, "invlpg" }, \ + { SVM_EXIT_INVLPGA, "invlpga" }, \ + { SVM_EXIT_IOIO, "io" }, \ + { SVM_EXIT_MSR, "msr" }, \ + { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ + { SVM_EXIT_SHUTDOWN, "shutdown" }, \ + { SVM_EXIT_VMRUN, "vmrun" }, \ + { SVM_EXIT_VMMCALL, "hypercall" }, \ + { SVM_EXIT_VMLOAD, "vmload" }, \ + { SVM_EXIT_VMSAVE, "vmsave" }, \ + { SVM_EXIT_STGI, "stgi" 
}, \ + { SVM_EXIT_CLGI, "clgi" }, \ + { SVM_EXIT_SKINIT, "skinit" }, \ + { SVM_EXIT_WBINVD, "wbinvd" }, \ + { SVM_EXIT_MONITOR, "monitor" }, \ + { SVM_EXIT_MWAIT, "mwait" }, \ + { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_NPF, "npf" } + +#ifdef __KERNEL__ + enum { INTERCEPT_INTR, INTERCEPT_NMI, @@ -264,81 +393,6 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXITINFO_REG_MASK 0x0F -#define SVM_EXIT_READ_CR0 0x000 -#define SVM_EXIT_READ_CR3 0x003 -#define SVM_EXIT_READ_CR4 0x004 -#define SVM_EXIT_READ_CR8 0x008 -#define SVM_EXIT_WRITE_CR0 0x010 -#define SVM_EXIT_WRITE_CR3 0x013 -#define SVM_EXIT_WRITE_CR4 0x014 -#define SVM_EXIT_WRITE_CR8 0x018 -#define SVM_EXIT_READ_DR0 0x020 -#define SVM_EXIT_READ_DR1 0x021 -#define SVM_EXIT_READ_DR2 0x022 -#define SVM_EXIT_READ_DR3 0x023 -#define SVM_EXIT_READ_DR4 0x024 -#define SVM_EXIT_READ_DR5 0x025 -#define SVM_EXIT_READ_DR6 0x026 -#define SVM_EXIT_READ_DR7 0x027 -#define SVM_EXIT_WRITE_DR0 0x030 -#define SVM_EXIT_WRITE_DR1 0x031 -#define SVM_EXIT_WRITE_DR2 0x032 -#define SVM_EXIT_WRITE_DR3 0x033 -#define SVM_EXIT_WRITE_DR4 0x034 -#define SVM_EXIT_WRITE_DR5 0x035 -#define SVM_EXIT_WRITE_DR6 0x036 -#define SVM_EXIT_WRITE_DR7 0x037 -#define SVM_EXIT_EXCP_BASE 0x040 -#define SVM_EXIT_INTR 0x060 -#define SVM_EXIT_NMI 0x061 -#define SVM_EXIT_SMI 0x062 -#define SVM_EXIT_INIT 0x063 -#define SVM_EXIT_VINTR 0x064 -#define SVM_EXIT_CR0_SEL_WRITE 0x065 -#define SVM_EXIT_IDTR_READ 0x066 -#define SVM_EXIT_GDTR_READ 0x067 -#define SVM_EXIT_LDTR_READ 0x068 -#define SVM_EXIT_TR_READ 0x069 -#define SVM_EXIT_IDTR_WRITE 0x06a -#define SVM_EXIT_GDTR_WRITE 0x06b -#define SVM_EXIT_LDTR_WRITE 0x06c -#define SVM_EXIT_TR_WRITE 0x06d -#define SVM_EXIT_RDTSC 0x06e -#define SVM_EXIT_RDPMC 0x06f -#define SVM_EXIT_PUSHF 0x070 -#define SVM_EXIT_POPF 0x071 -#define SVM_EXIT_CPUID 0x072 -#define SVM_EXIT_RSM 0x073 -#define SVM_EXIT_IRET 0x074 -#define SVM_EXIT_SWINT 0x075 -#define SVM_EXIT_INVD 0x076 -#define SVM_EXIT_PAUSE 0x077 -#define SVM_EXIT_HLT 0x078 -#define SVM_EXIT_INVLPG 0x079 -#define SVM_EXIT_INVLPGA 0x07a -#define SVM_EXIT_IOIO 0x07b -#define SVM_EXIT_MSR 0x07c -#define SVM_EXIT_TASK_SWITCH 0x07d -#define SVM_EXIT_FERR_FREEZE 0x07e -#define SVM_EXIT_SHUTDOWN 0x07f -#define SVM_EXIT_VMRUN 0x080 -#define SVM_EXIT_VMMCALL 0x081 -#define SVM_EXIT_VMLOAD 0x082 -#define SVM_EXIT_VMSAVE 0x083 -#define SVM_EXIT_STGI 0x084 -#define SVM_EXIT_CLGI 0x085 -#define SVM_EXIT_SKINIT 0x086 -#define SVM_EXIT_RDTSCP 0x087 -#define SVM_EXIT_ICEBP 0x088 -#define SVM_EXIT_WBINVD 0x089 -#define SVM_EXIT_MONITOR 0x08a -#define SVM_EXIT_MWAIT 0x08b -#define SVM_EXIT_MWAIT_COND 0x08c -#define SVM_EXIT_XSETBV 0x08d -#define SVM_EXIT_NPF 0x400 - -#define SVM_EXIT_ERR -1 - #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" @@ -350,3 +404,4 @@ struct __attribute__ ((__packed__)) vmcb { #endif +#endif diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 74fcb96..36ec21c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -25,6 +25,88 @@ * */ +#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 + +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 +#define EXIT_REASON_TRIPLE_FAULT 2 + +#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 
+#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMOFF 26 +#define EXIT_REASON_VMON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 +#define EXIT_REASON_MCE_DURING_VMENTRY 41 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EPT_VIOLATION 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_INVPCID 58 + +#define VMX_EXIT_REASONS \ + { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ + { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ + { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ + { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ + { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ + { EXIT_REASON_CPUID, "CPUID" }, \ + { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVLPG, "INVLPG" }, \ + { EXIT_REASON_RDPMC, "RDPMC" }, \ + { EXIT_REASON_RDTSC, "RDTSC" }, \ + { EXIT_REASON_VMCALL, "VMCALL" }, \ + { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ + { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ + { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ + { EXIT_REASON_VMPTRST, "VMPTRST" }, \ + { EXIT_REASON_VMREAD, "VMREAD" }, \ + { EXIT_REASON_VMRESUME, "VMRESUME" }, \ + { EXIT_REASON_VMWRITE, "VMWRITE" }, \ + { EXIT_REASON_VMOFF, "VMOFF" }, \ + { EXIT_REASON_VMON, "VMON" }, \ + { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ + { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ + { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ + { EXIT_REASON_MSR_READ, "MSR_READ" }, \ + { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ + { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ + { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ + { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ + { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ + { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_WBINVD, "WBINVD" } + +#ifdef __KERNEL__ + #include /* @@ -241,49 +323,6 @@ enum vmcs_field { HOST_RIP = 0x00006c16, }; -#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 - -#define EXIT_REASON_EXCEPTION_NMI 0 -#define EXIT_REASON_EXTERNAL_INTERRUPT 1 -#define EXIT_REASON_TRIPLE_FAULT 2 - -#define EXIT_REASON_PENDING_INTERRUPT 7 -#define EXIT_REASON_NMI_WINDOW 8 -#define EXIT_REASON_TASK_SWITCH 9 -#define EXIT_REASON_CPUID 10 -#define EXIT_REASON_HLT 12 -#define EXIT_REASON_INVD 13 -#define EXIT_REASON_INVLPG 14 -#define EXIT_REASON_RDPMC 15 -#define EXIT_REASON_RDTSC 16 -#define EXIT_REASON_VMCALL 18 -#define EXIT_REASON_VMCLEAR 19 -#define EXIT_REASON_VMLAUNCH 20 -#define EXIT_REASON_VMPTRLD 21 -#define EXIT_REASON_VMPTRST 22 -#define EXIT_REASON_VMREAD 23 -#define EXIT_REASON_VMRESUME 24 -#define EXIT_REASON_VMWRITE 25 -#define EXIT_REASON_VMOFF 26 -#define EXIT_REASON_VMON 27 -#define EXIT_REASON_CR_ACCESS 28 -#define EXIT_REASON_DR_ACCESS 29 -#define 
EXIT_REASON_IO_INSTRUCTION 30 -#define EXIT_REASON_MSR_READ 31 -#define EXIT_REASON_MSR_WRITE 32 -#define EXIT_REASON_INVALID_STATE 33 -#define EXIT_REASON_MWAIT_INSTRUCTION 36 -#define EXIT_REASON_MONITOR_INSTRUCTION 39 -#define EXIT_REASON_PAUSE_INSTRUCTION 40 -#define EXIT_REASON_MCE_DURING_VMENTRY 41 -#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 -#define EXIT_REASON_APIC_ACCESS 44 -#define EXIT_REASON_EPT_VIOLATION 48 -#define EXIT_REASON_EPT_MISCONFIG 49 -#define EXIT_REASON_WBINVD 54 -#define EXIT_REASON_XSETBV 55 -#define EXIT_REASON_INVPCID 58 - /* * Interruption-information format */ @@ -488,3 +527,5 @@ enum vm_instruction_error_number { }; #endif + +#endif diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a71faf7..bca63f0 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -183,95 +183,6 @@ TRACE_EVENT(kvm_apic, #define KVM_ISA_VMX 1 #define KVM_ISA_SVM 2 -#define VMX_EXIT_REASONS \ - { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ - { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ - { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ - { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ - { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ - { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ - { EXIT_REASON_CPUID, "CPUID" }, \ - { EXIT_REASON_HLT, "HLT" }, \ - { EXIT_REASON_INVLPG, "INVLPG" }, \ - { EXIT_REASON_RDPMC, "RDPMC" }, \ - { EXIT_REASON_RDTSC, "RDTSC" }, \ - { EXIT_REASON_VMCALL, "VMCALL" }, \ - { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ - { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ - { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ - { EXIT_REASON_VMPTRST, "VMPTRST" }, \ - { EXIT_REASON_VMREAD, "VMREAD" }, \ - { EXIT_REASON_VMRESUME, "VMRESUME" }, \ - { EXIT_REASON_VMWRITE, "VMWRITE" }, \ - { EXIT_REASON_VMOFF, "VMOFF" }, \ - { EXIT_REASON_VMON, "VMON" }, \ - { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ - { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ - { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ - { EXIT_REASON_MSR_READ, "MSR_READ" }, \ - { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ - { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ - { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ - { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ - { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ - { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ - { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ - { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ - { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ - { EXIT_REASON_WBINVD, "WBINVD" } - -#define SVM_EXIT_REASONS \ - { SVM_EXIT_READ_CR0, "read_cr0" }, \ - { SVM_EXIT_READ_CR3, "read_cr3" }, \ - { SVM_EXIT_READ_CR4, "read_cr4" }, \ - { SVM_EXIT_READ_CR8, "read_cr8" }, \ - { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ - { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ - { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ - { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ - { SVM_EXIT_READ_DR0, "read_dr0" }, \ - { SVM_EXIT_READ_DR1, "read_dr1" }, \ - { SVM_EXIT_READ_DR2, "read_dr2" }, \ - { SVM_EXIT_READ_DR3, "read_dr3" }, \ - { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ - { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ - { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ - { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ - { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ - { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ - { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ - { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ - { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ - { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ - { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ - { 
SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ - { SVM_EXIT_INTR, "interrupt" }, \ - { SVM_EXIT_NMI, "nmi" }, \ - { SVM_EXIT_SMI, "smi" }, \ - { SVM_EXIT_INIT, "init" }, \ - { SVM_EXIT_VINTR, "vintr" }, \ - { SVM_EXIT_CPUID, "cpuid" }, \ - { SVM_EXIT_INVD, "invd" }, \ - { SVM_EXIT_HLT, "hlt" }, \ - { SVM_EXIT_INVLPG, "invlpg" }, \ - { SVM_EXIT_INVLPGA, "invlpga" }, \ - { SVM_EXIT_IOIO, "io" }, \ - { SVM_EXIT_MSR, "msr" }, \ - { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ - { SVM_EXIT_SHUTDOWN, "shutdown" }, \ - { SVM_EXIT_VMRUN, "vmrun" }, \ - { SVM_EXIT_VMMCALL, "hypercall" }, \ - { SVM_EXIT_VMLOAD, "vmload" }, \ - { SVM_EXIT_VMSAVE, "vmsave" }, \ - { SVM_EXIT_STGI, "stgi" }, \ - { SVM_EXIT_CLGI, "clgi" }, \ - { SVM_EXIT_SKINIT, "skinit" }, \ - { SVM_EXIT_WBINVD, "wbinvd" }, \ - { SVM_EXIT_MONITOR, "monitor" }, \ - { SVM_EXIT_MWAIT, "mwait" }, \ - { SVM_EXIT_XSETBV, "xsetbv" }, \ - { SVM_EXIT_NPF, "npf" } - /* * Tracepoint for kvm guest exit: */ -- cgit v1.1 From 8bd753be7a96443dd5111cb07ed5907a3787c978 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Sep 2012 12:43:06 -0700 Subject: x86-32, mm: The WP test should be done on a kernel page PAGE_READONLY includes user permission, but this is a page used exclusively by the kernel; use PAGE_KERNEL_RO instead. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1348256595-29119-3-git-send-email-hpa@linux.intel.com --- arch/x86/mm/init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 575d86f..e537b35 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -712,7 +712,7 @@ static void __init test_wp_bit(void) "Checking if this processor honours the WP bit even in supervisor mode..."); /* Any page-aligned address will do, the test is non-destructive */ - __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO); boot_cpu_data.wp_works_ok = do_test_wp_bit(); clear_fixmap(FIX_WP_TEST); -- cgit v1.1 From 85fdf05cc395f23384cb0adb22765cbaa9653b54 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Sep 2012 12:43:07 -0700 Subject: x86, smap: Add CR4 bit for SMAP Add X86_CR4_SMAP to . Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1348256595-29119-4-git-send-email-hpa@linux.intel.com --- arch/x86/include/asm/processor-flags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index aea1d1d..680cf09 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -65,6 +65,7 @@ #define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ +#define X86_CR4_SMAP 0x00200000 /* enable SMAP support */ /* * x86-64 Task Priority Register, CR8 -- cgit v1.1 From 9cebed423c84a56b871327dd77e555d1d2186a6b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Sep 2012 12:43:08 -0700 Subject: x86, alternative: Use .pushsection/.popsection .section/.previous doesn't nest. Use .pushsection/.popsection in so that they can be properly nested. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1348256595-29119-5-git-send-email-hpa@linux.intel.com --- arch/x86/include/asm/alternative-asm.h | 4 ++-- arch/x86/include/asm/alternative.h | 32 ++++++++++++++++---------------- 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 952bd01..018d29f 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -5,10 +5,10 @@ #ifdef CONFIG_SMP .macro LOCK_PREFIX 672: lock - .section .smp_locks,"a" + .pushsection .smp_locks,"a" .balign 4 .long 672b - . - .previous + .popsection .endm #else .macro LOCK_PREFIX diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 7078068..87bc00d 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -29,10 +29,10 @@ #ifdef CONFIG_SMP #define LOCK_PREFIX_HERE \ - ".section .smp_locks,\"a\"\n" \ - ".balign 4\n" \ - ".long 671f - .\n" /* offset */ \ - ".previous\n" \ + ".pushsection .smp_locks,\"a\"\n" \ + ".balign 4\n" \ + ".long 671f - .\n" /* offset */ \ + ".popsection\n" \ "671:" #define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " @@ -99,30 +99,30 @@ static inline int alternatives_text_reserved(void *start, void *end) /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ OLDINSTR(oldinstr) \ - ".section .altinstructions,\"a\"\n" \ + ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature, 1) \ - ".previous\n" \ - ".section .discard,\"aw\",@progbits\n" \ + ".popsection\n" \ + ".pushsection .discard,\"aw\",@progbits\n" \ DISCARD_ENTRY(1) \ - ".previous\n" \ - ".section .altinstr_replacement, \"ax\"\n" \ + ".popsection\n" \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ - ".previous" + ".popsection" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ OLDINSTR(oldinstr) \ - ".section .altinstructions,\"a\"\n" \ + ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature1, 1) \ ALTINSTR_ENTRY(feature2, 2) \ - ".previous\n" \ - ".section .discard,\"aw\",@progbits\n" \ + ".popsection\n" \ + ".pushsection .discard,\"aw\",@progbits\n" \ DISCARD_ENTRY(1) \ DISCARD_ENTRY(2) \ - ".previous\n" \ - ".section .altinstr_replacement, \"ax\"\n" \ + ".popsection\n" \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ - ".previous" + ".popsection" /* * This must be included *after* the definition of ALTERNATIVE due to -- cgit v1.1 From 76f30759f690db21ca567a20665ed2679ad3235b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Sep 2012 12:43:09 -0700 Subject: x86, alternative: Add header guards to Add header guards to protect against multiple inclusion. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1348256595-29119-6-git-send-email-hpa@linux.intel.com --- arch/x86/include/asm/alternative-asm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 018d29f..372231c 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -1,3 +1,6 @@ +#ifndef _ASM_X86_ALTERNATIVE_ASM_H +#define _ASM_X86_ALTERNATIVE_ASM_H + #ifdef __ASSEMBLY__ #include @@ -24,3 +27,5 @@ .endm #endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_ALTERNATIVE_ASM_H */ -- cgit v1.1 From 51ae4a2d775e1ee456282d7c60e49693d0a8555d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Sep 2012 12:43:10 -0700 Subject: x86, smap: Add a header file with macros for STAC/CLAC The STAC/CLAC instructions are only available with SMAP, but on the other hand they aren't needed if SMAP is not available, or before we start to run userspace, so construct them as alternatives which start out as noops and are enabled by the alternatives mechanism. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1348256595-29119-7-git-send-email-hpa@linux.intel.com --- arch/x86/Kconfig | 11 ++++++ arch/x86/include/asm/smap.h | 91 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 arch/x86/include/asm/smap.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ec3a1a..5ce8694 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1487,6 +1487,17 @@ config ARCH_RANDOM If supported, this is a high bandwidth, cryptographically secure hardware random number generator. +config X86_SMAP + def_bool y + prompt "Supervisor Mode Access Prevention" if EXPERT + ---help--- + Supervisor Mode Access Prevention (SMAP) is a security + feature in newer Intel processors. There is a small + performance cost if this enabled and turned on; there is + also a small increase in the kernel size if this is enabled. + + If unsure, say Y. + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h new file mode 100644 index 0000000..3989c24 --- /dev/null +++ b/arch/x86/include/asm/smap.h @@ -0,0 +1,91 @@ +/* + * Supervisor Mode Access Prevention support + * + * Copyright (C) 2012 Intel Corporation + * Author: H. Peter Anvin + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#ifndef _ASM_X86_SMAP_H +#define _ASM_X86_SMAP_H + +#include +#include +#include + +/* "Raw" instruction opcodes */ +#define __ASM_CLAC .byte 0x0f,0x01,0xca +#define __ASM_STAC .byte 0x0f,0x01,0xcb + +#ifdef __ASSEMBLY__ + +#include + +#ifdef CONFIG_X86_SMAP + +#define ASM_CLAC \ + 661: ASM_NOP3 ; \ + .pushsection .altinstr_replacement, "ax" ; \ + 662: __ASM_CLAC ; \ + .popsection ; \ + .pushsection .altinstructions, "a" ; \ + altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ + .popsection + +#define ASM_STAC \ + 661: ASM_NOP3 ; \ + .pushsection .altinstr_replacement, "ax" ; \ + 662: __ASM_STAC ; \ + .popsection ; \ + .pushsection .altinstructions, "a" ; \ + altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ + .popsection + +#else /* CONFIG_X86_SMAP */ + +#define ASM_CLAC +#define ASM_STAC + +#endif /* CONFIG_X86_SMAP */ + +#else /* __ASSEMBLY__ */ + +#include + +#ifdef CONFIG_X86_SMAP + +static inline void clac(void) +{ + /* Note: a barrier is implicit in alternative() */ + alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); +} + +static inline void stac(void) +{ + /* Note: a barrier is implicit in alternative() */ + alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); +} + +/* These macros can be used in asm() statements */ +#define ASM_CLAC \ + ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) +#define ASM_STAC \ + ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) + +#else /* CONFIG_X86_SMAP */ + +static inline void clac(void) { } +static inline void stac(void) { } + +#define ASM_CLAC +#define ASM_STAC + +#endif /* CONFIG_X86_SMAP */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_SMAP_H */ -- cgit v1.1 From a052858fabb376b695f2c125633daa6728e0f284 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Sep 2012 12:43:11 -0700 Subject: x86, uaccess: Merge prototypes for clear_user/__clear_user The prototypes for clear_user() and __clear_user() are identical in the 32- and 64-bit headers. No functionality change. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1348256595-29119-8-git-send-email-hpa@linux.intel.com --- arch/x86/include/asm/uaccess.h | 3 +++ arch/x86/include/asm/uaccess_32.h | 3 --- arch/x86/include/asm/uaccess_64.h | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index e1f3a17..2c7df3d 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -569,6 +569,9 @@ strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); +unsigned long __must_check clear_user(void __user *mem, unsigned long len); +unsigned long __must_check __clear_user(void __user *mem, unsigned long len); + /* * movsl can be slow when source and dest are not both 8-byte aligned */ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 576e39b..7f760a9 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -213,7 +213,4 @@ static inline unsigned long __must_check copy_from_user(void *to, return n; } -unsigned long __must_check clear_user(void __user *mem, unsigned long len); -unsigned long __must_check __clear_user(void __user *mem, unsigned long len); - #endif /* _ASM_X86_UACCESS_32_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index d8def8b..142810c 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -217,9 +217,6 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) } } -__must_check unsigned long clear_user(void __user *mem, unsigned long len); -__must_check unsigned long __clear_user(void __user *mem, unsigned long len); - static __must_check __always_inline int __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) { -- cgit v1.1 From 63bcff2a307b9bcc712a8251eb27df8b2e117967 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Sep 2012 12:43:12 -0700 Subject: x86, smap: Add STAC and CLAC instructions to control user space access When Supervisor Mode Access Prevention (SMAP) is enabled, access to userspace from the kernel is controlled by the AC flag. To make the performance of manipulating that flag acceptable, there are two new instructions, STAC and CLAC, to set and clear it. This patch adds those instructions, via alternative(), when the SMAP feature is enabled. It also adds X86_EFLAGS_AC unconditionally to the SYSCALL entry mask; there is simply no reason to make that one conditional. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1348256595-29119-9-git-send-email-hpa@linux.intel.com --- arch/x86/ia32/ia32entry.S | 6 ++++++ arch/x86/include/asm/fpu-internal.h | 10 ++++++---- arch/x86/include/asm/futex.h | 19 +++++++++++++------ arch/x86/include/asm/smap.h | 4 ++-- arch/x86/include/asm/uaccess.h | 31 +++++++++++++++++++------------ arch/x86/include/asm/xsave.h | 10 ++++++---- arch/x86/kernel/cpu/common.c | 3 ++- arch/x86/kernel/entry_64.S | 11 ++++++++++- arch/x86/lib/copy_user_64.S | 7 +++++++ arch/x86/lib/copy_user_nocache_64.S | 3 +++ arch/x86/lib/getuser.S | 10 ++++++++++ arch/x86/lib/putuser.S | 8 +++++++- arch/x86/lib/usercopy_32.c | 13 ++++++++++++- arch/x86/lib/usercopy_64.c | 3 +++ 14 files changed, 106 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 20e5f7b..9c28950 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -146,8 +147,10 @@ ENTRY(ia32_sysenter_target) SAVE_ARGS 0,1,0 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ + ASM_STAC 1: movl (%rbp),%ebp _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE @@ -301,8 +304,10 @@ ENTRY(ia32_cstar_target) /* no need to do an access_ok check here because r8 has been 32bit zero extended */ /* hardware stack frame is complete now */ + ASM_STAC 1: movl (%r8),%r9d _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE @@ -365,6 +370,7 @@ cstar_tracesys: END(ia32_cstar_target) ia32_badarg: + ASM_CLAC movq $-EFAULT,%rax jmp ia32_sysret CFI_ENDPROC diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 75f4c6d..0fe1358 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -126,8 +126,9 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) /* See comment in fxsave() below. 
*/ #ifdef CONFIG_AS_FXSAVEQ - asm volatile("1: fxsaveq %[fx]\n\t" - "2:\n" + asm volatile(ASM_STAC "\n" + "1: fxsaveq %[fx]\n\t" + "2: " ASM_CLAC "\n" ".section .fixup,\"ax\"\n" "3: movl $-1,%[err]\n" " jmp 2b\n" @@ -136,8 +137,9 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) : [err] "=r" (err), [fx] "=m" (*fx) : "0" (0)); #else - asm volatile("1: rex64/fxsave (%[fx])\n\t" - "2:\n" + asm volatile(ASM_STAC "\n" + "1: rex64/fxsave (%[fx])\n\t" + "2: " ASM_CLAC "\n" ".section .fixup,\"ax\"\n" "3: movl $-1,%[err]\n" " jmp 2b\n" diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 71ecbcb..f373046 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -9,10 +9,13 @@ #include #include #include +#include #define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ - asm volatile("1:\t" insn "\n" \ - "2:\t.section .fixup,\"ax\"\n" \ + asm volatile("\t" ASM_STAC "\n" \ + "1:\t" insn "\n" \ + "2:\t" ASM_CLAC "\n" \ + "\t.section .fixup,\"ax\"\n" \ "3:\tmov\t%3, %1\n" \ "\tjmp\t2b\n" \ "\t.previous\n" \ @@ -21,12 +24,14 @@ : "i" (-EFAULT), "0" (oparg), "1" (0)) #define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \ - asm volatile("1:\tmovl %2, %0\n" \ + asm volatile("\t" ASM_STAC "\n" \ + "1:\tmovl %2, %0\n" \ "\tmovl\t%0, %3\n" \ "\t" insn "\n" \ "2:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \ "\tjnz\t1b\n" \ - "3:\t.section .fixup,\"ax\"\n" \ + "3:\t" ASM_CLAC "\n" \ + "\t.section .fixup,\"ax\"\n" \ "4:\tmov\t%5, %1\n" \ "\tjmp\t3b\n" \ "\t.previous\n" \ @@ -122,8 +127,10 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" - "2:\t.section .fixup, \"ax\"\n" + asm volatile("\t" ASM_STAC "\n" + "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" + "2:\t" ASM_CLAC "\n" + "\t.section .fixup, \"ax\"\n" "3:\tmov %3, %0\n" "\tjmp 2b\n" "\t.previous\n" diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 3989c24..8d3120f 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -58,13 +58,13 @@ #ifdef CONFIG_X86_SMAP -static inline void clac(void) +static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); } -static inline void stac(void) +static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 2c7df3d..b92ece1 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -9,6 +9,7 @@ #include #include #include +#include #define VERIFY_READ 0 #define VERIFY_WRITE 1 @@ -192,9 +193,10 @@ extern int __get_user_bad(void); #ifdef CONFIG_X86_32 #define __put_user_asm_u64(x, addr, err, errret) \ - asm volatile("1: movl %%eax,0(%2)\n" \ + asm volatile(ASM_STAC "\n" \ + "1: movl %%eax,0(%2)\n" \ "2: movl %%edx,4(%2)\n" \ - "3:\n" \ + "3: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ "4: movl %3,%0\n" \ " jmp 3b\n" \ @@ -205,9 +207,10 @@ extern int __get_user_bad(void); : "A" (x), "r" (addr), "i" (errret), "0" (err)) #define __put_user_asm_ex_u64(x, addr) \ - asm volatile("1: movl %%eax,0(%1)\n" \ + asm volatile(ASM_STAC "\n" \ + "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ - "3:\n" \ + "3: " ASM_CLAC "\n" \ _ASM_EXTABLE_EX(1b, 2b) \ _ASM_EXTABLE_EX(2b, 3b) 
\ : : "A" (x), "r" (addr)) @@ -379,8 +382,9 @@ do { \ } while (0) #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile("1: mov"itype" %2,%"rtype"1\n" \ - "2:\n" \ + asm volatile(ASM_STAC "\n" \ + "1: mov"itype" %2,%"rtype"1\n" \ + "2: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " xor"itype" %"rtype"1,%"rtype"1\n" \ @@ -412,8 +416,9 @@ do { \ } while (0) #define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ - asm volatile("1: mov"itype" %1,%"rtype"0\n" \ - "2:\n" \ + asm volatile(ASM_STAC "\n" \ + "1: mov"itype" %1,%"rtype"0\n" \ + "2: " ASM_CLAC "\n" \ _ASM_EXTABLE_EX(1b, 2b) \ : ltype(x) : "m" (__m(addr))) @@ -443,8 +448,9 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile("1: mov"itype" %"rtype"1,%2\n" \ - "2:\n" \ + asm volatile(ASM_STAC "\n" \ + "1: mov"itype" %"rtype"1,%2\n" \ + "2: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " jmp 2b\n" \ @@ -454,8 +460,9 @@ struct __large_struct { unsigned long buf[100]; }; : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err)) #define __put_user_asm_ex(x, addr, itype, rtype, ltype) \ - asm volatile("1: mov"itype" %"rtype"0,%1\n" \ - "2:\n" \ + asm volatile(ASM_STAC "\n" \ + "1: mov"itype" %"rtype"0,%1\n" \ + "2: " ASM_CLAC "\n" \ _ASM_EXTABLE_EX(1b, 2b) \ : : ltype(x), "m" (__m(addr))) diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 8a1b6f9..2a923bd 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -74,8 +74,9 @@ static inline int xsave_user(struct xsave_struct __user *buf) if (unlikely(err)) return -EFAULT; - __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" - "2:\n" + __asm__ __volatile__(ASM_STAC "\n" + "1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" + "2: " ASM_CLAC "\n" ".section .fixup,\"ax\"\n" "3: movl $-1,%[err]\n" " jmp 2b\n" @@ -97,8 +98,9 @@ static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) u32 lmask = mask; u32 hmask = mask >> 32; - __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n" - "2:\n" + __asm__ __volatile__(ASM_STAC "\n" + "1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n" + "2: " ASM_CLAC "\n" ".section .fixup,\"ax\"\n" "3: movl $-1,%[err]\n" " jmp 2b\n" diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a5fbc3c..cd43e52 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1113,7 +1113,8 @@ void syscall_init(void) /* Flags to clear on syscall */ wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| + X86_EFLAGS_IOPL|X86_EFLAGS_AC); } unsigned long kernel_eflags; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 69babd8..ce87e3d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,6 +56,7 @@ #include #include #include +#include #include /* Avoid __ASSEMBLER__'ifying just for this. */ @@ -465,7 +466,8 @@ END(ret_from_fork) * System call entry. Up to 6 arguments in registers are supported. * * SYSCALL does not save anything on the stack and does not change the - * stack pointer. + * stack pointer. However, it does mask the flags register for us, so + * CLD and CLAC are not needed. 
*/ /* @@ -884,6 +886,7 @@ END(interrupt) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: + ASM_CLAC XCPT_FRAME addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ @@ -1023,6 +1026,7 @@ END(common_interrupt) */ .macro apicinterrupt num sym do_sym ENTRY(\sym) + ASM_CLAC INTR_FRAME pushq_cfi $~(\num) .Lcommon_\sym: @@ -1077,6 +1081,7 @@ apicinterrupt IRQ_WORK_VECTOR \ */ .macro zeroentry sym do_sym ENTRY(\sym) + ASM_CLAC INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ @@ -1094,6 +1099,7 @@ END(\sym) .macro paranoidzeroentry sym do_sym ENTRY(\sym) + ASM_CLAC INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ @@ -1112,6 +1118,7 @@ END(\sym) #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) + ASM_CLAC INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ @@ -1131,6 +1138,7 @@ END(\sym) .macro errorentry sym do_sym ENTRY(\sym) + ASM_CLAC XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp @@ -1149,6 +1157,7 @@ END(\sym) /* error code is on the stack already */ .macro paranoiderrorentry sym do_sym ENTRY(\sym) + ASM_CLAC XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 5b2995f..a30ca15 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -17,6 +17,7 @@ #include #include #include +#include /* * By placing feature2 after feature1 in altinstructions section, we logically @@ -130,6 +131,7 @@ ENDPROC(bad_from_user) */ ENTRY(copy_user_generic_unrolled) CFI_STARTPROC + ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ ALIGN_DESTINATION @@ -177,6 +179,7 @@ ENTRY(copy_user_generic_unrolled) decl %ecx jnz 21b 23: xor %eax,%eax + ASM_CLAC ret .section .fixup,"ax" @@ -232,6 +235,7 @@ ENDPROC(copy_user_generic_unrolled) */ ENTRY(copy_user_generic_string) CFI_STARTPROC + ASM_STAC andl %edx,%edx jz 4f cmpl $8,%edx @@ -246,6 +250,7 @@ ENTRY(copy_user_generic_string) 3: rep movsb 4: xorl %eax,%eax + ASM_CLAC ret .section .fixup,"ax" @@ -273,12 +278,14 @@ ENDPROC(copy_user_generic_string) */ ENTRY(copy_user_enhanced_fast_string) CFI_STARTPROC + ASM_STAC andl %edx,%edx jz 2f movl %edx,%ecx 1: rep movsb 2: xorl %eax,%eax + ASM_CLAC ret .section .fixup,"ax" diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index cacddc7..6a4f43c 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -15,6 +15,7 @@ #include #include #include +#include .macro ALIGN_DESTINATION #ifdef FIX_ALIGNMENT @@ -48,6 +49,7 @@ */ ENTRY(__copy_user_nocache) CFI_STARTPROC + ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ ALIGN_DESTINATION @@ -95,6 +97,7 @@ ENTRY(__copy_user_nocache) decl %ecx jnz 21b 23: xorl %eax,%eax + ASM_CLAC sfence ret diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index b33b1fb..156b9c8 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -33,6 +33,7 @@ #include #include #include +#include .text ENTRY(__get_user_1) @@ -40,8 +41,10 @@ ENTRY(__get_user_1) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + ASM_STAC 1: movzb (%_ASM_AX),%edx xor %eax,%eax + ASM_CLAC ret CFI_ENDPROC ENDPROC(__get_user_1) @@ -53,8 +56,10 @@ ENTRY(__get_user_2) GET_THREAD_INFO(%_ASM_DX) cmp 
TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + ASM_STAC 2: movzwl -1(%_ASM_AX),%edx xor %eax,%eax + ASM_CLAC ret CFI_ENDPROC ENDPROC(__get_user_2) @@ -66,8 +71,10 @@ ENTRY(__get_user_4) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + ASM_STAC 3: mov -3(%_ASM_AX),%edx xor %eax,%eax + ASM_CLAC ret CFI_ENDPROC ENDPROC(__get_user_4) @@ -80,8 +87,10 @@ ENTRY(__get_user_8) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + ASM_STAC 4: movq -7(%_ASM_AX),%_ASM_DX xor %eax,%eax + ASM_CLAC ret CFI_ENDPROC ENDPROC(__get_user_8) @@ -91,6 +100,7 @@ bad_get_user: CFI_STARTPROC xor %edx,%edx mov $(-EFAULT),%_ASM_AX + ASM_CLAC ret CFI_ENDPROC END(bad_get_user) diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 7f951c8..fc6ba17 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -15,6 +15,7 @@ #include #include #include +#include /* @@ -31,7 +32,8 @@ #define ENTER CFI_STARTPROC ; \ GET_THREAD_INFO(%_ASM_BX) -#define EXIT ret ; \ +#define EXIT ASM_CLAC ; \ + ret ; \ CFI_ENDPROC .text @@ -39,6 +41,7 @@ ENTRY(__put_user_1) ENTER cmp TI_addr_limit(%_ASM_BX),%_ASM_CX jae bad_put_user + ASM_STAC 1: movb %al,(%_ASM_CX) xor %eax,%eax EXIT @@ -50,6 +53,7 @@ ENTRY(__put_user_2) sub $1,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user + ASM_STAC 2: movw %ax,(%_ASM_CX) xor %eax,%eax EXIT @@ -61,6 +65,7 @@ ENTRY(__put_user_4) sub $3,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user + ASM_STAC 3: movl %eax,(%_ASM_CX) xor %eax,%eax EXIT @@ -72,6 +77,7 @@ ENTRY(__put_user_8) sub $7,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user + ASM_STAC 4: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 5: movl %edx,4(%_ASM_CX) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 1781b2f..98f6d6b6 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -42,10 +42,11 @@ do { \ int __d0; \ might_fault(); \ __asm__ __volatile__( \ + ASM_STAC "\n" \ "0: rep; stosl\n" \ " movl %2,%0\n" \ "1: rep; stosb\n" \ - "2:\n" \ + "2: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ "3: lea 0(%2,%0,4),%0\n" \ " jmp 2b\n" \ @@ -626,10 +627,12 @@ survive: return n; } #endif + stac(); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); else n = __copy_user_intel(to, from, n); + clac(); return n; } EXPORT_SYMBOL(__copy_to_user_ll); @@ -637,10 +640,12 @@ EXPORT_SYMBOL(__copy_to_user_ll); unsigned long __copy_from_user_ll(void *to, const void __user *from, unsigned long n) { + stac(); if (movsl_is_ok(to, from, n)) __copy_user_zeroing(to, from, n); else n = __copy_user_zeroing_intel(to, from, n); + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll); @@ -648,11 +653,13 @@ EXPORT_SYMBOL(__copy_from_user_ll); unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from, unsigned long n) { + stac(); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); else n = __copy_user_intel((void __user *)to, (const void *)from, n); + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll_nozero); @@ -660,6 +667,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nozero); unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, unsigned long n) { + stac(); #ifdef CONFIG_X86_INTEL_USERCOPY if (n > 64 && cpu_has_xmm2) n = __copy_user_zeroing_intel_nocache(to, from, n); @@ -668,6 +676,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, #else __copy_user_zeroing(to, from, n); #endif + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll_nocache); @@ -675,6 +684,7 @@ 
EXPORT_SYMBOL(__copy_from_user_ll_nocache); unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, unsigned long n) { + stac(); #ifdef CONFIG_X86_INTEL_USERCOPY if (n > 64 && cpu_has_xmm2) n = __copy_user_intel_nocache(to, from, n); @@ -683,6 +693,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr #else __copy_user(to, from, n); #endif + clac(); return n; } EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index e5b130b..05928aa 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -18,6 +18,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size) might_fault(); /* no memory constraint because it doesn't change any memory gcc knows about */ + stac(); asm volatile( " testq %[size8],%[size8]\n" " jz 4f\n" @@ -40,6 +41,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size) : [size8] "=&c"(size), [dst] "=&D" (__d0) : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), [zero] "r" (0UL), [eight] "r" (8UL)); + clac(); return size; } EXPORT_SYMBOL(__clear_user); @@ -82,5 +84,6 @@ copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) for (c = 0, zero_len = len; zerorest && zero_len; --zero_len) if (__put_user_nocheck(c, to++, sizeof(char))) break; + clac(); return len; } -- cgit v1.1 From 52b6179ac87d33c2eeaff5292786a10fe98cff64 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Sep 2012 12:43:13 -0700 Subject: x86, smap: Turn on Supervisor Mode Access Prevention If Supervisor Mode Access Prevention is available and not disabled by the user, turn it on. Also fix the expansion of SMEP (Supervisor Mode Execution Prevention.) Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1348256595-29119-10-git-send-email-hpa@linux.intel.com --- arch/x86/kernel/cpu/common.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cd43e52..7d35d65 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -278,6 +278,31 @@ static __cpuinit void setup_smep(struct cpuinfo_x86 *c) } } +static int disable_smap __cpuinitdata; +static __init int setup_disable_smap(char *arg) +{ + disable_smap = 1; + return 1; +} +__setup("nosmap", setup_disable_smap); + +static __cpuinit void setup_smap(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMAP)) { + if (unlikely(disable_smap)) { + setup_clear_cpu_cap(X86_FEATURE_SMAP); + clear_in_cr4(X86_CR4_SMAP); + } else { + set_in_cr4(X86_CR4_SMAP); + /* + * Don't use clac() here since alternatives + * haven't run yet... + */ + asm volatile(__stringify(__ASM_CLAC) ::: "memory"); + } + } +} + /* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization @@ -713,6 +738,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) filter_cpuid_features(c, false); setup_smep(c); + setup_smap(c); if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); -- cgit v1.1 From 40d3cd6695014bf3c44e2ca66b610b18acaf923d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Sep 2012 12:43:14 -0700 Subject: x86, smap: A page fault due to SMAP is an oops If we get a page fault due to SMAP, trigger an oops rather than spinning forever. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1348256595-29119-11-git-send-email-hpa@linux.intel.com --- arch/x86/mm/fault.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 76dcd9d..f2fb75d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -995,6 +995,17 @@ static int fault_in_kernel_space(unsigned long address) return address >= TASK_SIZE_MAX; } +static inline bool smap_violation(int error_code, struct pt_regs *regs) +{ + if (error_code & PF_USER) + return false; + + if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) + return false; + + return true; +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -1088,6 +1099,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); + if (static_cpu_has(X86_FEATURE_SMAP)) { + if (unlikely(smap_violation(error_code, regs))) { + bad_area_nosemaphore(regs, error_code, address); + return; + } + } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* -- cgit v1.1 From 5e88353d8b5f483bc1c873ad24ac2b59a6b66c73 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Sep 2012 12:43:15 -0700 Subject: x86, smap: Reduce the SMAP overhead for signal handling Signal handling contains a bunch of accesses to individual user space items, which causes an excessive number of STAC and CLAC instructions. Instead, let get/put_user_try ... get/put_user_catch() contain the STAC and CLAC instructions. This means that get/put_user_try no longer nests, and furthermore that it is no longer legal to use user space access functions other than __get/put_user_ex() inside those blocks. However, these macros are x86-specific anyway and are only used in the signal-handling paths; a simple reordering of moving the larger subroutine calls out of the try...catch blocks resolves that problem. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1348256595-29119-12-git-send-email-hpa@linux.intel.com --- arch/x86/ia32/ia32_signal.c | 12 +++++++----- arch/x86/include/asm/uaccess.h | 14 ++++++-------- arch/x86/kernel/signal.c | 24 ++++++++++++++---------- 3 files changed, 27 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 673ac9b..05e62a3 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -250,11 +250,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, get_user_ex(tmp, &sc->fpstate); buf = compat_ptr(tmp); - err |= restore_i387_xstate_ia32(buf); get_user_ex(*pax, &sc->ax); } get_user_catch(err); + err |= restore_i387_xstate_ia32(buf); + return err; } @@ -502,7 +503,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(sig, &frame->sig); put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo); put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); - err |= copy_siginfo_to_user32(&frame->info, info); /* Create the ucontext. 
*/ if (cpu_has_xsave) @@ -514,9 +514,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -532,6 +529,11 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(*((u64 *)&code), (u64 *)frame->retcode); } put_user_catch(err); + err |= copy_siginfo_to_user32(&frame->info, info); + err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) return -EFAULT; diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index b92ece1..a91acfb 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -416,9 +416,8 @@ do { \ } while (0) #define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ - asm volatile(ASM_STAC "\n" \ - "1: mov"itype" %1,%"rtype"0\n" \ - "2: " ASM_CLAC "\n" \ + asm volatile("1: mov"itype" %1,%"rtype"0\n" \ + "2:\n" \ _ASM_EXTABLE_EX(1b, 2b) \ : ltype(x) : "m" (__m(addr))) @@ -460,9 +459,8 @@ struct __large_struct { unsigned long buf[100]; }; : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err)) #define __put_user_asm_ex(x, addr, itype, rtype, ltype) \ - asm volatile(ASM_STAC "\n" \ - "1: mov"itype" %"rtype"0,%1\n" \ - "2: " ASM_CLAC "\n" \ + asm volatile("1: mov"itype" %"rtype"0,%1\n" \ + "2:\n" \ _ASM_EXTABLE_EX(1b, 2b) \ : : ltype(x), "m" (__m(addr))) @@ -470,13 +468,13 @@ struct __large_struct { unsigned long buf[100]; }; * uaccess_try and catch */ #define uaccess_try do { \ - int prev_err = current_thread_info()->uaccess_err; \ current_thread_info()->uaccess_err = 0; \ + stac(); \ barrier(); #define uaccess_catch(err) \ + clac(); \ (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ - current_thread_info()->uaccess_err = prev_err; \ } while (0) /** diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b280908..9326128 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -114,11 +114,12 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, regs->orig_ax = -1; /* disable syscall checks */ get_user_ex(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); get_user_ex(*pax, &sc->ax); } get_user_catch(err); + err |= restore_i387_xstate(buf); + return err; } @@ -357,7 +358,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(sig, &frame->sig); put_user_ex(&frame->info, &frame->pinfo); put_user_ex(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, info); /* Create the ucontext. */ if (cpu_has_xsave) @@ -369,9 +369,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); /* Set up to return from userspace. 
*/ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -389,6 +386,11 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); } put_user_catch(err); + err |= copy_siginfo_to_user(&frame->info, info); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) return -EFAULT; @@ -436,8 +438,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); /* Set up to return from userspace. If provided, use a stub already in userspace. */ @@ -450,6 +450,9 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, } } put_user_catch(err); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) return -EFAULT; @@ -855,9 +858,6 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, &frame->uc.uc_stack.ss_flags); put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); put_user_ex(0, &frame->uc.uc__pad0); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (ka->sa.sa_flags & SA_RESTORER) { restorer = ka->sa.sa_restorer; @@ -869,6 +869,10 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, put_user_ex(restorer, &frame->pretcode); } put_user_catch(err); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) return -EFAULT; -- cgit v1.1 From 4cd8daf05c7071ac80008c8d4368860110fa6466 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 19 Sep 2012 10:49:00 -0700 Subject: x86/PCI: Clear host bridge aperture struct resource Use kzalloc() so the struct resource doesn't contain garbage in fields we don't initialize. [bhelgaas: changelog] Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Cc: x86@kernel.org --- arch/x86/pci/acpi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 505acdd..192397c 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -305,7 +305,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->flags = flags; res->start = start; res->end = end; - res->child = NULL; if (!pci_use_crs) { dev_printk(KERN_DEBUG, &info->bridge->dev, @@ -434,7 +433,7 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, size = sizeof(*info->res) * info->res_num; info->res_num = 0; - info->res = kmalloc(size, GFP_KERNEL); + info->res = kzalloc(size, GFP_KERNEL); if (!info->res) return; -- cgit v1.1 From e59d1b0a24199db01978e6c1e89859eda93ce683 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Sep 2012 13:58:10 -0700 Subject: x86-32, smap: Add STAC/CLAC instructions to 32-bit kernel entry The changes to entry_32.S got missed in checkin: 63bcff2a x86, smap: Add STAC and CLAC instructions to control user space access The resulting kernel was largely functional but SMAP protection could have been bypassed. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1348256595-29119-9-git-send-email-hpa@linux.intel.com --- arch/x86/kernel/entry_32.S | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 623f288..9ebbeca 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -57,6 +57,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -407,7 +408,9 @@ sysenter_past_esp: */ cmpl $__PAGE_OFFSET-3,%ebp jae syscall_fault + ASM_STAC 1: movl (%ebp),%ebp + ASM_CLAC movl %ebp,PT_EBP(%esp) _ASM_EXTABLE(1b,syscall_fault) @@ -488,6 +491,7 @@ ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway + ASM_CLAC pushl_cfi %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) @@ -670,6 +674,7 @@ END(syscall_exit_work) RING0_INT_FRAME # can't unwind into user space anyway syscall_fault: + ASM_CLAC GET_THREAD_INFO(%ebp) movl $-EFAULT,PT_EAX(%esp) jmp resume_userspace @@ -825,6 +830,7 @@ END(interrupt) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: + ASM_CLAC addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ SAVE_ALL TRACE_IRQS_OFF @@ -841,6 +847,7 @@ ENDPROC(common_interrupt) #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ + ASM_CLAC; \ pushl_cfi $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ @@ -857,6 +864,7 @@ ENDPROC(name) ENTRY(coprocessor_error) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_coprocessor_error jmp error_code @@ -865,6 +873,7 @@ END(coprocessor_error) ENTRY(simd_coprocessor_error) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ @@ -886,6 +895,7 @@ END(simd_coprocessor_error) ENTRY(device_not_available) RING0_INT_FRAME + ASM_CLAC pushl_cfi $-1 # mark this as an int pushl_cfi $do_device_not_available jmp error_code @@ -906,6 +916,7 @@ END(native_irq_enable_sysexit) ENTRY(overflow) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_overflow jmp error_code @@ -914,6 +925,7 @@ END(overflow) ENTRY(bounds) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_bounds jmp error_code @@ -922,6 +934,7 @@ END(bounds) ENTRY(invalid_op) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_invalid_op jmp error_code @@ -930,6 +943,7 @@ END(invalid_op) ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_coprocessor_segment_overrun jmp error_code @@ -938,6 +952,7 @@ END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_invalid_TSS jmp error_code CFI_ENDPROC @@ -945,6 +960,7 @@ END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_segment_not_present jmp error_code CFI_ENDPROC @@ -952,6 +968,7 @@ END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_stack_segment jmp error_code CFI_ENDPROC @@ -959,6 +976,7 @@ END(stack_segment) ENTRY(alignment_check) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_alignment_check jmp error_code CFI_ENDPROC @@ -966,6 +984,7 @@ END(alignment_check) ENTRY(divide_error) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 # no error code pushl_cfi $do_divide_error jmp error_code @@ -975,6 +994,7 @@ END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi machine_check_vector jmp error_code @@ -984,6 +1004,7 @@ END(machine_check) 
ENTRY(spurious_interrupt_bug) RING0_INT_FRAME + ASM_CLAC pushl_cfi $0 pushl_cfi $do_spurious_interrupt_bug jmp error_code @@ -1207,6 +1228,7 @@ return_to_handler: ENTRY(page_fault) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_page_fault ALIGN error_code: @@ -1279,6 +1301,7 @@ END(page_fault) ENTRY(debug) RING0_INT_FRAME + ASM_CLAC cmpl $ia32_sysenter_target,(%esp) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn @@ -1303,6 +1326,7 @@ END(debug) */ ENTRY(nmi) RING0_INT_FRAME + ASM_CLAC pushl_cfi %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax @@ -1373,6 +1397,7 @@ END(nmi) ENTRY(int3) RING0_INT_FRAME + ASM_CLAC pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF @@ -1393,6 +1418,7 @@ END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) RING0_EC_FRAME + ASM_CLAC pushl_cfi $do_async_page_fault jmp error_code CFI_ENDPROC -- cgit v1.1 From b1a74bf8212367be2b1d6685c11a84e056eaaaf1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 20 Sep 2012 11:01:49 -0700 Subject: x86, kvm: fix kvm's usage of kernel_fpu_begin/end() Preemption is disabled between kernel_fpu_begin/end() and as such it is not a good idea to use these routines in kvm_load/put_guest_fpu() which can be very far apart. kvm_load/put_guest_fpu() routines are already called with preemption disabled and KVM already uses the preempt notifier to save the guest fpu state using kvm_put_guest_fpu(). So introduce __kernel_fpu_begin/end() routines which don't touch preemption and use them instead of kernel_fpu_begin/end() for KVM's use model of saving/restoring guest FPU state. Also with this change (and with eagerFPU model), fix the host cr0.TS vm-exit state in the case of VMX. For eagerFPU case, host cr0.TS is always clear. So no need to worry about it. For the traditional lazyFPU restore case, change the cr0.TS bit for the host state during vm-exit to be always clear and cr0.TS bit is set in the __vmx_load_host_state() when the FPU (guest FPU or the host task's FPU) state is not active. This ensures that the host/guest FPU state is properly saved, restored during context-switch and with interrupts (using irq_fpu_usable()) not stomping on the active FPU state. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1348164109.26695.338.camel@sbsiddha-desk.sc.intel.com Cc: Avi Kivity Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 28 ++++++++++++++++++++++++++-- arch/x86/kernel/i387.c | 13 +++++-------- arch/x86/kvm/vmx.c | 10 +++++++--- arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 40 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 6c3bd37..ed8089d6 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -24,8 +24,32 @@ extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern void math_state_restore(void); extern bool irq_fpu_usable(void); -extern void kernel_fpu_begin(void); -extern void kernel_fpu_end(void); + +/* + * Careful: __kernel_fpu_begin/end() must be called with preempt disabled + * and they don't touch the preempt state on their own. + * If you enable preemption after __kernel_fpu_begin(), preempt notifier + * should call the __kernel_fpu_end() to prevent the kernel/user FPU + * state from getting corrupted. KVM for example uses this model. + * + * All other cases use kernel_fpu_begin/end() which disable preemption + * during kernel FPU usage. 
+ */ +extern void __kernel_fpu_begin(void); +extern void __kernel_fpu_end(void); + +static inline void kernel_fpu_begin(void) +{ + WARN_ON_ONCE(!irq_fpu_usable()); + preempt_disable(); + __kernel_fpu_begin(); +} + +static inline void kernel_fpu_end(void) +{ + __kernel_fpu_end(); + preempt_enable(); +} /* * Some instructions like VIA's padlock instructions generate a spurious diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 6782e39..675a050 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -73,32 +73,29 @@ bool irq_fpu_usable(void) } EXPORT_SYMBOL(irq_fpu_usable); -void kernel_fpu_begin(void) +void __kernel_fpu_begin(void) { struct task_struct *me = current; - WARN_ON_ONCE(!irq_fpu_usable()); - preempt_disable(); if (__thread_has_fpu(me)) { __save_init_fpu(me); __thread_clear_has_fpu(me); - /* We do 'stts()' in kernel_fpu_end() */ + /* We do 'stts()' in __kernel_fpu_end() */ } else if (!use_eager_fpu()) { this_cpu_write(fpu_owner_task, NULL); clts(); } } -EXPORT_SYMBOL(kernel_fpu_begin); +EXPORT_SYMBOL(__kernel_fpu_begin); -void kernel_fpu_end(void) +void __kernel_fpu_end(void) { if (use_eager_fpu()) math_state_restore(); else stts(); - preempt_enable(); } -EXPORT_SYMBOL(kernel_fpu_end); +EXPORT_SYMBOL(__kernel_fpu_end); void unlazy_fpu(struct task_struct *tsk) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c00f03d..70dfcec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1493,8 +1493,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (user_has_fpu()) - clts(); + /* + * If the FPU is not active (through the host task or + * the guest vcpu), then restore the cr0.TS bit. + */ + if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded) + stts(); load_gdt(&__get_cpu_var(host_gdt)); } @@ -3730,7 +3734,7 @@ static void vmx_set_constant_host_state(void) unsigned long tmpl; struct desc_ptr dt; - vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ + vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cf637f5..02b2cd5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5972,7 +5972,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) */ kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; - kernel_fpu_begin(); + __kernel_fpu_begin(); fpu_restore_checking(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } @@ -5986,7 +5986,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 0; fpu_save_init(&vcpu->arch.guest_fpu); - kernel_fpu_end(); + __kernel_fpu_end(); ++vcpu->stat.fpu_reload; kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); trace_kvm_fpu(0); -- cgit v1.1 From 7a84428af7ca6a847f058c9ff244a18a2664fd1b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Sep 2012 11:58:03 -0600 Subject: KVM: Add resampling irqfds for level triggered interrupts To emulate level triggered interrupts, add a resample option to KVM_IRQFD. When specified, a new resamplefd is provided that notifies the user when the irqchip has been resampled by the VM. This may, for instance, indicate an EOI. Also in this mode, posting of an interrupt through an irqfd only asserts the interrupt. On resampling, the interrupt is automatically de-asserted prior to user notification. 
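For illustration, here is a minimal userspace sketch of registering such an irqfd pair; it assumes the struct kvm_irqfd layout and the KVM_IRQFD_FLAG_RESAMPLE flag from the generic KVM uapi header that accompanies this series (the arch/x86 hunk below only reserves the extra irq source ID and advertises the capability):

#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Register an assert eventfd plus a resample eventfd for one GSI. */
static int register_resample_irqfd(int vm_fd, uint32_t gsi,
				   int *irq_fd, int *resample_fd)
{
	struct kvm_irqfd irqfd = { 0 };

	*irq_fd = eventfd(0, 0);	/* written by the device model to assert */
	*resample_fd = eventfd(0, 0);	/* becomes readable after the guest EOI  */
	if (*irq_fd < 0 || *resample_fd < 0)
		return -1;

	irqfd.fd = *irq_fd;
	irqfd.gsi = gsi;
	irqfd.flags = KVM_IRQFD_FLAG_RESAMPLE;
	irqfd.resamplefd = *resample_fd;

	return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}

The device model then writes to irq_fd to assert the line, waits for resample_fd to become readable (KVM has already de-asserted the interrupt by that point), re-checks the device, and asserts again only if the interrupt is still pending.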
This enables level triggered interrupts to be posted and re-enabled from vfio with no userspace intervention. All resampling irqfds can make use of a single irq source ID, so we reserve a new one for this interface. Signed-off-by: Alex Williamson Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fc2a0a1..7d44204 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2176,6 +2176,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -6268,6 +6269,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ + set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, + &kvm->arch.irq_sources_bitmap); raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); -- cgit v1.1 From c863901075a42d50678616d8ee4b96ef13080498 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 21 Sep 2012 05:42:55 +0200 Subject: KVM: x86: Fix guest debug across vcpu INIT reset If we reset a vcpu on INIT, we so far overwrote dr7 as provided by KVM_SET_GUEST_DEBUG, and we also cleared switch_db_regs unconditionally. Fix this by saving the dr7 used for guest debugging and calculating the effective register value as well as switch_db_regs on any potential change. This will change to focus of the set_guest_debug vendor op to update_dp_bp_intercept. Found while trying to stop on start_secondary. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/svm.c | 23 ++++------------------- arch/x86/kvm/vmx.c | 14 +------------- arch/x86/kvm/x86.c | 26 +++++++++++++++++--------- 4 files changed, 24 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0b902c9..c9a9136 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -471,6 +471,7 @@ struct kvm_vcpu_arch { unsigned long dr6; unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; + unsigned long guest_debug_dr7; u64 mcg_cap; u64 mcg_status; @@ -647,8 +648,7 @@ struct kvm_x86_ops { void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*set_guest_debug)(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg); + void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 818fceb..d017df3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1146,7 +1146,6 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; - save->dr7 = 0x400; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; @@ -1643,7 +1642,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_db_intercept(struct kvm_vcpu *vcpu) +static void update_db_bp_intercept(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ 
-1663,20 +1662,6 @@ static void update_db_intercept(struct kvm_vcpu *vcpu) vcpu->guest_debug = 0; } -static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; - else - svm->vmcb->save.dr7 = vcpu->arch.dr7; - - mark_dirty(svm->vmcb, VMCB_DR); - - update_db_intercept(vcpu); -} - static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) { if (sd->next_asid > sd->max_asid) { @@ -1748,7 +1733,7 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_intercept(&svm->vcpu); + update_db_bp_intercept(&svm->vcpu); } if (svm->vcpu.guest_debug & @@ -3659,7 +3644,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) */ svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_intercept(vcpu); + update_db_bp_intercept(vcpu); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -4253,7 +4238,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .set_guest_debug = svm_guest_debug, + .update_db_bp_intercept = update_db_bp_intercept, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 30bcb95..5d46c90 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2288,16 +2288,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); - else - vmcs_writel(GUEST_DR7, vcpu->arch.dr7); - - update_exception_bitmap(vcpu); -} - static __init int cpu_has_kvm_support(void) { return cpu_has_vmx(); @@ -3960,8 +3950,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, 0); kvm_register_write(vcpu, VCPU_REGS_RSP, 0); - vmcs_writel(GUEST_DR7, 0x400); - vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -7237,7 +7225,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .set_guest_debug = set_guest_debug, + .update_db_bp_intercept = update_exception_bitmap, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7d44204..b16d4a5b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -692,6 +692,18 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +static void kvm_update_dr7(struct kvm_vcpu *vcpu) +{ + unsigned long dr7; + + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + dr7 = vcpu->arch.guest_debug_dr7; + else + dr7 = vcpu->arch.dr7; + kvm_x86_ops->set_dr7(vcpu, dr7); + vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); +} + static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { @@ -717,10 +729,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) if (val & 0xffffffff00000000ULL) return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); - vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK); - } + kvm_update_dr7(vcpu); break; } @@ 
-5851,13 +5860,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { for (i = 0; i < KVM_NR_DB_REGS; ++i) vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; - vcpu->arch.switch_db_regs = - (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); + vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7]; } else { for (i = 0; i < KVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); } + kvm_update_dr7(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + @@ -5869,7 +5877,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops->set_guest_debug(vcpu, dbg); + kvm_x86_ops->update_db_bp_intercept(vcpu); r = 0; @@ -6045,10 +6053,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; - vcpu->arch.switch_db_regs = 0; memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = DR6_FIXED_1; vcpu->arch.dr7 = DR7_FIXED_1; + kvm_update_dr7(vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; -- cgit v1.1 From a2db672aa305a045404615e5222ba681bab6cf58 Mon Sep 17 00:00:00 2001 From: Silas Boyd-Wickizer Date: Fri, 3 Aug 2012 12:33:27 -0700 Subject: Use get_online_cpus to avoid races involving CPU hotplug If arch/x86/kernel/msr.c is a module, a CPU might offline or online between the for_each_online_cpu(i) loop and the call to register_hotcpu_notifier in msr_init or the call to unregister_hotcpu_notifier in msr_exit. The potential races can lead to leaks/duplicates, attempts to destroy non-existant devices, or random pointer dereferences. For example, in msr_init if: for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) goto out_class; } <----- CPU offlines register_hotcpu_notifier(&msr_class_cpu_notifier); and the CPU never onlines before msr_exit, then the module will never call msr_device_destroy for the associated CPU. This fix surrounds for_each_online_cpu and register_hotcpu_notifier or unregister_hotcpu_notifier with get_online_cpus+put_online_cpus. Tested on a VM. Signed-off-by: Silas Boyd-Wickizer Signed-off-by: Paul E. 
McKenney --- arch/x86/kernel/msr.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index eb11369..a7c5661 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -257,12 +257,14 @@ static int __init msr_init(void) goto out_chrdev; } msr_class->devnode = msr_devnode; + get_online_cpus(); for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) goto out_class; } register_hotcpu_notifier(&msr_class_cpu_notifier); + put_online_cpus(); err = 0; goto out; @@ -271,6 +273,7 @@ out_class: i = 0; for_each_online_cpu(i) msr_device_destroy(i); + put_online_cpus(); class_destroy(msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); @@ -281,11 +284,13 @@ out: static void __exit msr_exit(void) { int cpu = 0; + get_online_cpus(); for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); + put_online_cpus(); } module_init(msr_init); -- cgit v1.1 From 429227bbe55647aa42f8f63cac61e4544e248629 Mon Sep 17 00:00:00 2001 From: Silas Boyd-Wickizer Date: Fri, 3 Aug 2012 12:34:50 -0700 Subject: Use get_online_cpus to avoid races involving CPU hotplug If arch/x86/kernel/cpuid.c is a module, a CPU might offline or online between the for_each_online_cpu() loop and the call to register_hotcpu_notifier in cpuid_init or the call to unregister_hotcpu_notifier in cpuid_exit. The potential races can lead to leaks/duplicates, attempts to destroy non-existant devices, or random pointer dereferences. For example, in cpuid_exit if: for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); <----- CPU onlines unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); the hotcpu notifier will attempt to create a device for the cpuid_class, which the module already destroyed. This fix surrounds for_each_online_cpu and register_hotcpu_notifier or unregister_hotcpu_notifier with get_online_cpus+put_online_cpus. Tested on a VM. Signed-off-by: Silas Boyd-Wickizer Signed-off-by: Paul E. McKenney --- arch/x86/kernel/cpuid.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 39472dd..60c7891 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -199,12 +199,14 @@ static int __init cpuid_init(void) goto out_chrdev; } cpuid_class->devnode = cpuid_devnode; + get_online_cpus(); for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) goto out_class; } register_hotcpu_notifier(&cpuid_class_cpu_notifier); + put_online_cpus(); err = 0; goto out; @@ -214,6 +216,7 @@ out_class: for_each_online_cpu(i) { cpuid_device_destroy(i); } + put_online_cpus(); class_destroy(cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); @@ -225,11 +228,13 @@ static void __exit cpuid_exit(void) { int cpu = 0; + get_online_cpus(); for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); + put_online_cpus(); } module_init(cpuid_init); -- cgit v1.1 From 8d54db795dfb1049d45dc34f0dddbc5347ec5642 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 17 Aug 2012 10:22:37 -0400 Subject: xen/boot: Disable NUMA for PV guests. 
The hypervisor is in charge of allocating the proper "NUMA" memory and dealing with the CPU scheduler to keep them bound to the proper NUMA node. The PV guests (and PVHVM) have no inkling of where they run and do not need to know that right now. In the future we will need to inject NUMA configuration data (if a guest spans two or more NUMA nodes) so that the kernel can make the right choices. But those patches are not yet present. In the meantime, disable the NUMA capability in the PV guest, which also fixes a bootup issue. Andre says: "we see Dom0 crashes due to the kernel detecting the NUMA topology not by ACPI, but directly from the northbridge (CONFIG_AMD_NUMA). This will detect the actual NUMA config of the physical machine, but will crash about the mismatch with Dom0's virtual memory. Variation of the theme: Dom0 sees what it's not supposed to see. This happens with the said config option enabled and on a machine where this scanning is still enabled (K8 and Fam10h, not Bulldozer class) We have this dump then: NUMA: Warning: node ids are out of bound, from=-1 to=-1 distance=10 Scanning NUMA topology in Northbridge 24 Number of physical nodes 4 Node 0 MemBase 0000000000000000 Limit 0000000040000000 Node 1 MemBase 0000000040000000 Limit 0000000138000000 Node 2 MemBase 0000000138000000 Limit 00000001f8000000 Node 3 MemBase 00000001f8000000 Limit 0000000238000000 Initmem setup node 0 0000000000000000-0000000040000000 NODE_DATA [000000003ffd9000 - 000000003fffffff] Initmem setup node 1 0000000040000000-0000000138000000 NODE_DATA [0000000137fd9000 - 0000000137ffffff] Initmem setup node 2 0000000138000000-00000001f8000000 NODE_DATA [00000001f095e000 - 00000001f0984fff] Initmem setup node 3 00000001f8000000-0000000238000000 Cannot find 159744 bytes in node 3 BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] __alloc_bootmem_node+0x43/0x96 Pid: 0, comm: swapper Not tainted 3.3.6 #1 AMD Dinar/Dinar RIP: e030:[] [] __alloc_bootmem_node+0x43/0x96 .. snip.. [] sparse_early_usemaps_alloc_node+0x64/0x178 [] sparse_init+0xe4/0x25a [] paging_init+0x13/0x22 [] setup_arch+0x9c6/0xa9b [] ? printk+0x3c/0x3e [] start_kernel+0xe5/0x468 [] x86_64_start_reservations+0xba/0xc1 [] ? xen_setup_runstate_info+0x2c/0x36 [] xen_start_kernel+0x565/0x56c " so we just disable NUMA scanning by setting numa_off=1. CC: stable@vger.kernel.org Reported-and-Tested-by: Andre Przywara Acked-by: Andre Przywara Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index d11ca11..e2d62d6 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -544,4 +545,7 @@ void __init xen_arch_setup(void) disable_cpufreq(); WARN_ON(set_pm_idle_to_default()); fiddle_vdso(); +#ifdef CONFIG_NUMA + numa_off = 1; +#endif } -- cgit v1.1 From ffb8b233c2261b7978dc3bd759aaa19bd1a7fadf Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 21 Sep 2012 12:30:35 -0400 Subject: xen/x86: retrieve keyboard shift status flags from hypervisor. The xen c/s 25873 allows the hypervisor to retrieve the NUMLOCK flag. With this patch, the Linux kernel can get the state according to the data in the BIOS. 
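For context (not taken from this patch): on native boots the real-mode setup code fills boot_params.kbd_status from the BIOS int 0x16, AH=0x02 shift-flags byte; a dom0 kernel never makes that BIOS call, so the hypercall below recovers the same byte from the hypervisor. A rough sketch of the conventional layout of that byte, shown only for reference:

/* Conventional BIOS shift-flags byte mirrored by boot_params.kbd_status. */
#define KBD_RSHIFT	(1 << 0)
#define KBD_LSHIFT	(1 << 1)
#define KBD_CTRL	(1 << 2)
#define KBD_ALT		(1 << 3)
#define KBD_SCROLLLOCK	(1 << 4)
#define KBD_NUMLOCK	(1 << 5)	/* the flag this patch is after */
#define KBD_CAPSLOCK	(1 << 6)
#define KBD_INSERT	(1 << 7)

static inline int kbd_numlock_was_on(unsigned char kbd_status)
{
	return kbd_status & KBD_NUMLOCK;
}

The macro names above are invented for this sketch; only the bit positions follow the usual BIOS data area convention.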
Acked-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 47b3acd..6789715 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1440,11 +1440,19 @@ asmlinkage void __init xen_start_kernel(void) const struct dom0_vga_console_info *info = (void *)((char *)xen_start_info + xen_start_info->console.dom0.info_off); + struct xen_platform_op op = { + .cmd = XENPF_firmware_info, + .interface_version = XENPF_INTERFACE_VERSION, + .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, + }; xen_init_vga(info, xen_start_info->console.dom0.info_size); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; + if (HYPERVISOR_dom0_op(&op) == 0) + boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; + xen_init_apic(); /* Make sure ACS will be enabled */ -- cgit v1.1 From aa387d630cfed1a694a9c8c61fba3877ba8d4f07 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 9 Feb 2012 11:33:51 +0800 Subject: xen/vga: add the xen EFI video mode support In order to add xen EFI frambebuffer video support, it is required to add xen-efi's new video type (XEN_VGATYPE_EFI_LFB) case and handle it in the function xen_init_vga and set the video type to VIDEO_TYPE_EFI to enable efi video mode. The original patch from which this was broken out from: http://marc.info/?i=4E099AA6020000780004A4C6@nat28.tlf.novell.com Signed-off-by: Jan Beulich Signed-off-by: Tang Liang [v2: The original author is Jan Beulich and Liang Tang ported it to upstream] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/vga.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c index 1cd7f4d..6722e37 100644 --- a/arch/x86/xen/vga.c +++ b/arch/x86/xen/vga.c @@ -35,6 +35,7 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) info->u.text_mode_3.font_height; break; + case XEN_VGATYPE_EFI_LFB: case XEN_VGATYPE_VESA_LFB: if (size < offsetof(struct dom0_vga_console_info, u.vesa_lfb.gbl_caps)) @@ -54,6 +55,12 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) screen_info->blue_pos = info->u.vesa_lfb.blue_pos; screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size; screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos; + + if (info->video_type == XEN_VGATYPE_EFI_LFB) { + screen_info->orig_video_isVGA = VIDEO_TYPE_EFI; + break; + } + if (size >= offsetof(struct dom0_vga_console_info, u.vesa_lfb.gbl_caps) + sizeof(info->u.vesa_lfb.gbl_caps)) -- cgit v1.1 From 82c93fcc2e1737fede2752520f1bf8f4de6304d8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 24 Sep 2012 07:34:51 +0000 Subject: x86: bpf_jit_comp: add XOR instruction for BPF JIT This patch is a follow-up for patch "filter: add XOR instruction for use with X/K" that implements BPF x86 JIT parts for the BPF XOR operation. Signed-off-by: Daniel Borkmann Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- arch/x86/net/bpf_jit_comp.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 106c578..520d2bd 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -335,9 +335,18 @@ void bpf_jit_compile(struct sk_filter *fp) EMIT1_off32(0x0d, K); /* or imm32,%eax */ break; case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ + case BPF_S_ALU_XOR_X: seen |= SEEN_XREG; EMIT2(0x31, 0xd8); /* xor %ebx,%eax */ break; + case BPF_S_ALU_XOR_K: /* A ^= K; */ + if (K == 0) + break; + if (is_imm8(K)) + EMIT3(0x83, 0xf0, K); /* xor imm8,%eax */ + else + EMIT1_off32(0x35, K); /* xor imm32,%eax */ + break; case BPF_S_ALU_LSH_X: /* A <<= X; */ seen |= SEEN_XREG; EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ -- cgit v1.1 From fdf9c356502ae02238efcdf90cefd7b473a63fd4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 9 Sep 2012 14:56:31 +0200 Subject: cputime: Make finegrained irqtime accounting generally available There is no known reason for this option to be unavailable on other archs than x86. They just need to call enable_sched_clock_irqtime() if they have a sufficiently finegrained clock to make it working. Move it to the general option and let the user choose between it and pure tick based or virtual cputime accounting. Note that virtual cputime accounting already performs a finegrained irqtime accounting. CONFIG_IRQ_TIME_ACCOUNTING is a kind of middle ground between tick and virtual based accounting. So CONFIG_IRQ_TIME_ACCOUNTING and CONFIG_VIRT_CPU_ACCOUNTING are mutually exclusive choices. Signed-off-by: Frederic Weisbecker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra --- arch/x86/Kconfig | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ec3a1a..b86833a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -97,6 +97,7 @@ config X86 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER + select HAVE_IRQ_TIME_ACCOUNTING config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS || UPROBES) @@ -796,17 +797,6 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. -config IRQ_TIME_ACCOUNTING - bool "Fine granularity task level IRQ time accounting" - default n - ---help--- - Select this option to enable fine granularity task irq time - accounting. This is done by reading a timestamp on each - transitions between softirq and hardirq state, so there can be a - small performance impact. - - If in doubt, say N here. - source "kernel/Kconfig.preempt" config X86_UP_APIC -- cgit v1.1 From e139e95590dfebab81841bf7a3ac46500f51a47c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 25 Sep 2012 15:42:18 -0700 Subject: x86, smap: Do not abuse the [f][x]rstor_checking() functions for user space With SMAP, the [f][x]rstor_checking() functions are no longer usable for user-space pointers by applying a simple __force cast. Instead, create new [f][x]rstor_user() functions which do the proper SMAP magic. Signed-off-by: H. 
Peter Anvin Cc: Suresh Siddha Link: http://lkml.kernel.org/r/1343171129-2747-3-git-send-email-suresh.b.siddha@intel.com --- arch/x86/include/asm/fpu-internal.h | 17 +++++++++++++++++ arch/x86/kernel/xsave.c | 6 +++--- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 409b9cc..831dbb9 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -181,11 +181,28 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) "m" (*fx)); } +static inline int fxrstor_user(struct i387_fxsave_struct __user *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + + /* See comment in fpu_fxsave() below. */ + return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), + "m" (*fx)); +} + static inline int frstor_checking(struct i387_fsave_struct *fx) { return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } +static inline int frstor_user(struct i387_fsave_struct __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + static inline void fpu_fxsave(struct fpu *fpu) { if (config_enabled(CONFIG_X86_32)) diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 4e89b3d..ada87a3 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -315,7 +315,7 @@ static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) if ((unsigned long)buf % 64 || fx_only) { u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE; xrstor_state(init_xstate_buf, init_bv); - return fxrstor_checking((__force void *) buf); + return fxrstor_user(buf); } else { u64 init_bv = pcntxt_mask & ~xbv; if (unlikely(init_bv)) @@ -323,9 +323,9 @@ static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) return xrestore_user(buf, xbv); } } else if (use_fxsr()) { - return fxrstor_checking((__force void *) buf); + return fxrstor_user(buf); } else - return frstor_checking((__force void *) buf); + return frstor_user(buf); } int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) -- cgit v1.1 From dec08a837fda146fee498ffc5ecd0d2eeeacd025 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 19 Sep 2012 13:42:23 +0800 Subject: x86: Remove the useless branch in c_start() Since 'cpu == -1' in cpumask_next() is legal, no need to handle '*pos == 0' specially. About the comments: /* just in case, cpu 0 is not the first */ A test with a cpumask in which cpu 0 is not the first has been done, and it works well. This patch will remove that useless branch to clean the code. 
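To make the equivalence concrete, here is a small userspace model (a stand-in for find_first_bit()/find_next_bit(), not the kernel implementation) showing that an unconditional cpumask_next(*pos - 1, ...) with *pos == 0 picks the same CPU as the removed cpumask_first() branch, even when CPU 0 is offline:

#include <stdio.h>

/* Toy models of cpumask_first()/cpumask_next() over one unsigned long. */
static int first_set_bit(unsigned long mask)
{
	for (int bit = 0; bit < (int)(8 * sizeof(mask)); bit++)
		if (mask & (1UL << bit))
			return bit;
	return -1;
}

static int next_set_bit(unsigned long mask, int prev)
{
	for (int bit = prev + 1; bit < (int)(8 * sizeof(mask)); bit++)
		if (mask & (1UL << bit))
			return bit;
	return -1;
}

int main(void)
{
	unsigned long online = 0xf0;	/* CPUs 4-7 online, CPU 0 is not the first */
	long pos = 0;

	int old_way = first_set_bit(online);		/* old *pos == 0 special case */
	int new_way = next_set_bit(online, pos - 1);	/* unconditional new code     */

	printf("old=%d new=%d\n", old_way, new_way);	/* both print 4 */
	return 0;
}

In the kernel, cpumask_next(-1, mask) starts its scan at bit 0, which is exactly what cpumask_first() does, so dropping the branch cannot change which CPU c_start() returns.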
Signed-off-by: Michael Wang Cc: kjwinchester@gmail.com Cc: borislav.petkov@amd.com Cc: ak@linux.intel.com Link: http://lkml.kernel.org/r/1348033343-23658-1-git-send-email-wangyun@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/proc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 8022c66..fbd8955 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -140,10 +140,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) static void *c_start(struct seq_file *m, loff_t *pos) { - if (*pos == 0) /* just in case, cpu 0 is not the first */ - *pos = cpumask_first(cpu_online_mask); - else - *pos = cpumask_next(*pos - 1, cpu_online_mask); + *pos = cpumask_next(*pos - 1, cpu_online_mask); if ((*pos) < nr_cpu_ids) return &cpu_data(*pos); return NULL; -- cgit v1.1 From 57c078ce13983d27c83f962b3e9722887209005c Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Wed, 26 Sep 2012 09:54:17 +0900 Subject: x86/api: Rename mp_register_lapic in a comment Commit 31d2092eb0c23636b73d2c24c0c11b66470cef58 ("x86: move mp_register_lapic_address to boot.c") renamed mp_register_lapic to acpi_register_lapic. But mp_register_lapic remains in a comment. So the patch rename it. Signed-off-by: Yasuaki Ishimatsu Cc: Len Brown Link: http://lkml.kernel.org/r/50625239.3050403@jp.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b2297e5..e651f7a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -656,7 +656,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) acpi_register_lapic(physid, ACPI_MADT_ENABLED); /* - * If mp_register_lapic successfully generates a new logical cpu + * If acpi_register_lapic successfully generates a new logical cpu * number, then the following will get us exactly what was mapped */ cpumask_andnot(new_map, cpu_present_mask, tmp_map); -- cgit v1.1 From 1b2b23d8573076a587ed2081e0d2b69691079e0e Mon Sep 17 00:00:00 2001 From: Tao Guo Date: Wed, 26 Sep 2012 04:28:22 -0400 Subject: x86_64: Work around old GAS bug GAS in binutils(2.16.91) could not parse parentheses within macro parameters unless fully parenthesized, and this is a workaround to make old gas work without generating below errors: arch/x86/kernel/entry_64.S: Assembler messages: arch/x86/kernel/entry_64.S:387: Error: too many positional arguments arch/x86/kernel/entry_64.S:389: Error: too many positional arguments [...] Signed-off-by: Tao Guo Reluctantly-Acked-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1348648102-12653-1-git-send-email-glorioustao@gmail.com [ Jan argues that these old GAS versions are fragile - which is so, but lets give them a chance. 
] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/calling.h | 48 ++++++++++++++++++++---------------------- arch/x86/kernel/entry_64.S | 20 +++++++++--------- 2 files changed, 33 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index a9e3a74..7f8422a 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -49,38 +49,36 @@ For 32-bit we have the following conventions - kernel is built with #include "dwarf2.h" /* - * 64-bit system call stack frame layout defines and helpers, for - * assembly code (note that the seemingly unnecessary parentheses - * are to prevent cpp from inserting spaces in expressions that get - * passed to macros): + * 64-bit system call stack frame layout defines and helpers, + * for assembly code: */ -#define R15 (0) -#define R14 (8) -#define R13 (16) -#define R12 (24) -#define RBP (32) -#define RBX (40) +#define R15 0 +#define R14 8 +#define R13 16 +#define R12 24 +#define RBP 32 +#define RBX 40 /* arguments: interrupts/non tracing syscalls only save up to here: */ -#define R11 (48) -#define R10 (56) -#define R9 (64) -#define R8 (72) -#define RAX (80) -#define RCX (88) -#define RDX (96) -#define RSI (104) -#define RDI (112) -#define ORIG_RAX (120) /* + error_code */ +#define R11 48 +#define R10 56 +#define R9 64 +#define R8 72 +#define RAX 80 +#define RCX 88 +#define RDX 96 +#define RSI 104 +#define RDI 112 +#define ORIG_RAX 120 /* + error_code */ /* end of arguments */ /* cpu exception frame or undefined in case of fast syscall: */ -#define RIP (128) -#define CS (136) -#define EFLAGS (144) -#define RSP (152) -#define SS (160) +#define RIP 128 +#define CS 136 +#define EFLAGS 144 +#define RSP 152 +#define SS 160 #define ARGOFFSET R11 #define SWFRAME ORIG_RAX diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b1dac12..2c67061 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -342,15 +342,15 @@ ENDPROC(native_usergs_sysret64) .macro SAVE_ARGS_IRQ cld /* start from rbp in pt_regs and jump over */ - movq_cfi rdi, RDI-RBP - movq_cfi rsi, RSI-RBP - movq_cfi rdx, RDX-RBP - movq_cfi rcx, RCX-RBP - movq_cfi rax, RAX-RBP - movq_cfi r8, R8-RBP - movq_cfi r9, R9-RBP - movq_cfi r10, R10-RBP - movq_cfi r11, R11-RBP + movq_cfi rdi, (RDI-RBP) + movq_cfi rsi, (RSI-RBP) + movq_cfi rdx, (RDX-RBP) + movq_cfi rcx, (RCX-RBP) + movq_cfi rax, (RAX-RBP) + movq_cfi r8, (R8-RBP) + movq_cfi r9, (R9-RBP) + movq_cfi r10, (R10-RBP) + movq_cfi r11, (R11-RBP) /* Save rbp so that we can unwind from get_irq_regs() */ movq_cfi rbp, 0 @@ -384,7 +384,7 @@ ENDPROC(native_usergs_sysret64) .endm ENTRY(save_rest) - PARTIAL_FRAME 1 REST_SKIP+8 + PARTIAL_FRAME 1 (REST_SKIP+8) movq 5*8+16(%rsp), %r11 /* save return address */ movq_cfi rbx, RBX+16 movq_cfi rbp, RBP+16 -- cgit v1.1 From c416ddf5b909736f5b57d348f5de159693e699ad Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 25 Sep 2012 14:51:19 +0200 Subject: x86: Unspaghettize do_trap() Cleanup the label maze in this function. Having a seperate function to first handle the traps that don't generate a signal makes it easier to convert into more readable conditional paths. Signed-off-by: Frederic Weisbecker Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1348577479-2564-1-git-send-email-fweisbec@gmail.com [ Fixed 32-bit build failure. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 60 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b481341..6ff7715 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -107,30 +107,45 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } -static void __kprobes -do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, - long error_code, siginfo_t *info) +static int __kprobes +do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, + struct pt_regs *regs, long error_code) { - struct task_struct *tsk = current; - #ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) { /* - * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. */ - if (trapnr < X86_TRAP_UD) - goto vm86_trap; - goto trap_signal; + if (trapnr < X86_TRAP_UD) { + if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, trapnr)) + return 0; + } + return -1; } #endif + if (!user_mode(regs)) { + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = trapnr; + die(str, regs, error_code); + } + return 0; + } - if (!user_mode(regs)) - goto kernel_trap; + return -1; +} -#ifdef CONFIG_X86_32 -trap_signal: -#endif +static void __kprobes +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, + long error_code, siginfo_t *info) +{ + struct task_struct *tsk = current; + + + if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) + return; /* * We want error_code and trap_nr set for userspace faults and * kernelspace faults which result in die(), but not @@ -158,23 +173,6 @@ trap_signal: force_sig_info(signr, info, tsk); else force_sig(signr, tsk); - return; - -kernel_trap: - if (!fixup_exception(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = trapnr; - die(str, regs, error_code); - } - return; - -#ifdef CONFIG_X86_32 -vm86_trap: - if (handle_vm86_trap((struct kernel_vm86_regs *) regs, - error_code, trapnr)) - goto trap_signal; - return; -#endif } #define DO_ERROR(trapnr, signr, str, name) \ -- cgit v1.1 From bf5a3c13b939813d28ce26c01425054c740d6731 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:34 +0200 Subject: x86: Syscall hooks for userspace RCU extended QS Add syscall slow path hooks to notify syscall entry and exit on CPUs that want to support userspace RCU extended quiescent state. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- arch/x86/include/asm/thread_info.h | 10 +++++++--- arch/x86/kernel/ptrace.c | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 89f794f..c535d847 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -89,6 +89,7 @@ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_FORK 18 /* ret_from_fork */ +#define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -114,6 +115,7 @@ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) +#define _TIF_NOHZ (1 << TIF_NOHZ) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) @@ -126,12 +128,13 @@ struct thread_info { /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_NOHZ) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_TRACEPOINT) + _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ @@ -141,7 +144,8 @@ struct thread_info { /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ - ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT) + ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_NOHZ) /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index c4c6a5c..9f94f8e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -1463,6 +1464,8 @@ long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; + rcu_user_exit(); + /* * If we stepped into a sysenter/syscall insn, it trapped in * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. @@ -1526,4 +1529,6 @@ void syscall_trace_leave(struct pt_regs *regs) !test_thread_flag(TIF_SYSCALL_EMU); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, step); + + rcu_user_enter(); } -- cgit v1.1 From ef3f628872c838933a279d0d7e63e707783c9710 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Sep 2012 21:05:52 +0200 Subject: x86: Unspaghettize do_general_protection() There is some unnatural label based layout in this function. Convert the unnecessary goto to readable conditional blocks. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: H. 
Peter Anvin Cc: Ingo Molnar --- arch/x86/kernel/traps.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b481341..74850e5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -258,13 +258,25 @@ do_general_protection(struct pt_regs *regs, long error_code) conditional_sti(regs); #ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto gp_in_vm86; + if (regs->flags & X86_VM_MASK) { + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + return; + } #endif tsk = current; - if (!user_mode(regs)) - goto gp_in_kernel; + if (!user_mode(regs)) { + if (fixup_exception(regs)) + return; + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_GP; + if (!notify_die(DIE_GPF, "general protection fault", regs, error_code, + X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) + die("general protection fault", regs, error_code); + return; + } tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; @@ -280,24 +292,6 @@ do_general_protection(struct pt_regs *regs, long error_code) force_sig(SIGSEGV, tsk); return; - -#ifdef CONFIG_X86_32 -gp_in_vm86: - local_irq_enable(); - handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - return; -#endif - -gp_in_kernel: - if (fixup_exception(regs)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - if (notify_die(DIE_GPF, "general protection fault", regs, error_code, - X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); } /* May run on IST stack. */ -- cgit v1.1 From 6ba3c97a38803883c2eee489505796cb0a727122 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:35 +0200 Subject: x86: Exception hooks for userspace RCU extended QS Add necessary hooks to x86 exception for userspace RCU extended quiescent state support. This includes traps, page fault, debug exceptions, etc... Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. 
McKenney --- arch/x86/include/asm/rcu.h | 20 ++++++++++++ arch/x86/kernel/traps.c | 81 +++++++++++++++++++++++++++++++--------------- arch/x86/mm/fault.c | 13 ++++++-- 3 files changed, 86 insertions(+), 28 deletions(-) create mode 100644 arch/x86/include/asm/rcu.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/rcu.h new file mode 100644 index 0000000..439815b --- /dev/null +++ b/arch/x86/include/asm/rcu.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_RCU_H +#define _ASM_X86_RCU_H + +#include +#include + +static inline void exception_enter(struct pt_regs *regs) +{ + rcu_user_exit(); +} + +static inline void exception_exit(struct pt_regs *regs) +{ +#ifdef CONFIG_RCU_USER_QS + if (user_mode(regs)) + rcu_user_enter(); +#endif +} + +#endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 74850e5..3789675 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -55,6 +55,7 @@ #include #include #include +#include #include @@ -180,11 +181,15 @@ vm86_trap: #define DO_ERROR(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + exception_enter(regs); \ + if (notify_die(DIE_TRAP, str, regs, error_code, \ + trapnr, signr) == NOTIFY_STOP) { \ + exception_exit(regs); \ return; \ + } \ conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, NULL); \ + exception_exit(regs); \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -195,11 +200,15 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + exception_enter(regs); \ + if (notify_die(DIE_TRAP, str, regs, error_code, \ + trapnr, signr) == NOTIFY_STOP) { \ + exception_exit(regs); \ return; \ + } \ conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, &info); \ + exception_exit(regs); \ } DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, @@ -222,12 +231,14 @@ DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, /* Runs on IST stack */ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) { + exception_enter(regs); if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - X86_TRAP_SS, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); + X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { + preempt_conditional_sti(regs); + do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); + } + exception_exit(regs); } dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) @@ -235,6 +246,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) static const char str[] = "double fault"; struct task_struct *tsk = current; + exception_enter(regs); /* Return not checked because double check cannot be ignored */ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -255,27 +267,28 @@ do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; + exception_enter(regs); conditional_sti(regs); #ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - 
return; + goto exit; } #endif tsk = current; if (!user_mode(regs)) { if (fixup_exception(regs)) - return; + goto exit; tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; - if (!notify_die(DIE_GPF, "general protection fault", regs, error_code, - X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, + X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) die("general protection fault", regs, error_code); - return; + goto exit; } tsk->thread.error_code = error_code; @@ -291,7 +304,8 @@ do_general_protection(struct pt_regs *regs, long error_code) } force_sig(SIGSEGV, tsk); - return; +exit: + exception_exit(regs); } /* May run on IST stack. */ @@ -306,15 +320,16 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co ftrace_int3_handler(regs)) return; #endif + exception_enter(regs); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) - return; + goto exit; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) - return; + goto exit; /* * Let others (NMI) know that the debug stack is in use @@ -325,6 +340,8 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); debug_stack_usage_dec(); +exit: + exception_exit(regs); } #ifdef CONFIG_X86_64 @@ -385,6 +402,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) unsigned long dr6; int si_code; + exception_enter(regs); + get_debugreg(dr6, 6); /* Filter out all the reserved bits which are preset to 1 */ @@ -400,7 +419,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) /* Catch kmemcheck conditions first of all! 
*/ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) - return; + goto exit; /* DR6 may or may not be cleared by the CPU */ set_debugreg(0, 6); @@ -415,7 +434,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, SIGTRAP) == NOTIFY_STOP) - return; + goto exit; /* * Let others (NMI) know that the debug stack is in use @@ -431,7 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) X86_TRAP_DB); preempt_conditional_cli(regs); debug_stack_usage_dec(); - return; + goto exit; } /* @@ -452,7 +471,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) preempt_conditional_cli(regs); debug_stack_usage_dec(); - return; +exit: + exception_exit(regs); } /* @@ -549,14 +569,17 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) #ifdef CONFIG_X86_32 ignore_fpu_irq = 1; #endif - + exception_enter(regs); math_error(regs, error_code, X86_TRAP_MF); + exception_exit(regs); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { + exception_enter(regs); math_error(regs, error_code, X86_TRAP_XF); + exception_exit(regs); } dotraplinkage void @@ -623,6 +646,7 @@ EXPORT_SYMBOL_GPL(math_state_restore); dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { + exception_enter(regs); #ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; @@ -631,6 +655,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); + exception_exit(regs); return; } #endif @@ -638,12 +663,15 @@ do_device_not_available(struct pt_regs *regs, long error_code) #ifdef CONFIG_X86_32 conditional_sti(regs); #endif + exception_exit(regs); } #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; + + exception_enter(regs); local_irq_enable(); info.si_signo = SIGILL; @@ -651,10 +679,11 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) info.si_code = ILL_BADSTK; info.si_addr = NULL; if (notify_die(DIE_TRAP, "iret exception", regs, error_code, - X86_TRAP_IRET, SIGILL) == NOTIFY_STOP) - return; - do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, - &info); + X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, + &info); + } + exception_exit(regs); } #endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 76dcd9d..7dde46d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,6 +18,7 @@ #include /* pgd_*(), ... */ #include /* kmemcheck_*(), ... */ #include /* VSYSCALL_START */ +#include /* exception_enter(), ... */ /* * Page fault error code bits: @@ -1000,8 +1001,8 @@ static int fault_in_kernel_space(unsigned long address) * and the problem, and then passes it off to one of the appropriate * routines. 
*/ -dotraplinkage void __kprobes -do_page_fault(struct pt_regs *regs, unsigned long error_code) +static void __kprobes +__do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -1209,3 +1210,11 @@ good_area: up_read(&mm->mmap_sem); } + +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + exception_enter(regs); + __do_page_fault(regs, error_code); + exception_exit(regs); +} -- cgit v1.1 From 0430499ce9d78691f3985962021b16bf8f8a8048 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:38 +0200 Subject: x86: Use the new schedule_user API on userspace preemption This way we can exit the RCU extended quiescent state before we schedule a new task from irq/exception exit. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/x86/include/asm/rcu.h | 12 ++++++++++++ arch/x86/kernel/entry_64.S | 9 +++++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/rcu.h index 439815b..d1ac07a 100644 --- a/arch/x86/include/asm/rcu.h +++ b/arch/x86/include/asm/rcu.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_RCU_H #define _ASM_X86_RCU_H +#ifndef __ASSEMBLY__ + #include #include @@ -17,4 +19,14 @@ static inline void exception_exit(struct pt_regs *regs) #endif } +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_RCU_USER_QS +# define SCHEDULE_USER call schedule_user +#else +# define SCHEDULE_USER call schedule +#endif + +#endif /* !__ASSEMBLY__ */ + #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 69babd8..1a8f3cb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,6 +56,7 @@ #include #include #include +#include #include /* Avoid __ASSEMBLER__'ifying just for this. */ @@ -565,7 +566,7 @@ sysret_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi - call schedule + SCHEDULE_USER popq_cfi %rdi jmp sysret_check @@ -678,7 +679,7 @@ int_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi - call schedule + SCHEDULE_USER popq_cfi %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -974,7 +975,7 @@ retint_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi - call schedule + SCHEDULE_USER popq_cfi %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) @@ -1449,7 +1450,7 @@ paranoid_userspace: paranoid_schedule: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) - call schedule + SCHEDULE_USER DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF jmp paranoid_userspace -- cgit v1.1 From edf55fda35c7dc7f2d9241c3abaddaf759b457c6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:39 +0200 Subject: x86: Exit RCU extended QS on notify resume do_notify_resume() may be called on irq or exception exit. But at that time the exception has already called rcu_user_enter() and the irq has already called rcu_irq_exit(). Since it can use RCU read side critical section, we must call rcu_user_exit() before doing anything there. 
Then we must call back rcu_user_enter() after this function because we know we are going to userspace from there. This complete support for userspace RCU extended quiescent state in x86-64. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/x86/Kconfig | 1 + arch/x86/kernel/signal.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 50a1d1f..20c49b8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -97,6 +97,7 @@ config X86 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER + select HAVE_RCU_USER_QS if X86_64 config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS || UPROBES) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b280908..bca0ab9 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -779,6 +779,8 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { + rcu_user_exit(); + #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) @@ -804,6 +806,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ + + rcu_user_enter(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -- cgit v1.1 From 5a5a51db78ef24aa61a4cb2ae36f07f6fa37356d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 24 Sep 2012 16:05:48 -0700 Subject: x86-32: Start out eflags and cr4 clean %cr4 is supposed to reflect a set of features into which the operating system is opting in. If the BIOS or bootloader leaks bits here, this is not desirable. Consider a bootloader passing in %cr4.pae set to a legacy paging kernel, for example -- it will not have any immediate effect, but the kernel would crash when turning paging on. A similar argument applies to %eflags, and since we have to look for %eflags.id being settable we can use a sequence which clears %eflags as a side effect. Note that we already do this for x86-64. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1348529239-17943-1-git-send-email-hpa@linux.intel.com --- arch/x86/kernel/head_32.S | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d42ab17..957a47a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -287,27 +287,28 @@ ENTRY(startup_32_smp) leal -__PAGE_OFFSET(%ecx),%esp default_entry: - /* * New page tables may be in 4Mbyte page mode and may * be using the global pages. * * NOTE! If we are on a 486 we may have no cr4 at all! - * So we do not try to touch it unless we really have - * some bits in it to set. This won't work if the BSP - * implements cr4 but this AP does not -- very unlikely - * but be warned! The same applies to the pse feature - * if not equally supported. --macro - * - * NOTE! We have to correct for the fact that we're - * not yet offset PAGE_OFFSET.. 
+ * Specifically, cr4 exists if and only if CPUID exists, + * which in turn exists if and only if EFLAGS.ID exists. */ -#define cr4_bits pa(mmu_cr4_features) - movl cr4_bits,%edx - andl %edx,%edx - jz 6f - movl %cr4,%eax # Turn on paging options (PSE,PAE,..) - orl %edx,%eax + movl $X86_EFLAGS_ID,%ecx + pushl %ecx + popfl + pushfl + popl %eax + pushl $0 + popfl + pushfl + popl %edx + xorl %edx,%eax + testl %ecx,%eax + jz 6f # No ID flag = no CPUID = no CR4 + + movl pa(mmu_cr4_features),%eax movl %eax,%cr4 testb $X86_CR4_PAE, %al # check if PAE is enabled -- cgit v1.1 From 73201dbec64aebf6b0dca855b523f437972dc7bb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 26 Sep 2012 15:02:34 -0700 Subject: x86, suspend: On wakeup always initialize cr4 and EFER We already have a flag word to indicate the existence of MISC_ENABLES, so use the same flag word to indicate existence of cr4 and EFER, and always restore them if they exist. That way if something passes a nonzero value when the value *should* be zero, we will still initialize it. Signed-off-by: H. Peter Anvin Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/1348529239-17943-1-git-send-email-hpa@linux.intel.com --- arch/x86/kernel/acpi/sleep.c | 15 ++++++++++----- arch/x86/realmode/rm/wakeup.h | 2 ++ arch/x86/realmode/rm/wakeup_asm.S | 29 +++++++++++++++++++---------- 3 files changed, 31 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 1b8e5a0..11676cf 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -43,17 +43,22 @@ int acpi_suspend_lowlevel(void) header->video_mode = saved_video_mode; + header->pmode_behavior = 0; + #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); - if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low, - &header->pmode_efer_high)) - header->pmode_efer_low = header->pmode_efer_high = 0; + if (!rdmsr_safe(MSR_EFER, + &header->pmode_efer_low, + &header->pmode_efer_high)) + header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER); #endif /* !CONFIG_64BIT */ header->pmode_cr0 = read_cr0(); - header->pmode_cr4 = read_cr4_safe(); - header->pmode_behavior = 0; + if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { + header->pmode_cr4 = read_cr4(); + header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4); + } if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, &header->pmode_misc_en_low, &header->pmode_misc_en_high)) diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h index 9317e00..7dd86a4 100644 --- a/arch/x86/realmode/rm/wakeup.h +++ b/arch/x86/realmode/rm/wakeup.h @@ -36,5 +36,7 @@ extern struct wakeup_header wakeup_header; /* Wakeup behavior bits */ #define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 +#define WAKEUP_BEHAVIOR_RESTORE_CR4 1 +#define WAKEUP_BEHAVIOR_RESTORE_EFER 2 #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S index 8905166..e56479e 100644 --- a/arch/x86/realmode/rm/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup_asm.S @@ -74,9 +74,18 @@ ENTRY(wakeup_start) lidtl wakeup_idt - /* Clear the EFLAGS */ - pushl $0 + /* Clear the EFLAGS but remember if we have EFLAGS.ID */ + movl $X86_EFLAGS_ID, %ecx + pushl %ecx popfl + pushfl + popl %edi + pushl $0 + popfl + pushfl + popl %edx + xorl %edx, %edi + andl %ecx, %edi /* %edi is zero iff CPUID & %cr4 are missing */ /* Check header signature... 
*/ movl signature, %eax @@ -93,8 +102,8 @@ ENTRY(wakeup_start) /* Restore MISC_ENABLE before entering protected mode, in case BIOS decided to clear XD_DISABLE during S3. */ - movl pmode_behavior, %eax - btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax + movl pmode_behavior, %edi + btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %edi jnc 1f movl pmode_misc_en, %eax @@ -110,15 +119,15 @@ ENTRY(wakeup_start) movl pmode_cr3, %eax movl %eax, %cr3 - movl pmode_cr4, %ecx - jecxz 1f - movl %ecx, %cr4 + btl $WAKEUP_BEHAVIOR_RESTORE_CR4, %edi + jz 1f + movl pmode_cr4, %eax + movl %eax, %cr4 1: + btl $WAKEUP_BEHAVIOR_RESTORE_EFER, %edi + jz 1f movl pmode_efer, %eax movl pmode_efer + 4, %edx - movl %eax, %ecx - orl %edx, %ecx - jz 1f movl $MSR_EFER, %ecx wrmsr 1: -- cgit v1.1 From 402537fd9f5499d1a50a6f20cdd9031d24ee2e47 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 27 Sep 2012 09:33:26 +0800 Subject: perf/x86: Fix typo in uncore_pmu_to_box The variable box should not be declared as static. This could probably cause crashes with sufficiently parallel uncore PMU use. Signed-off-by: Yan, Zheng Cc: eranian@google.com Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/1348709606-2759-1-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 38e4894..c7c55e7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -1950,7 +1950,7 @@ struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cp static struct intel_uncore_box * uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - static struct intel_uncore_box *box; + struct intel_uncore_box *box; box = *per_cpu_ptr(pmu->box, cpu); if (box) -- cgit v1.1 From 200429cc63399e99dd2abcdca5088559a911ef2b Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 19 Sep 2012 14:24:57 +0300 Subject: crypto: cast5/avx - fix storing of new IV in CBC encryption cast5/avx incorrectly XORs new IV over old IV at end of CBC encryption function when it should store. This causes CBC encryption to give incorrect output on multi-page encryption requests. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/cast5_avx_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 445aab0..e0ea14f 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -165,7 +165,7 @@ static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, nbytes -= bsize; } while (nbytes >= bsize); - *(u64 *)walk->iv ^= *iv; + *(u64 *)walk->iv = *iv; return nbytes; } -- cgit v1.1 From fe04ddf7c2910362f3817c8156e41cbd6c0ee35d Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Tue, 25 Sep 2012 16:03:03 +0200 Subject: kbuild: Do not package /boot and /lib in make tar-pkg There were reports of users destroying their Fedora installs by a kernel tarball that replaces the /lib -> /usr/lib symlink. Let's remove the toplevel directories from the tarball to prevent this from happening. 
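A quick stand-alone illustration of the CBC chaining rule behind the cast5/avx fix above (a toy 8-byte "cipher" -- XOR with a fixed key -- not CAST5 and not the kernel crypto API): when one logical encryption is split across several requests, the next request's IV has to be exactly the stored last ciphertext block, which is why the final statement of __cbc_encrypt() must store rather than XOR.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static const uint64_t key = 0x0123456789abcdefULL;
static uint64_t E(uint64_t x) { return x ^ key; }	/* stand-in block cipher */

/* CBC-encrypt n blocks in place and write the last ciphertext block back
 * into *iv -- the equivalent of what the fixed code does with walk->iv.
 * The buggy version effectively did "*iv ^= last" instead of storing it. */
static void cbc_encrypt(uint64_t *b, int n, uint64_t *iv)
{
	uint64_t prev = *iv;
	for (int i = 0; i < n; i++) {
		b[i] = E(b[i] ^ prev);
		prev = b[i];
	}
	*iv = prev;
}

int main(void)
{
	uint64_t pt[4] = { 1, 2, 3, 4 }, once[4], split[4];
	uint64_t iv0 = 0x1111111111111111ULL, iv;

	memcpy(once, pt, sizeof(pt));
	iv = iv0;
	cbc_encrypt(once, 4, &iv);		/* one request */

	memcpy(split, pt, sizeof(pt));
	iv = iv0;
	cbc_encrypt(split, 2, &iv);		/* first part of a split request */
	cbc_encrypt(split + 2, 2, &iv);		/* second part reuses the stored IV */

	printf("%s\n", memcmp(once, split, sizeof(once)) ? "mismatch" : "match");
	return 0;
}

With the stored IV this prints "match"; changing the last line of cbc_encrypt() to "*iv ^= prev" reproduces the multi-part mismatch described in the commit message.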
Reported-by: Andi Kleen Suggested-by: Ben Hutchings Cc: Signed-off-by: Michal Marek --- arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c098ca4..b0c5276 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -138,7 +138,7 @@ KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) -archscripts: scripts_basic +archscripts: $(Q)$(MAKE) $(build)=arch/x86/tools relocs ### -- cgit v1.1 From f9a38eace4498a5e9f6d2cdfc879d5444edc3a5f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 6 Sep 2012 13:39:47 -0400 Subject: um: let signal_delivered() do SIGTRAP on singlestepping into handler ... rather than duplicating that in sigframe setup code (and doing that inconsistently, at that) Signed-off-by: Al Viro --- arch/x86/um/signal.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index a508cea1..ba7363e 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -416,9 +416,6 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, PT_REGS_AX(regs) = (unsigned long) sig; PT_REGS_DX(regs) = (unsigned long) 0; PT_REGS_CX(regs) = (unsigned long) 0; - - if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) - ptrace_notify(SIGTRAP); return 0; } @@ -466,9 +463,6 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, PT_REGS_AX(regs) = (unsigned long) sig; PT_REGS_DX(regs) = (unsigned long) &frame->info; PT_REGS_CX(regs) = (unsigned long) &frame->uc; - - if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) - ptrace_notify(SIGTRAP); return 0; } -- cgit v1.1 From d2ce4e92fa4f79a5fdb4cc912b411280afe21697 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 20 Sep 2012 09:28:25 -0400 Subject: um: kill thread->forking we only use that to tell copy_thread() done by syscall from that done by kernel_thread(). However, it's easier to do simply by checking PF_KTHREAD in thread flags. Merge sys_clone() guts for 32bit and 64bit, while we are at it... 
Signed-off-by: Al Viro --- arch/x86/um/shared/sysdep/syscalls.h | 2 ++ arch/x86/um/sys_call_table_32.c | 2 +- arch/x86/um/syscalls_32.c | 27 +++++++-------------------- arch/x86/um/syscalls_64.c | 23 +++-------------------- 4 files changed, 13 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h index bd9a89b..ca255a8 100644 --- a/arch/x86/um/shared/sysdep/syscalls.h +++ b/arch/x86/um/shared/sysdep/syscalls.h @@ -1,3 +1,5 @@ +extern long sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid); #ifdef __i386__ #include "syscalls_32.h" #else diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 68d1dc9..b5408ce 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -28,7 +28,7 @@ #define ptregs_execve sys_execve #define ptregs_iopl sys_iopl #define ptregs_vm86old sys_vm86old -#define ptregs_clone sys_clone +#define ptregs_clone i386_clone #define ptregs_vm86 sys_vm86 #define ptregs_sigaltstack sys_sigaltstack #define ptregs_vfork sys_vfork diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c index b853e86..db444c7 100644 --- a/arch/x86/um/syscalls_32.c +++ b/arch/x86/um/syscalls_32.c @@ -3,37 +3,24 @@ * Licensed under the GPL */ -#include "linux/sched.h" -#include "linux/shm.h" -#include "linux/ipc.h" -#include "linux/syscalls.h" -#include "asm/mman.h" -#include "asm/uaccess.h" -#include "asm/unistd.h" +#include +#include /* * The prototype on i386 is: * - * int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls, int * child_tidptr) + * int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls * * and the "newtls" arg. on i386 is read by copy_thread directly from the * register saved on the stack. 
*/ -long sys_clone(unsigned long clone_flags, unsigned long newsp, - int __user *parent_tid, void *newtls, int __user *child_tid) +long i386_clone(unsigned long clone_flags, unsigned long newsp, + int __user *parent_tid, void *newtls, int __user *child_tid) { - long ret; - - if (!newsp) - newsp = UPT_SP(¤t->thread.regs.regs); - - current->thread.forking = 1; - ret = do_fork(clone_flags, newsp, ¤t->thread.regs, 0, parent_tid, - child_tid); - current->thread.forking = 0; - return ret; + return sys_clone(clone_flags, newsp, parent_tid, child_tid); } + long sys_sigaction(int sig, const struct old_sigaction __user *act, struct old_sigaction __user *oact) { diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index f3d82bb..adb08eb 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -5,12 +5,9 @@ * Licensed under the GPL */ -#include "linux/linkage.h" -#include "linux/personality.h" -#include "linux/utsname.h" -#include "asm/prctl.h" /* XXX This should get the constants from libc */ -#include "asm/uaccess.h" -#include "os.h" +#include +#include /* XXX This should get the constants from libc */ +#include long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) { @@ -79,20 +76,6 @@ long sys_arch_prctl(int code, unsigned long addr) return arch_prctl(current, code, (unsigned long __user *) addr); } -long sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid) -{ - long ret; - - if (!newsp) - newsp = UPT_SP(¤t->thread.regs.regs); - current->thread.forking = 1; - ret = do_fork(clone_flags, newsp, ¤t->thread.regs, 0, parent_tid, - child_tid); - current->thread.forking = 0; - return ret; -} - void arch_switch_to(struct task_struct *to) { if ((to->thread.arch.fs == 0) || (to->mm == NULL)) -- cgit v1.1 From b2cc2a074de75671bbed5e2dda67a9252ef353ea Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 26 Sep 2012 18:02:28 -0700 Subject: x86, smep, smap: Make the switching functions one-way There is no fundamental reason why we should switch SMEP and SMAP on during early cpu initialization just to switch them off again. Now with %eflags and %cr4 forced to be initialized to a clean state, we only need the one-way enable. Also, make the functions inline to make them (somewhat) harder to abuse. This does mean that SMEP and SMAP do not get initialized anywhere near as early. Even using early_param() instead of __setup() doesn't give us control early enough to do this during the early cpu initialization phase. This seems reasonable to me, because SMEP and SMAP should not matter until we have userspace to protect ourselves from, but it does potentially make it possible for a bug involving a "leak of permissions to userspace" to get uncaught. Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/common.c | 49 ++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 44aec5d..fefd9b7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -259,48 +259,36 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif -static int disable_smep __cpuinitdata; static __init int setup_disable_smep(char *arg) { - disable_smep = 1; + setup_clear_cpu_cap(X86_FEATURE_SMEP); return 1; } __setup("nosmep", setup_disable_smep); -static __cpuinit void setup_smep(struct cpuinfo_x86 *c) +static __always_inline void setup_smep(struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_SMEP)) { - if (unlikely(disable_smep)) { - setup_clear_cpu_cap(X86_FEATURE_SMEP); - clear_in_cr4(X86_CR4_SMEP); - } else - set_in_cr4(X86_CR4_SMEP); - } + if (cpu_has(c, X86_FEATURE_SMEP)) + set_in_cr4(X86_CR4_SMEP); } -static int disable_smap __cpuinitdata; static __init int setup_disable_smap(char *arg) { - disable_smap = 1; + setup_clear_cpu_cap(X86_FEATURE_SMAP); return 1; } __setup("nosmap", setup_disable_smap); -static __cpuinit void setup_smap(struct cpuinfo_x86 *c) +static __always_inline void setup_smap(struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_SMAP)) { - if (unlikely(disable_smap)) { - setup_clear_cpu_cap(X86_FEATURE_SMAP); - clear_in_cr4(X86_CR4_SMAP); - } else { - set_in_cr4(X86_CR4_SMAP); - /* - * Don't use clac() here since alternatives - * haven't run yet... - */ - asm volatile(__stringify(__ASM_CLAC) ::: "memory"); - } - } + unsigned long eflags; + + /* This should have been cleared long ago */ + raw_local_save_flags(eflags); + BUG_ON(eflags & X86_EFLAGS_AC); + + if (cpu_has(c, X86_FEATURE_SMAP)) + set_in_cr4(X86_CR4_SMAP); } /* @@ -737,9 +725,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) c->cpu_index = 0; filter_cpuid_features(c, false); - setup_smep(c); - setup_smap(c); - if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); } @@ -824,8 +809,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) c->phys_proc_id = c->initial_apicid; } - setup_smep(c); - get_model_name(c); /* Default name */ detect_nopl(c); @@ -890,6 +873,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); + /* Set up SMEP/SMAP */ + setup_smep(c); + setup_smap(c); + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." -- cgit v1.1 From 450cc201038f31bd496e1b3a44a49790b8827a06 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 27 Sep 2012 10:08:00 -0700 Subject: x86/mce: Provide boot argument to honour bios-set CMCI threshold The ACPI spec doesn't provide for a way for the bios to pass down recommended thresholds to the OS on a _per-bank_ basis. This patch adds a new boot option, which if passed, tells Linux to use CMCI thresholds set by the bios. As fail-safe, we initialize threshold to 1 if some banks have not been initialized by the bios and warn the user. Signed-off-by: Naveen N. 
Rao Signed-off-by: Tony Luck --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 10 ++++++++++ arch/x86/kernel/cpu/mcheck/mce_intel.c | 35 +++++++++++++++++++++++++++++++--- 3 files changed, 43 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ccaf7c5..54d73b1 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -161,6 +161,7 @@ DECLARE_PER_CPU(struct device *, mce_device); #ifdef CONFIG_X86_MCE_INTEL extern int mce_cmci_disabled; extern int mce_ignore_ce; +extern int mce_bios_cmci_threshold; void mce_intel_feature_init(struct cpuinfo_x86 *c); void cmci_clear(void); void cmci_reenable(void); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c311122..29e87d3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -83,6 +83,7 @@ static int mce_dont_log_ce __read_mostly; int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +int mce_bios_cmci_threshold __read_mostly; struct mce_bank *mce_banks __read_mostly; @@ -1946,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = { * check, or 0 to not wait * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. + * mce=bios_cmci_threshold Don't program the CMCI threshold */ static int __init mcheck_enable(char *str) { @@ -1965,6 +1967,8 @@ static int __init mcheck_enable(char *str) mce_ignore_ce = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); + else if (!strcmp(str, "bios_cmci_threshold")) + mce_bios_cmci_threshold = 1; else if (isdigit(str[0])) { get_option(&str, &tolerant); if (*str == ',') { @@ -2205,6 +2209,11 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { &mce_cmci_disabled }; +static struct dev_ext_attribute dev_attr_bios_cmci_threshold = { + __ATTR(bios_cmci_threshold, 0444, device_show_int, NULL), + &mce_bios_cmci_threshold +}; + static struct device_attribute *mce_device_attrs[] = { &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, @@ -2213,6 +2222,7 @@ static struct device_attribute *mce_device_attrs[] = { &dev_attr_dont_log_ce.attr, &dev_attr_ignore_ce.attr, &dev_attr_cmci_disabled.attr, + &dev_attr_bios_cmci_threshold.attr, NULL }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 098386f..5f88abf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -181,10 +181,12 @@ static void cmci_discover(int banks) unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); unsigned long flags; int i; + int bios_wrong_thresh = 0; raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; + int bios_zero_thresh = 0; if (test_bit(i, owned)) continue; @@ -198,8 +200,20 @@ static void cmci_discover(int banks) continue; } - val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; + if (!mce_bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { + /* + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. 
+ */ + bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; + } + + val |= MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); @@ -207,11 +221,26 @@ static void cmci_discover(int banks) if (val & MCI_CTL2_CMCI_EN) { set_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mce_bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + bios_wrong_thresh = 1; } else { WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + if (mce_bios_cmci_threshold && bios_wrong_thresh) { + pr_info_once( + "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); + pr_info_once( + "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); + } } /* @@ -249,7 +278,7 @@ void cmci_clear(void) continue; /* Disable CMCI */ rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); + val &= ~MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } -- cgit v1.1 From bbb35efcda41d589cfff5e2b08c5fb457791117c Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 27 Sep 2012 20:10:57 +0200 Subject: um: Fix IPC on um MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c1d7e01d (ipc: use Kconfig options for __ARCH_WANT_[COMPAT_]IPC_PARSE_VERSION) forgot UML and broke IPC on it. Also UML has to select ARCH_WANT_IPC_PARSE_VERSION usin Kconfig. Reported-and-tested-by: Signed-off-by: Richard Weinberger --- arch/x86/um/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 9926e11..aeaff8b 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -21,6 +21,7 @@ config 64BIT config X86_32 def_bool !64BIT select HAVE_AOUT + select ARCH_WANT_IPC_PARSE_VERSION config X86_64 def_bool 64BIT -- cgit v1.1 From 9429ec96c2718c0d1e3317cf60a87a0405223814 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 16 Aug 2012 20:15:05 +0200 Subject: um: Preinclude include/linux/kern_levels.h The userspace part of UML uses the asm-offsets.h generator mechanism to create definitions for UM_KERN_ that match the in-kernel KERN_ constant definitions. As of commit 04d2c8c83d0e3ac5f78aeede51babb3236200112 ("printk: convert the format for KERN_ to a 2 byte pattern"), KERN_ is no longer expanded to the literal '""', but to '"\001" "LEVEL"', i.e. it contains two parts. However, the combo of DEFINE_STR() in arch/x86/um/shared/sysdep/kernel-offsets.h and sed-y in Kbuild doesn't support string literals consisting of multiple parts. Hence for all UM_KERN_ definitions, only the SOH character is retained in the actual definition, while the remainder ends up in the comment. E.g. in include/generated/asm-offsets.h we get #define UM_KERN_INFO "\001" /* "6" KERN_INFO */ instead of #define UM_KERN_INFO "\001" "6" /* KERN_INFO */ This causes spurious '^A' output in some kernel messages: Calibrating delay loop... 
4640.76 BogoMIPS (lpj=23203840) pid_max: default: 32768 minimum: 301 Mount-cache hash table entries: 256 ^AChecking that host ptys support output SIGIO...Yes ^AChecking that host ptys support SIGIO on close...No, enabling workaround ^AUsing 2.6 host AIO NET: Registered protocol family 16 bio: create slab at 0 Switching to clocksource itimer To fix this: - Move the mapping from UM_KERN_ to KERN_ from arch/um/include/shared/common-offsets.h to arch/um/include/shared/user.h, which is preincluded for all userspace parts, - Preinclude include/linux/kern_levels.h for all userspace parts, to obtain the in-kernel KERN_ constant definitions. This doesn't violate the kernel/userspace separation, as include/linux/kern_levels.h is self-contained and doesn't expose any other kernel internals. - Remove the now unused STR() and DEFINE_STR() macros. Signed-off-by: Geert Uytterhoeven Signed-off-by: Richard Weinberger --- arch/x86/um/shared/sysdep/kernel-offsets.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h index 5868526..46a9df9 100644 --- a/arch/x86/um/shared/sysdep/kernel-offsets.h +++ b/arch/x86/um/shared/sysdep/kernel-offsets.h @@ -7,9 +7,6 @@ #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) -#define STR(x) #x -#define DEFINE_STR(sym, val) asm volatile("\n->" #sym " " STR(val) " " #val: : ) - #define BLANK() asm volatile("\n->" : : ) #define OFFSET(sym, str, mem) \ -- cgit v1.1 From eccbb05a64fef867362ff05b5d266757e3c82b36 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 28 Sep 2012 15:05:15 +0930 Subject: virtio: remove CONFIG_VIRTIO_RING Everyone who selects VIRTIO is also made to select VIRTIO_RING; just make them synonymous, since we removed the indirection layer some time ago. Signed-off-by: Rusty Russell --- arch/x86/lguest/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 6e121a2..7872a33 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -4,7 +4,6 @@ config LGUEST_GUEST depends on X86_32 select VIRTUALIZATION select VIRTIO - select VIRTIO_RING select VIRTIO_CONSOLE help Lguest is a tiny in-kernel hypervisor. Selecting this will -- cgit v1.1 From fd0f5869724ff6195c6e7f12f8287c66a132e0ba Mon Sep 17 00:00:00 2001 From: Tomoki Sekiyama Date: Wed, 26 Sep 2012 11:11:28 +0900 Subject: x86: Distinguish TLB shootdown interrupts from other functions call interrupts As TLB shootdown requests to other CPU cores are now using function call interrupts, TLB shootdowns entry in /proc/interrupts is always shown as 0. This behavior change was introduced by commit 52aec3308db8 ("x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR"). This patch reverts TLB shootdowns entry in /proc/interrupts to count TLB shootdowns separately from the other function call interrupts. Signed-off-by: Tomoki Sekiyama Link: http://lkml.kernel.org/r/20120926021128.22212.20440.stgit@hpxw Acked-by: Alex Shi Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/hardirq.h | 4 ++++ arch/x86/kernel/irq.c | 4 ++-- arch/x86/mm/tlb.c | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index d3895db..81f04ce 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -18,6 +18,10 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; + /* + * irq_tlb_count is double-counted in irq_call_count, so it must be + * subtracted from irq_call_count when displaying irq_call_count + */ unsigned int irq_tlb_count; #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 1f5f1d5..355b13f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -92,7 +92,8 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - + irq_stats(j)->irq_tlb_count); seq_printf(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) @@ -147,7 +148,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; - sum += irq_stats(cpu)->irq_tlb_count; #endif #ifdef CONFIG_X86_THERMAL_VECTOR sum += irq_stats(cpu)->irq_thermal_count; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index a085c56..0777f04 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -98,6 +98,8 @@ static void flush_tlb_func(void *info) { struct flush_tlb_info *f = info; + inc_irq_stat(irq_tlb_count); + if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; -- cgit v1.1 From 785107923a83d8456bbd8564e288a24d84109a46 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Fri, 28 Sep 2012 17:55:44 -0700 Subject: efi: Defer freeing boot services memory until after ACPI init Some new ACPI 5.0 tables reference resources stored in boot services memory, so keep that memory around until we have ACPI and can extract data from it. Signed-off-by: Josh Triplett Link: http://lkml.kernel.org/r/baaa6d44bdc4eb0c58e5d1b4ccd2c729f854ac55.1348876882.git.josh@joshtriplett.org Signed-off-by: H. 
Peter Anvin --- arch/x86/platform/efi/efi.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f55a4ce..b3dbbdb 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -419,10 +419,21 @@ void __init efi_reserve_boot_services(void) } } -static void __init efi_free_boot_services(void) +static void __init efi_unmap_memmap(void) +{ + if (memmap.map) { + early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); + memmap.map = NULL; + } +} + +void __init efi_free_boot_services(void) { void *p; + if (!efi_native) + return; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { efi_memory_desc_t *md = p; unsigned long long start = md->phys_addr; @@ -438,6 +449,8 @@ static void __init efi_free_boot_services(void) free_bootmem_late(start, size); } + + efi_unmap_memmap(); } static int __init efi_systab_init(void *phys) @@ -787,8 +800,10 @@ void __init efi_enter_virtual_mode(void) * non-native EFI */ - if (!efi_native) - goto out; + if (!efi_native) { + efi_unmap_memmap(); + return; + } /* Merge contiguous regions of the same type and attribute */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -878,13 +893,6 @@ void __init efi_enter_virtual_mode(void) } /* - * Thankfully, it does seem that no runtime services other than - * SetVirtualAddressMap() will touch boot services code, so we can - * get rid of it all at this point - */ - efi_free_boot_services(); - - /* * Now that EFI is in virtual mode, update the function * pointers in the runtime service table to the new virtual addresses. * @@ -907,9 +915,6 @@ void __init efi_enter_virtual_mode(void) if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); -out: - early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); - memmap.map = NULL; kfree(new_memmap); } -- cgit v1.1 From 7bc90e01c3f66c137e7e761f574bbf883087d590 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Fri, 28 Sep 2012 17:56:08 -0700 Subject: efi: Add a function to look up existing IO memory mappings The EFI initialization creates virtual mappings for EFI boot services memory, so if a driver wants to access EFI boot services memory, it cannot call ioremap itself; doing so will trip the WARN about mapping RAM twice. Thus, a driver accessing EFI boot services memory must do so via the existing mapping already created during EFI intiialization. Since the EFI code already maintains a memory map for that memory, add a function efi_lookup_mapped_addr to look up mappings in that memory map. Signed-off-by: Josh Triplett Link: http://lkml.kernel.org/r/0eb48ae012797912874919110660ad420b90268b.1348876882.git.josh@joshtriplett.org Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index b3dbbdb..f7f928c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -777,6 +777,34 @@ static void __init runtime_code_page_mkexec(void) } /* + * We can't ioremap data in EFI boot services RAM, because we've already mapped + * it as RAM. So, look it up in the existing EFI memory map instead. Only + * callable after efi_enter_virtual_mode and before efi_free_boot_services. 
+ */ +void __iomem *efi_lookup_mapped_addr(u64 phys_addr) +{ + void *p; + if (WARN_ON(!memmap.map)) + return NULL; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + u64 size = md->num_pages << EFI_PAGE_SHIFT; + u64 end = md->phys_addr + size; + if (!(md->attribute & EFI_MEMORY_RUNTIME) && + md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) + continue; + if (!md->virt_addr) + continue; + if (phys_addr >= md->phys_addr && phys_addr < end) { + phys_addr += md->virt_addr - md->phys_addr; + return (__force void __iomem *)(unsigned long)phys_addr; + } + } + return NULL; +} + +/* * This function will switch the EFI runtime services to virtual mode. * Essentially, look through the EFI memmap and map every region that * has the runtime attribute bit set in its memory descriptor and update -- cgit v1.1 From 2223af389032425e3d1a70f9cb3a63feaa654ced Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Fri, 28 Sep 2012 17:57:05 -0700 Subject: efi: Fix the ACPI BGRT driver for images located in EFI boot services memory The ACPI BGRT driver accesses the BIOS logo image when it initializes. However, ACPI 5.0 (which introduces the BGRT) recommends putting the logo image in EFI boot services memory, so that the OS can reclaim that memory. Production systems follow this recommendation, breaking the ACPI BGRT driver. Move the bulk of the BGRT code to run during a new EFI late initialization phase, which occurs after switching EFI to virtual mode, and after initializing ACPI, but before freeing boot services memory. Copy the BIOS logo image to kernel memory at that point, and make it accessible to the BGRT driver. Rework the existing ACPI BGRT driver to act as a simple wrapper exposing that image (and the properties from the BGRT) via sysfs. Signed-off-by: Josh Triplett Link: http://lkml.kernel.org/r/93ce9f823f1c1f3bb88bdd662cce08eee7a17f5d.1348876882.git.josh@joshtriplett.org Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/Makefile | 1 + arch/x86/platform/efi/efi-bgrt.c | 76 ++++++++++++++++++++++++++++++++++++++++ arch/x86/platform/efi/efi.c | 6 ++++ 3 files changed, 83 insertions(+) create mode 100644 arch/x86/platform/efi/efi-bgrt.c (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index 73b8be0..6db1cc4 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o +obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c new file mode 100644 index 0000000..f6a0c1b --- /dev/null +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -0,0 +1,76 @@ +/* + * Copyright 2012 Intel Corporation + * Author: Josh Triplett + * + * Based on the bgrt driver: + * Copyright 2012 Red Hat, Inc + * Author: Matthew Garrett + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include + +struct acpi_table_bgrt *bgrt_tab; +void *bgrt_image; +size_t bgrt_image_size; + +struct bmp_header { + u16 id; + u32 size; +} __packed; + +void efi_bgrt_init(void) +{ + acpi_status status; + void __iomem *image; + bool ioremapped = false; + struct bmp_header bmp_header; + + if (acpi_disabled) + return; + + status = acpi_get_table("BGRT", 0, + (struct acpi_table_header **)&bgrt_tab); + if (ACPI_FAILURE(status)) + return; + + if (bgrt_tab->version != 1) + return; + if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address) + return; + + image = efi_lookup_mapped_addr(bgrt_tab->image_address); + if (!image) { + image = ioremap(bgrt_tab->image_address, sizeof(bmp_header)); + ioremapped = true; + if (!image) + return; + } + + memcpy_fromio(&bmp_header, image, sizeof(bmp_header)); + if (ioremapped) + iounmap(image); + bgrt_image_size = bmp_header.size; + + bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL); + if (!bgrt_image) + return; + + if (ioremapped) { + image = ioremap(bgrt_tab->image_address, bmp_header.size); + if (!image) { + kfree(bgrt_image); + bgrt_image = NULL; + return; + } + } + + memcpy_fromio(bgrt_image, image, bgrt_image_size); + if (ioremapped) + iounmap(image); +} diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f7f928c..aded2a9 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -745,6 +746,11 @@ void __init efi_init(void) #endif } +void __init efi_late_init(void) +{ + efi_bgrt_init(); +} + void __init efi_set_executable(efi_memory_desc_t *md, bool executable) { u64 addr, npages; -- cgit v1.1 From a1ce39288e6fbefdd8d607021d02384eb4a20b99 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Oct 2012 18:01:25 +0100 Subject: UAPI: (Scripted) Convert #include "..." to #include in kernel system headers Convert #include "..." to #include in kernel system headers. Signed-off-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Paul E. 
McKenney Acked-by: Dave Jones --- arch/x86/include/asm/atomic.h | 4 ++-- arch/x86/include/asm/calling.h | 2 +- arch/x86/include/asm/checksum.h | 4 ++-- arch/x86/include/asm/cmpxchg.h | 4 ++-- arch/x86/include/asm/mmzone.h | 4 ++-- arch/x86/include/asm/mutex.h | 4 ++-- arch/x86/include/asm/numa.h | 4 ++-- arch/x86/include/asm/pci.h | 2 +- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable_types.h | 4 ++-- arch/x86/include/asm/posix_types.h | 10 +++++----- arch/x86/include/asm/seccomp.h | 4 ++-- arch/x86/include/asm/string.h | 4 ++-- arch/x86/include/asm/suspend.h | 4 ++-- arch/x86/include/asm/uaccess.h | 4 ++-- arch/x86/include/asm/user.h | 4 ++-- arch/x86/include/asm/xen/interface.h | 4 ++-- arch/x86/include/asm/xor.h | 4 ++-- arch/x86/include/asm/xor_32.h | 2 +- arch/x86/include/asm/xor_64.h | 2 +- 20 files changed, 39 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 58cb6d4..250b877 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -309,9 +309,9 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2) #define smp_mb__after_atomic_inc() barrier() #ifdef CONFIG_X86_32 -# include "atomic64_32.h" +# include #else -# include "atomic64_64.h" +# include #endif #endif /* _ASM_X86_ATOMIC_H */ diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 7f8422a..0fa6750 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -46,7 +46,7 @@ For 32-bit we have the following conventions - kernel is built with */ -#include "dwarf2.h" +#include /* * 64-bit system call stack frame layout defines and helpers, diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index 848850f..5f5bb0f 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "checksum_32.h" +# include #else -# include "checksum_64.h" +# include #endif diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 99480e5..8d871ea 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -138,9 +138,9 @@ extern void __add_wrong_size(void) __raw_cmpxchg((ptr), (old), (new), (size), "") #ifdef CONFIG_X86_32 -# include "cmpxchg_32.h" +# include #else -# include "cmpxchg_64.h" +# include #endif #ifdef __HAVE_ARCH_CMPXCHG diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h index 64217ea..d497bc4 100644 --- a/arch/x86/include/asm/mmzone.h +++ b/arch/x86/include/asm/mmzone.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "mmzone_32.h" +# include #else -# include "mmzone_64.h" +# include #endif diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h index a731b9c..7d3a482 100644 --- a/arch/x86/include/asm/mutex.h +++ b/arch/x86/include/asm/mutex.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "mutex_32.h" +# include #else -# include "mutex_64.h" +# include #endif diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index bfacd2c..49119fc 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -53,9 +53,9 @@ static inline int numa_cpu_node(int cpu) #endif /* CONFIG_NUMA */ #ifdef CONFIG_X86_32 -# include "numa_32.h" +# include #else -# include "numa_64.h" +# include #endif #ifdef CONFIG_NUMA diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index df75d07..6e41b93 100644 --- a/arch/x86/include/asm/pci.h +++ 
b/arch/x86/include/asm/pci.h @@ -141,7 +141,7 @@ void default_restore_msi_irqs(struct pci_dev *dev, int irq); #endif /* __KERNEL__ */ #ifdef CONFIG_X86_64 -#include "pci_64.h" +#include #endif /* implement the pci_ DMA API in terms of the generic device dma_ one */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 49afb3f..fc99484 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -384,9 +384,9 @@ pte_t *populate_extra_pte(unsigned long vaddr); #endif /* __ASSEMBLY__ */ #ifdef CONFIG_X86_32 -# include "pgtable_32.h" +# include #else -# include "pgtable_64.h" +# include #endif #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index db8fec6..ec8a1fc 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -174,9 +174,9 @@ #endif #ifdef CONFIG_X86_32 -# include "pgtable_32_types.h" +# include #else -# include "pgtable_64_types.h" +# include #endif #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h index 7ef7c30..bad3665 100644 --- a/arch/x86/include/asm/posix_types.h +++ b/arch/x86/include/asm/posix_types.h @@ -1,15 +1,15 @@ #ifdef __KERNEL__ # ifdef CONFIG_X86_32 -# include "posix_types_32.h" +# include # else -# include "posix_types_64.h" +# include # endif #else # ifdef __i386__ -# include "posix_types_32.h" +# include # elif defined(__ILP32__) -# include "posix_types_x32.h" +# include # else -# include "posix_types_64.h" +# include # endif #endif diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index c62e58a..0f3d7f0 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "seccomp_32.h" +# include #else -# include "seccomp_64.h" +# include #endif diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h index 6dfd6d9..09224d7 100644 --- a/arch/x86/include/asm/string.h +++ b/arch/x86/include/asm/string.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "string_32.h" +# include #else -# include "string_64.h" +# include #endif diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h index 9bd521f..2fab6c2 100644 --- a/arch/x86/include/asm/suspend.h +++ b/arch/x86/include/asm/suspend.h @@ -1,5 +1,5 @@ #ifdef CONFIG_X86_32 -# include "suspend_32.h" +# include #else -# include "suspend_64.h" +# include #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a91acfb..7ccf8d1 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -589,9 +589,9 @@ extern struct movsl_mask { #define ARCH_HAS_NOCACHE_UACCESS 1 #ifdef CONFIG_X86_32 -# include "uaccess_32.h" +# include #else -# include "uaccess_64.h" +# include #endif #endif /* _ASM_X86_UACCESS_H */ diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h index 24532c7..ccab4af 100644 --- a/arch/x86/include/asm/user.h +++ b/arch/x86/include/asm/user.h @@ -2,9 +2,9 @@ #define _ASM_X86_USER_H #ifdef CONFIG_X86_32 -# include "user_32.h" +# include #else -# include "user_64.h" +# include #endif #include diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index cbf0c9d..80502a2 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -116,9 +116,9 @@ struct arch_shared_info { #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_32 -#include "interface_32.h" 
+#include #else -#include "interface_64.h" +#include #endif #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index 7fcf6f3..f8fde90 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -3,8 +3,8 @@ # include #else #ifdef CONFIG_X86_32 -# include "xor_32.h" +# include #else -# include "xor_64.h" +# include #endif #endif diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index aabd585..f79cb7e 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -822,7 +822,7 @@ static struct xor_block_template xor_block_pIII_sse = { }; /* Also try the AVX routines */ -#include "xor_avx.h" +#include /* Also try the generic routines. */ #include diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h index 5fc06d0..87ac522 100644 --- a/arch/x86/include/asm/xor_64.h +++ b/arch/x86/include/asm/xor_64.h @@ -306,7 +306,7 @@ static struct xor_block_template xor_block_sse = { /* Also try the AVX routines */ -#include "xor_avx.h" +#include #undef XOR_TRY_TEMPLATES #define XOR_TRY_TEMPLATES \ -- cgit v1.1 From abbf1590de22a6d2240a59383477da50d1402f6a Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Oct 2012 18:01:26 +0100 Subject: UAPI: Partition the header include path sets and add uapi/ header directories Partition the header include path flags into two sets, one for kernelspace builds and one for userspace builds. Add the following directories to build after the ordinary include directories so that #include will pick up the UAPI header directly if the kernel header has been moved there. The userspace set (represented by the USERINCLUDE make variable) contains: -I $(srctree)/arch/$(hdr-arch)/include/uapi -I arch/$(hdr-arch)/include/generated/uapi -I $(srctree)/include/uapi -I include/generated/uapi -include $(srctree)/include/linux/kconfig.h and the kernelspace set (represented by the LINUXINCLUDE make variable) contains: -I $(srctree)/arch/$(hdr-arch)/include -I arch/$(hdr-arch)/include/generated -I $(srctree)/include -I include --- if not building in the source tree plus everything in the USERINCLUDE set. Then use USERINCLUDE in building the x86 boot code. Signed-off-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Paul E. McKenney Acked-by: Dave Jones --- arch/x86/boot/Makefile | 4 ++-- arch/x86/boot/mkcpustr.c | 2 ++ arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/kernel/cpu/mkcapflags.pl | 5 ++++- 4 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index f7535be..ce03476 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -37,7 +37,7 @@ setup-y += video-bios.o targets += $(setup-y) hostprogs-y := mkcpustr tools/build -HOST_EXTRACFLAGS += -I$(srctree)/tools/include $(LINUXINCLUDE) \ +HOST_EXTRACFLAGS += -I$(srctree)/tools/include $(USERINCLUDE) \ -D__EXPORTED_HEADERS__ $(obj)/cpu.o: $(obj)/cpustr.h @@ -52,7 +52,7 @@ $(obj)/cpustr.h: $(obj)/mkcpustr FORCE # How to compile the 16-bit code. Note we always compile for -march=i386, # that way we can complain to the user if the CPU is insufficient. 
-KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ +KBUILD_CFLAGS := $(USERINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 919257f..4579eff 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -15,6 +15,8 @@ #include +#include "../include/asm/required-features.h" +#include "../include/asm/cpufeature.h" #include "../kernel/cpu/capflags.c" int main(void) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 16cae42..8c297aa 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -4,7 +4,9 @@ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H +#ifndef _ASM_X86_REQUIRED_FEATURES_H #include +#endif #define NCAPINTS 10 /* N 32-bit words worth of info */ diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl index c7b3fe2..091972e 100644 --- a/arch/x86/kernel/cpu/mkcapflags.pl +++ b/arch/x86/kernel/cpu/mkcapflags.pl @@ -8,7 +8,10 @@ open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n"; open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; -print OUT "#include \n\n"; +print OUT "#ifndef _ASM_X86_CPUFEATURE_H\n"; +print OUT "#include \n"; +print OUT "#endif\n"; +print OUT "\n"; print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; %features = (); -- cgit v1.1 From 4413e16d9d21673bb5048a2e542f1aaa00015c2e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Oct 2012 18:01:35 +0100 Subject: UAPI: (Scripted) Set up UAPI Kbuild files Set up empty UAPI Kbuild files to be populated by the header splitter. Signed-off-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Paul E. McKenney Acked-by: Dave Jones --- arch/x86/include/uapi/asm/Kbuild | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 arch/x86/include/uapi/asm/Kbuild (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild new file mode 100644 index 0000000..83b6e9a --- /dev/null +++ b/arch/x86/include/uapi/asm/Kbuild @@ -0,0 +1,6 @@ +# UAPI Header export list +include include/uapi/asm-generic/Kbuild.asm + +genhdr-y += unistd_32.h +genhdr-y += unistd_64.h +genhdr-y += unistd_x32.h -- cgit v1.1 From 584c5ef813c63354ed9e03b732048240c09b86af Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Oct 2012 18:01:56 +0100 Subject: UAPI: x86: Fix the test_get_len tool Fix the x86 test_get_len tool to have the right include paths in the right order (it includes a non-exported kernel header directly), otherwise errors like the following occur: /data/fs/linux-2.6-hdr/include/linux/types.h:18:26: error: conflicting types for 'fd_set' /usr/include/sys/select.h:78:5: note: previous declaration of 'fd_set' was here and /data/fs/linux-2.6-hdr/include/linux/string.h:42:12: error: expected identifier or '(' before '__extension__' Signed-off-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Paul E. 
McKenney Acked-by: Dave Jones --- arch/x86/tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 733057b..bae601f 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -28,7 +28,7 @@ posttest: $(obj)/test_get_len vmlinux $(obj)/insn_sanity hostprogs-y += test_get_len insn_sanity # -I needed for generated C source and C source which in the kernel tree. -HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ +HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/ HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ -- cgit v1.1 From c0522b6cc1237c935b2cead3fa7b45465df2f839 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Oct 2012 18:01:56 +0100 Subject: UAPI: x86: Fix insn_sanity build failure after UAPI split Fix a build failure in the x86 insn_sanity program after the UAPI split. The problem is that insn_sanity.c #includes arch/x86/lib/insn.c - which uses the kernel string header. This leads to conflicts for various definitions against the /usr/include/ headers. linux/string.h can be replaced with the normal userspace string.h if __KERNEL__ is not specified. HOSTCC arch/x86/tools/insn_sanity In file included from /data/fs/linux-2.6-hdr/include/linux/string.h:6:0, from /data/fs/linux-2.6-hdr/arch/x86/lib/insn.c:21, from arch/x86/tools/insn_sanity.c:36: /data/fs/linux-2.6-hdr/include/linux/types.h:14:26: error: conflicting types for 'fd_set' /usr/include/sys/select.h:76:5: note: previous declaration of 'fd_set' was here /data/fs/linux-2.6-hdr/include/linux/types.h:15:25: error: conflicting types for 'dev_t' /usr/include/sys/types.h:61:17: note: previous declaration of 'dev_t' was here /data/fs/linux-2.6-hdr/include/linux/types.h:25:26: error: conflicting types for 'timer_t' /usr/include/time.h:104:19: note: previous declaration of 'timer_t' was here /data/fs/linux-2.6-hdr/include/linux/types.h:45:26: error: conflicting types for 'loff_t' /usr/include/sys/types.h:45:18: note: previous declaration of 'loff_t' was here /data/fs/linux-2.6-hdr/include/linux/types.h:112:17: error: conflicting types for 'u_int64_t' /usr/include/sys/types.h:204:1: note: previous declaration of 'u_int64_t' was here /data/fs/linux-2.6-hdr/include/linux/types.h:113:17: error: conflicting types for 'int64_t' /usr/include/sys/types.h:198:1: note: previous declaration of 'int64_t' was here /data/fs/linux-2.6-hdr/include/linux/types.h:134:23: error: conflicting types for 'blkcnt_t' /usr/include/sys/types.h:236:20: note: previous declaration of 'blkcnt_t' was here In file included from /data/fs/linux-2.6-hdr/arch/x86/lib/insn.c:21:0, from arch/x86/tools/insn_sanity.c:36: /data/fs/linux-2.6-hdr/include/linux/string.h:38:12: error: expected identifier or '(' before '__extension__' /data/fs/linux-2.6-hdr/include/linux/string.h:38:12: error: expected identifier or '(' before ')' token /data/fs/linux-2.6-hdr/include/linux/string.h:41:12: error: expected identifier or '(' before '__extension__' /data/fs/linux-2.6-hdr/include/linux/string.h:53:15: error: expected identifier or '(' before '__extension__' /data/fs/linux-2.6-hdr/include/linux/string.h:61:28: error: expected '=', ',', ';', 'asm' or 
'__attribute__' before 'skip_spaces' /data/fs/linux-2.6-hdr/include/linux/string.h:65:28: error: expected '=', ',', ';', 'asm' or '__attribute__' before 'char' /data/fs/linux-2.6-hdr/include/linux/string.h:83:15: error: expected identifier or '(' before '__extension__' /data/fs/linux-2.6-hdr/include/linux/string.h:83:15: error: expected identifier or '(' before ')' token /data/fs/linux-2.6-hdr/include/linux/string.h:86:15: error: expected identifier or '(' before '__extension__' /data/fs/linux-2.6-hdr/include/linux/string.h:86:15: error: expected identifier or '(' before ')' token /data/fs/linux-2.6-hdr/include/linux/string.h:89:24: error: expected identifier or '(' before '__extension__' /data/fs/linux-2.6-hdr/include/linux/string.h:89:24: error: expected identifier or '(' before ')' token /data/fs/linux-2.6-hdr/include/linux/string.h:92:24: error: expected identifier or '(' before '__extension__' /data/fs/linux-2.6-hdr/include/linux/string.h:92:24: error: expected identifier or '(' before ')' token Signed-off-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Paul E. McKenney Acked-by: Dave Jones --- arch/x86/lib/insn.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index b1e6c4b..54fcffe 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -18,7 +18,11 @@ * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ +#ifdef __KERNEL__ #include +#else +#include +#endif #include #include -- cgit v1.1 From ec28b7f250b19f31e14b69b015d61d0818bf43a0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Oct 2012 18:01:56 +0100 Subject: UAPI: x86: Differentiate the generated UAPI and internal headers Differentiate the generated UAPI and internal headers during generation such that the UAPI headers can be installed elsewhere. A later patch will use this to move the UAPI headers to: arch/x86/include/generated/uapi/asm/ to make them easier to handle. A previous patch added a -I for this path. Signed-off-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Paul E. 
McKenney Acked-by: Dave Jones --- arch/x86/syscalls/Makefile | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index 3236aeb..174b032 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -1,7 +1,9 @@ out := $(obj)/../include/generated/asm +uapi := $(obj)/../include/generated/asm # Create output directory if not already present -_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ + $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') syscall32 := $(srctree)/$(src)/syscall_32.tbl syscall64 := $(srctree)/$(src)/syscall_64.tbl @@ -18,7 +20,7 @@ quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ syshdr_abi_unistd_32 := i386 -$(out)/unistd_32.h: $(syscall32) $(syshdr) +$(uapi)/unistd_32.h: $(syscall32) $(syshdr) $(call if_changed,syshdr) syshdr_abi_unistd_32_ia32 := i386 @@ -28,11 +30,11 @@ $(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) syshdr_abi_unistd_x32 := common,x32 syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT -$(out)/unistd_x32.h: $(syscall64) $(syshdr) +$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) $(call if_changed,syshdr) syshdr_abi_unistd_64 := common,64 -$(out)/unistd_64.h: $(syscall64) $(syshdr) +$(uapi)/unistd_64.h: $(syscall64) $(syshdr) $(call if_changed,syshdr) syshdr_abi_unistd_64_x32 := x32 @@ -45,11 +47,12 @@ $(out)/syscalls_32.h: $(syscall32) $(systbl) $(out)/syscalls_64.h: $(syscall64) $(systbl) $(call if_changed,systbl) -syshdr-y += unistd_32.h unistd_64.h unistd_x32.h +uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h syshdr-y += syscalls_32.h syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h syshdr-$(CONFIG_X86_64) += syscalls_64.h -targets += $(syshdr-y) +targets += $(uapisyshdr-y) $(syshdr-y) -all: $(addprefix $(out)/,$(targets)) +all: $(addprefix $(uapi)/,$(uapisyshdr-y)) +all: $(addprefix $(out)/,$(syshdr-y)) -- cgit v1.1 From 10b63956fce7f369cc37fd4d994f09bd5203efe4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Oct 2012 18:01:57 +0100 Subject: UAPI: Plumb the UAPI Kbuilds into the user header installation and checking Plumb the UAPI Kbuilds into the user header installation and checking system. As the headers are split the entries will be transferred across from the old Kbuild files to the UAPI Kbuild files. The changes made in this commit are: (1) Exported generated files (of which there are currently four) are moved to uapi/ directories under the appropriate generated/ directory, thus we get: include/generated/uapi/linux/version.h arch/x86/include/generated/uapi/asm/unistd_32.h arch/x86/include/generated/uapi/asm/unistd_64.h arch/x86/include/generated/uapi/asm/unistd_x32.h These paths were added to the build as -I flags in a previous patch. (2) scripts/Makefile.headersinst is now given the UAPI path to install from rather than the old path. It then determines the old path from that and includes that Kbuild also if it exists, thus permitting the headers to exist in either directory during the changeover. I also renamed the "install" variable to "installdir" as it refers to a directory not the install program. (3) scripts/headers_install.pl is altered to take a list of source file paths instead of just their names so that the makefile can tell it exactly where to find each file. 
For the moment, files can be obtained from one of four places for each output directory: .../include/uapi/foo/ .../include/generated/uapi/foo/ .../include/foo/ .../include/generated/foo/ The non-UAPI paths will be dropped later. Signed-off-by: David Howells Acked-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Paul E. McKenney Acked-by: Dave Jones --- arch/x86/include/asm/Kbuild | 4 ---- arch/x86/syscalls/Makefile | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index f9c0d3b..1595d68 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -22,7 +22,3 @@ header-y += sigcontext32.h header-y += ucontext.h header-y += vm86.h header-y += vsyscall.h - -genhdr-y += unistd_32.h -genhdr-y += unistd_64.h -genhdr-y += unistd_x32.h diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index 174b032..f325af2 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -1,5 +1,5 @@ out := $(obj)/../include/generated/asm -uapi := $(obj)/../include/generated/asm +uapi := $(obj)/../include/generated/uapi/asm # Create output directory if not already present _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ -- cgit v1.1 From b1e0d8b70fa31821ebca3965f2ef8619d7c5e316 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 2 Oct 2012 16:42:36 +0200 Subject: kbuild: Fix gcc -x syntax The correct syntax for gcc -x is "gcc -x assembler", not "gcc -xassembler". Even though the latter happens to work, the former is what is documented in the manual page and thus what gcc wrappers such as icecream do expect. This isn't a cosmetic change. The missing space prevents icecream from recognizing compilation tasks it can't handle, leading to silent kernel miscompilations. Besides me, credits go to Michael Matz and Dirk Mueller for investigating the miscompilation issue and tracking it down to this incorrect -x parameter syntax. Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: stable@vger.kernel.org Cc: Bernhard Walle Cc: Michal Marek Cc: Ralf Baechle Signed-off-by: Michal Marek --- arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b0c5276..cb8ac93 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -88,7 +88,7 @@ endif ifdef CONFIG_X86_X32 x32_ld_ok := $(call try-run,\ /bin/echo -e '1: .quad 1b' | \ - $(CC) $(KBUILD_AFLAGS) -c -xassembler -o "$$TMP" - && \ + $(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" - && \ $(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMPO" && \ $(LD) -m elf32_x86_64 "$$TMPO" -o "$$TMP",y,n) ifeq ($(x32_ld_ok),y) -- cgit v1.1 From e7a570ff7dff9af6e54ff5e580a61ec7652137a0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 5 Sep 2012 12:04:14 +0800 Subject: asm-generic: Add default clkdev.h Ease the deployment of clkdev by providing a default asm/clkdev.h for use if the arch does not have an include/asm/clkdev.h. Due to limitations in Kbuild we manually add clkdev.h to all architectures that don't have one rather than having the header appear by default. 
Signed-off-by: Mark Brown Reviewed-by: Stephen Rothwell Signed-off-by: Arnd Bergmann --- arch/x86/include/asm/Kbuild | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index f9c0d3b..66e5f0e 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -26,3 +26,5 @@ header-y += vsyscall.h genhdr-y += unistd_32.h genhdr-y += unistd_64.h genhdr-y += unistd_x32.h + +generic-y += clkdev.h -- cgit v1.1 From c9f97a27ceee84998999bf3341e6d5d207b05539 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 19 Sep 2012 09:40:30 +0300 Subject: crypto: x86/glue_helper - fix storing of new IV in CBC encryption Glue_helper incorrectly XORs new IV over old IV at end of CBC encryption function when it should store. This causes CBC encryption to give incorrect output on multi-page encryption requests. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/glue_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 4854f0f..30b3927 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -110,7 +110,7 @@ static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn, nbytes -= bsize; } while (nbytes >= bsize); - u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); + *(u128 *)walk->iv = *iv; return nbytes; } -- cgit v1.1 From 75fdd155eaf755aa183ca9844a9a178b7a0e3959 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 4 Oct 2012 17:11:42 -0700 Subject: sections: fix section conflicts in arch/x86 Signed-off-by: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/apic.h | 2 +- arch/x86/kernel/apic/apic_numachip.c | 4 ++-- arch/x86/kernel/rtc.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f342612..3388034 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -409,7 +409,7 @@ extern struct apic *apic; * to enforce the order with in them. 
*/ #define apic_driver(sym) \ - static struct apic *__apicdrivers_##sym __used \ + static const struct apic *__apicdrivers_##sym __used \ __aligned(sizeof(struct apic *)) \ __section(.apicdrivers) = { &sym } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index bc552cf..a65829a 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -30,7 +30,7 @@ static int numachip_system __read_mostly; -static struct apic apic_numachip __read_mostly; +static const struct apic apic_numachip __read_mostly; static unsigned int get_apic_id(unsigned long x) { @@ -199,7 +199,7 @@ static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static struct apic apic_numachip __refconst = { +static const struct apic apic_numachip __refconst = { .name = "NumaConnect system", .probe = numachip_probe, diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index af6db6e..4929c1b 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -225,7 +225,7 @@ static struct platform_device rtc_device = { static __init int add_rtc_cmos(void) { #ifdef CONFIG_PNP - static const char *ids[] __initconst = + static const char * const const ids[] __initconst = { "PNP0b00", "PNP0b01", "PNP0b02", }; struct pnp_dev *dev; struct pnp_id *id; -- cgit v1.1 From 751f409db6216ebd134a94f6dcd97779933a5106 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 4 Oct 2012 17:15:31 -0700 Subject: compat: move compat_siginfo_t definition to asm/compat.h This is a preparatory patch for the introduction of NT_SIGINFO elf note. Make the location of compat_siginfo_t uniform across eight architectures which have it. Now it can be pulled in by including asm/compat.h or linux/compat.h. Most of the copies are verbatim. compat_uid[32]_t had to be replaced by __compat_uid[32]_t. compat_uptr_t had to be moved up before compat_siginfo_t in asm/compat.h on a several architectures (tile already had it moved up). compat_sigval_t had to be relocated from linux/compat.h to asm/compat.h. Signed-off-by: Denys Vlasenko Cc: Oleg Nesterov Cc: Amerigo Wang Cc: "Jonathan M. 
Foote" Cc: Roland McGrath Cc: Pedro Alves Cc: Fengguang Wu Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/compat.h | 74 ++++++++++++++++++++++++++++++++++++++++++- arch/x86/include/asm/ia32.h | 67 --------------------------------------- 2 files changed, 73 insertions(+), 68 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index fedf32b..59c6c40 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -41,6 +41,7 @@ typedef s64 __attribute__((aligned(4))) compat_s64; typedef u32 compat_uint_t; typedef u32 compat_ulong_t; typedef u64 __attribute__((aligned(4))) compat_u64; +typedef u32 compat_uptr_t; struct compat_timespec { compat_time_t tv_sec; @@ -124,6 +125,78 @@ typedef u32 compat_old_sigset_t; /* at least 32 bits */ typedef u32 compat_sigset_word; +typedef union compat_sigval { + compat_int_t sival_int; + compat_uptr_t sival_ptr; +} compat_sigval_t; + +typedef struct compat_siginfo { + int si_signo; + int si_errno; + int si_code; + + union { + int _pad[128/sizeof(int) - 3]; + + /* kill() */ + struct { + unsigned int _pid; /* sender's pid */ + unsigned int _uid; /* sender's uid */ + } _kill; + + /* POSIX.1b timers */ + struct { + compat_timer_t _tid; /* timer id */ + int _overrun; /* overrun count */ + compat_sigval_t _sigval; /* same as below */ + int _sys_private; /* not to be passed to user */ + int _overrun_incr; /* amount to add to overrun */ + } _timer; + + /* POSIX.1b signals */ + struct { + unsigned int _pid; /* sender's pid */ + unsigned int _uid; /* sender's uid */ + compat_sigval_t _sigval; + } _rt; + + /* SIGCHLD */ + struct { + unsigned int _pid; /* which child */ + unsigned int _uid; /* sender's uid */ + int _status; /* exit code */ + compat_clock_t _utime; + compat_clock_t _stime; + } _sigchld; + + /* SIGCHLD (x32 version) */ + struct { + unsigned int _pid; /* which child */ + unsigned int _uid; /* sender's uid */ + int _status; /* exit code */ + compat_s64 _utime; + compat_s64 _stime; + } _sigchld_x32; + + /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ + struct { + unsigned int _addr; /* faulting insn/memory ref. */ + } _sigfault; + + /* SIGPOLL */ + struct { + int _band; /* POLL_IN, POLL_OUT, POLL_MSG */ + int _fd; + } _sigpoll; + + struct { + unsigned int _call_addr; /* calling insn */ + int _syscall; /* triggering system call number */ + unsigned int _arch; /* AUDIT_ARCH_* of syscall */ + } _sigsys; + } _sifields; +} compat_siginfo_t; + #define COMPAT_OFF_T_MAX 0x7fffffff #define COMPAT_LOFF_T_MAX 0x7fffffffffffffffL @@ -209,7 +282,6 @@ typedef struct user_regs_struct32 compat_elf_gregset_t; * as pointers because the syscall entry code will have * appropriately converted them already. 
*/ -typedef u32 compat_uptr_t; static inline void __user *compat_ptr(compat_uptr_t uptr) { diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index b04cbdb..e623277 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -86,73 +86,6 @@ struct stat64 { unsigned long long st_ino; } __attribute__((packed)); -typedef struct compat_siginfo { - int si_signo; - int si_errno; - int si_code; - - union { - int _pad[((128 / sizeof(int)) - 3)]; - - /* kill() */ - struct { - unsigned int _pid; /* sender's pid */ - unsigned int _uid; /* sender's uid */ - } _kill; - - /* POSIX.1b timers */ - struct { - compat_timer_t _tid; /* timer id */ - int _overrun; /* overrun count */ - compat_sigval_t _sigval; /* same as below */ - int _sys_private; /* not to be passed to user */ - int _overrun_incr; /* amount to add to overrun */ - } _timer; - - /* POSIX.1b signals */ - struct { - unsigned int _pid; /* sender's pid */ - unsigned int _uid; /* sender's uid */ - compat_sigval_t _sigval; - } _rt; - - /* SIGCHLD */ - struct { - unsigned int _pid; /* which child */ - unsigned int _uid; /* sender's uid */ - int _status; /* exit code */ - compat_clock_t _utime; - compat_clock_t _stime; - } _sigchld; - - /* SIGCHLD (x32 version) */ - struct { - unsigned int _pid; /* which child */ - unsigned int _uid; /* sender's uid */ - int _status; /* exit code */ - compat_s64 _utime; - compat_s64 _stime; - } _sigchld_x32; - - /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ - struct { - unsigned int _addr; /* faulting insn/memory ref. */ - } _sigfault; - - /* SIGPOLL */ - struct { - int _band; /* POLL_IN, POLL_OUT, POLL_MSG */ - int _fd; - } _sigpoll; - - struct { - unsigned int _call_addr; /* calling insn */ - int _syscall; /* triggering system call number */ - unsigned int _arch; /* AUDIT_ARCH_* of syscall */ - } _sigsys; - } _sifields; -} compat_siginfo_t; - #define IA32_STACK_TOP IA32_PAGE_OFFSET #ifdef __KERNEL__ -- cgit v1.1 From af1839eb4bd4fe079a125eb199205fceb6ae19e6 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:08 -0700 Subject: Kconfig: clean up the long arch list for the UID16 config option Introduce HAVE_UID16 config option and select it in corresponding architecture Kconfig files. UID16 now only depends on HAVE_UID16. Signed-off-by: Catalin Marinas Acked-by: Geert Uytterhoeven Cc: Russell King Cc: Mike Frysinger Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Yoshinori Sato Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: "David S. Miller" Cc: Jeff Dike Cc: Richard Weinberger Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b72777f..fd5d7c2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -10,6 +10,7 @@ config X86_32 def_bool y depends on !64BIT select CLKSRC_I8253 + select HAVE_UID16 config X86_64 def_bool y @@ -2168,6 +2169,7 @@ config IA32_EMULATION bool "IA32 Emulation" depends on X86_64 select COMPAT_BINFMT_ELF + select HAVE_UID16 ---help--- Include code to run legacy 32-bit programs under a 64-bit kernel. 
You should likely turn this on, unless you're -- cgit v1.1 From b69ec42b1b194cc88f04b3fbcda8d3f93182d6c3 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:11 -0700 Subject: Kconfig: clean up the long arch list for the DEBUG_KMEMLEAK config option Introduce HAVE_DEBUG_KMEMLEAK config option and select it in corresponding architecture Kconfig files. DEBUG_KMEMLEAK now only depends on HAVE_DEBUG_KMEMLEAK. Signed-off-by: Catalin Marinas Cc: Russell King Cc: Michal Simek Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fd5d7c2..3fea184 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -66,6 +66,7 @@ config X86 select HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_DEBUG_KMEMLEAK select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 -- cgit v1.1 From 7ac57a89de958fbb5271dc504d0c25e34dbeec32 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:16 -0700 Subject: Kconfig: clean up the "#if defined(arch)" list for exception-trace sysctl entry Introduce SYSCTL_EXCEPTION_TRACE config option and selec it in the architectures requiring support for the "exception-trace" debug_table entry in kernel/sysctl.c. Signed-off-by: Catalin Marinas Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3fea184..6119d6c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,6 +47,7 @@ config X86 select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_SYSCALL_TRACEPOINTS + select SYSCTL_EXCEPTION_TRACE select HAVE_KVM select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK -- cgit v1.1 From b1a86e15dc0304366f50ba1720834bc419c801b1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 8 Oct 2012 16:28:23 -0700 Subject: x86, pat: remove the dependency on 'vm_pgoff' in track/untrack pfn vma routines 'pfn' argument for track_pfn_vma_new() can be used for reserving the attribute for the pfn range. No need to depend on 'vm_pgoff' Similarly, untrack_pfn_vma() can depend on the 'pfn' argument if it is non-zero or can use follow_phys() to get the starting value of the pfn range. Also the non zero 'size' argument can be used instead of recomputing it from vma. This cleanup also prepares the ground for the track/untrack pfn vma routines to take over the ownership of setting PAT specific vm_flag in the 'vma'. [khlebnikov@openvz.org: Clear pfn to paddr conversion] Signed-off-by: Suresh Siddha Signed-off-by: Konstantin Khlebnikov Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. 
Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Tetsuo Handa Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 3d68ef6..de36c88 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -704,21 +704,18 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { + resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; - resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; - if (is_linear_pfn_mapping(vma)) { - /* reserve the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot, 0); - } + /* reserve the whole chunk starting from paddr */ + if (is_linear_pfn_mapping(vma)) + return reserve_pfn_range(paddr, size, prot, 0); if (!pat_enabled) return 0; /* for vm_insert_pfn and friends, we set prot based on lookup */ - flags = lookup_memtype(pfn << PAGE_SHIFT); + flags = lookup_memtype(paddr); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | flags); @@ -728,20 +725,28 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, /* * untrack_pfn_vma is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). + * can be for the entire vma (in which case pfn, size are zero). */ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long prot; - if (is_linear_pfn_mapping(vma)) { - /* free the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - free_pfn_range(paddr, vma_size); + if (!is_linear_pfn_mapping(vma)) return; + + /* free the chunk starting from pfn or the whole chunk */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + if (!paddr && !size) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return; + } + + size = vma->vm_end - vma->vm_start; } + free_pfn_range(paddr, size); } pgprot_t pgprot_writecombine(pgprot_t prot) -- cgit v1.1 From 5180da410db6369d1f95c9014da1c9bc33fb043e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 8 Oct 2012 16:28:29 -0700 Subject: x86, pat: separate the pfn attribute tracking for remap_pfn_range and vm_insert_pfn With PAT enabled, vm_insert_pfn() looks up the existing pfn memory attribute and uses it. Expectation is that the driver reserves the memory attributes for the pfn before calling vm_insert_pfn(). remap_pfn_range() (when called for the whole vma) will setup a new attribute (based on the prot argument) for the specified pfn range. This addresses the legacy usage which typically calls remap_pfn_range() with a desired memory attribute. For ranges smaller than the vma size (which is typically not the case), remap_pfn_range() will use the existing memory attribute for the pfn range. Expose two different API's for these different behaviors. 
track_pfn_insert() for tracking the pfn attribute set by vm_insert_pfn() and track_pfn_remap() for the remap_pfn_range(). This cleanup also prepares the ground for the track/untrack pfn vma routines to take over the ownership of setting PAT specific vm_flag in the 'vma'. [khlebnikov@openvz.org: Clear checks in track_pfn_remap()] [akpm@linux-foundation.org: tweak a few comments] Signed-off-by: Suresh Siddha Signed-off-by: Konstantin Khlebnikov Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Hugh Dickins Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Konstantin Khlebnikov Cc: Matt Helsley Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index de36c88..74a7026 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -664,13 +664,13 @@ static void free_pfn_range(u64 paddr, unsigned long size) } /* - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). * * If the vma has a linear pfn mapping for the entire range, we get the prot * from pte and reserve the entire vma range with single reserve_pfn_range call. */ -int track_pfn_vma_copy(struct vm_area_struct *vma) +int track_pfn_copy(struct vm_area_struct *vma) { resource_size_t paddr; unsigned long prot; @@ -694,15 +694,12 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) } /* - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. - * * prot is passed in as a parameter for the new mapping. If the vma has a * linear pfn mapping for the entire range reserve the entire vma range with * single reserve_pfn_range call. */ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) +int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long size) { resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; @@ -714,8 +711,36 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, if (!pat_enabled) return 0; - /* for vm_insert_pfn and friends, we set prot based on lookup */ + /* + * For anything smaller than the vma size we set prot based on the + * lookup. + */ flags = lookup_memtype(paddr); + + /* Check memtype for the remaining pages */ + while (size > PAGE_SIZE) { + size -= PAGE_SIZE; + paddr += PAGE_SIZE; + if (flags != lookup_memtype(paddr)) + return -EINVAL; + } + + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + + return 0; +} + +int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn) +{ + unsigned long flags; + + if (!pat_enabled) + return 0; + + /* Set prot based on lookup */ + flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | flags); @@ -723,12 +748,12 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, } /* - * untrack_pfn_vma is called while unmapping a pfnmap for a region. 
+ * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ -void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) +void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) { resource_size_t paddr; unsigned long prot; -- cgit v1.1 From b3b9c2932c32e0692018ed5f12f3fd8c70eea8ce Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:34 -0700 Subject: mm, x86, pat: rework linear pfn-mmap tracking Replace the generic vma-flag VM_PFN_AT_MMAP with x86-only VM_PAT. We can toss mapping address from remap_pfn_range() into track_pfn_vma_new(), and collect all PAT-related logic together in arch/x86/. This patch also restores orignal frustration-free is_cow_mapping() check in remap_pfn_range(), as it was before commit v2.6.28-rc8-88-g3c8bb73 ("x86: PAT: store vm_pgoff for all linear_over_vma_region mappings - v3") is_linear_pfn_mapping() checks can be removed from mm/huge_memory.c, because it already handled by VM_PFNMAP in VM_NO_THP bit-mask. [suresh.b.siddha@intel.com: Reset the VM_PAT flag as part of untrack_pfn_vma()] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Suresh Siddha Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Hugh Dickins Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 74a7026..0eb572e 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -677,7 +677,7 @@ int track_pfn_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (is_linear_pfn_mapping(vma)) { + if (vma->vm_flags & VM_PAT) { /* * reserve the whole chunk covered by vma. We need the * starting address and protection from pte. @@ -699,14 +699,20 @@ int track_pfn_copy(struct vm_area_struct *vma) * single reserve_pfn_range call. 
*/ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) + unsigned long pfn, unsigned long addr, unsigned long size) { resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; /* reserve the whole chunk starting from paddr */ - if (is_linear_pfn_mapping(vma)) - return reserve_pfn_range(paddr, size, prot, 0); + if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + int ret; + + ret = reserve_pfn_range(paddr, size, prot, 0); + if (!ret) + vma->vm_flags |= VM_PAT; + return ret; + } if (!pat_enabled) return 0; @@ -758,7 +764,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long prot; - if (!is_linear_pfn_mapping(vma)) + if (!(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ @@ -772,6 +778,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); + vma->vm_flags &= ~VM_PAT; } pgprot_t pgprot_writecombine(pgprot_t prot) -- cgit v1.1 From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it lost original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it does not exported into userspace directly, it only reduces total_vm showed in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. [akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. 
Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/xen/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5a16824..fd28d86 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2451,8 +2451,7 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); - BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == - (VM_PFNMAP | VM_RESERVED | VM_IO))); + BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); rmd.mfn = mfn; rmd.prot = prot; -- cgit v1.1 From 5d3a551c28c6669dc43be40d8fafafbc2ec8f42b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 8 Oct 2012 16:29:32 -0700 Subject: mm: hugetlb: add arch hook for clearing page flags before entering pool The core page allocator ensures that page flags are zeroed when freeing pages via free_pages_check. A number of architectures (ARM, PPC, MIPS) rely on this property to treat new pages as dirty with respect to the data cache and perform the appropriate flushing before mapping the pages into userspace. This can lead to cache synchronisation problems when using hugepages, since the allocator keeps its own pool of pages above the usual page allocator and does not reset the page flags when freeing a page into the pool. This patch adds a new architecture hook, arch_clear_hugepage_flags, so that architectures which rely on the page flags being in a particular state for fresh allocations can adjust the flags accordingly when a page is freed into the pool. Signed-off-by: Will Deacon Cc: Michal Hocko Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 439a9ac..bdd35db 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -90,4 +90,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* _ASM_X86_HUGETLB_H */ -- cgit v1.1 From 15626062f4a98279c59a2a5208c496cf65cbf8c0 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:04 -0700 Subject: thp, x86: introduce HAVE_ARCH_TRANSPARENT_HUGEPAGE Cleanup patch in preparation for transparent hugepage support on s390. Adding new architectures to the TRANSPARENT_HUGEPAGE config option can make the "depends" line rather ugly, like "depends on (X86 || (S390 && 64BIT)) && MMU". This patch adds a HAVE_ARCH_TRANSPARENT_HUGEPAGE instead. x86 already has MMU "def_bool y", so the MMU check is superfluous there and HAVE_ARCH_TRANSPARENT_HUGEPAGE can be selected in arch/x86/Kconfig. Signed-off-by: Gerald Schaefer Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6119d6c..1ae94bc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if X86_64 + select HAVE_ARCH_TRANSPARENT_HUGEPAGE select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP -- cgit v1.1 From 9d9e6f9703bbd642f3f2f807e6aaa642a4cbcec9 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:20 -0700 Subject: rbtree: remove prior augmented rbtree implementation convert arch/x86/mm/pat_rbtree.c to the proposed augmented rbtree api and remove the old augmented rbtree implementation. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat_rbtree.c | 65 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 8acaddd..7e1515b 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -54,29 +54,57 @@ static u64 get_subtree_max_end(struct rb_node *node) return ret; } -/* Update 'subtree_max_end' for a node, based on node and its children */ -static void memtype_rb_augment_cb(struct rb_node *node, void *__unused) +static u64 compute_subtree_max_end(struct memtype *data) { - struct memtype *data; - u64 max_end, child_max_end; - - if (!node) - return; - - data = container_of(node, struct memtype, rb); - max_end = data->end; + u64 max_end = data->end, child_max_end; - child_max_end = get_subtree_max_end(node->rb_right); + child_max_end = get_subtree_max_end(data->rb.rb_right); if (child_max_end > max_end) max_end = child_max_end; - child_max_end = get_subtree_max_end(node->rb_left); + child_max_end = get_subtree_max_end(data->rb.rb_left); if (child_max_end > max_end) max_end = child_max_end; - data->subtree_max_end = max_end; + return max_end; +} + +/* Update 'subtree_max_end' for node and its parents */ +static void memtype_rb_propagate_cb(struct rb_node *node, struct rb_node *stop) +{ + while (node != stop) { + struct memtype *data = container_of(node, struct memtype, rb); + u64 subtree_max_end = compute_subtree_max_end(data); + if (data->subtree_max_end == subtree_max_end) + break; + data->subtree_max_end = subtree_max_end; + node = rb_parent(&data->rb); + } +} + +static void memtype_rb_copy_cb(struct rb_node *old, struct rb_node *new) +{ + struct memtype *old_data = container_of(old, struct memtype, rb); + struct memtype *new_data = container_of(new, struct memtype, rb); + + new_data->subtree_max_end = old_data->subtree_max_end; } +/* Update 'subtree_max_end' after tree rotation. 
old and new are the + * former and current subtree roots */ +static void memtype_rb_rotate_cb(struct rb_node *old, struct rb_node *new) +{ + struct memtype *old_data = container_of(old, struct memtype, rb); + struct memtype *new_data = container_of(new, struct memtype, rb); + + new_data->subtree_max_end = old_data->subtree_max_end; + old_data->subtree_max_end = compute_subtree_max_end(old_data); +} + +static const struct rb_augment_callbacks memtype_rb_augment_cb = { + memtype_rb_propagate_cb, memtype_rb_copy_cb, memtype_rb_rotate_cb +}; + /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, u64 start, u64 end) @@ -179,15 +207,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) struct memtype *data = container_of(*node, struct memtype, rb); parent = *node; + if (data->subtree_max_end < newdata->end) + data->subtree_max_end = newdata->end; if (newdata->start <= data->start) node = &((*node)->rb_left); else if (newdata->start > data->start) node = &((*node)->rb_right); } + newdata->subtree_max_end = newdata->end; rb_link_node(&newdata->rb, parent, node); - rb_insert_color(&newdata->rb, root); - rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL); + rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); } int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) @@ -209,16 +239,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) struct memtype *rbt_memtype_erase(u64 start, u64 end) { - struct rb_node *deepest; struct memtype *data; data = memtype_rb_exact_match(&memtype_rbroot, start, end); if (!data) goto out; - deepest = rb_augment_erase_begin(&data->rb); - rb_erase(&data->rb, &memtype_rbroot); - rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL); + rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb); out: return data; } -- cgit v1.1 From 3908836aa77e3621aaf2101f2920e01d7c8460d6 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:21 -0700 Subject: rbtree: add RB_DECLARE_CALLBACKS() macro As proposed by Peter Zijlstra, this makes it easier to define the augmented rbtree callbacks. 
Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat_rbtree.c | 37 ++----------------------------------- 1 file changed, 2 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 7e1515b..4d11695 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -69,41 +69,8 @@ static u64 compute_subtree_max_end(struct memtype *data) return max_end; } -/* Update 'subtree_max_end' for node and its parents */ -static void memtype_rb_propagate_cb(struct rb_node *node, struct rb_node *stop) -{ - while (node != stop) { - struct memtype *data = container_of(node, struct memtype, rb); - u64 subtree_max_end = compute_subtree_max_end(data); - if (data->subtree_max_end == subtree_max_end) - break; - data->subtree_max_end = subtree_max_end; - node = rb_parent(&data->rb); - } -} - -static void memtype_rb_copy_cb(struct rb_node *old, struct rb_node *new) -{ - struct memtype *old_data = container_of(old, struct memtype, rb); - struct memtype *new_data = container_of(new, struct memtype, rb); - - new_data->subtree_max_end = old_data->subtree_max_end; -} - -/* Update 'subtree_max_end' after tree rotation. old and new are the - * former and current subtree roots */ -static void memtype_rb_rotate_cb(struct rb_node *old, struct rb_node *new) -{ - struct memtype *old_data = container_of(old, struct memtype, rb); - struct memtype *new_data = container_of(new, struct memtype, rb); - - new_data->subtree_max_end = old_data->subtree_max_end; - old_data->subtree_max_end = compute_subtree_max_end(old_data); -} - -static const struct rb_augment_callbacks memtype_rb_augment_cb = { - memtype_rb_propagate_cb, memtype_rb_copy_cb, memtype_rb_rotate_cb -}; +RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, + u64, subtree_max_end, compute_subtree_max_end) /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, -- cgit v1.1 From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:25 -0700 Subject: mm: replace vma prio_tree with an interval tree Implement an interval tree as a replacement for the VMA prio_tree. The algorithms are similar to lib/interval_tree.c; however that code can't be directly reused as the interval endpoints are not explicitly stored in the VMA. So instead, the common algorithm is moved into a template and the details (node type, how to get interval endpoints from the node, etc) are filled in using the C preprocessor. Once the interval tree functions are available, using them as a replacement to the VMA prio tree is a relatively simple, mechanical job. 
Signed-off-by: Michel Lespinasse
Cc: Rik van Riel
Cc: Hillf Danton
Cc: Peter Zijlstra
Cc: Catalin Marinas
Cc: Andrea Arcangeli
Cc: David Woodhouse
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86/mm/hugetlbpage.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index b91e485..937bff5 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -71,7 +71,6 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
 			vma->vm_pgoff;
-	struct prio_tree_iter iter;
 	struct vm_area_struct *svma;
 	unsigned long saddr;
 	pte_t *spte = NULL;
@@ -81,7 +80,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
 	mutex_lock(&mapping->i_mmap_mutex);
-	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
+	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
-- cgit v1.1

From 9c079add0d0f45220f4bb37febf0621137ec2d38 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse
Date: Mon, 8 Oct 2012 16:31:33 -0700
Subject: rbtree: move augmented rbtree functionality to rbtree_augmented.h

Provide rb_insert_augmented() and rb_erase_augmented() through a new
rbtree_augmented.h include file.  rb_erase_augmented() is defined there as
an __always_inline function, in order to allow inlining of augmented
rbtree callbacks into it.  Since this generates a relatively large
function, each augmented rbtree user should make sure to have a single
call site.

Signed-off-by: Michel Lespinasse
Cc: Rik van Riel
Cc: Hillf Danton
Cc: Peter Zijlstra
Cc: Catalin Marinas
Cc: Andrea Arcangeli
Cc: David Woodhouse
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86/mm/pat_rbtree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'arch/x86')

diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 4d11695..415f6c4 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -12,7 +12,7 @@
 #include
 #include
 #include
-#include <linux/rbtree.h>
+#include <linux/rbtree_augmented.h>
 #include
 #include
-- cgit v1.1

From e79bee24fd6134f90af4228cfebd010136d67631 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Mon, 8 Oct 2012 16:32:18 -0700
Subject: atomic: implement generic atomic_dec_if_positive()

The x86 implementation of atomic_dec_if_positive is quite generic, so
make it available to all architectures.

This is needed for "swap: add a simple detector for inappropriate swapin
readahead".

[akpm@linux-foundation.org: do the "#define foo foo" trick in the conventional manner]
Signed-off-by: Shaohua Li
Cc: Stephen Rothwell
Cc: "David S. Miller"
Cc: Rik van Riel
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: "H. Peter Anvin"
Cc: Benjamin Herrenschmidt
Cc: Michal Simek
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/atomic.h | 24 ------------------------
 1 file changed, 24 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 250b877..b6c3b82 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -240,30 +240,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 	return c;
 }
 
-/*
- * atomic_dec_if_positive - decrement by 1 if old value positive
- * @v: pointer of type atomic_t
- *
- * The function returns the old value of *v minus 1, even if
- * the atomic variable, v, was not decremented.
- */
-static inline int atomic_dec_if_positive(atomic_t *v)
-{
-	int c, old, dec;
-	c = atomic_read(v);
-	for (;;) {
-		dec = c - 1;
-		if (unlikely(dec < 0))
-			break;
-		old = atomic_cmpxchg((v), c, dec);
-		if (likely(old == c))
-			break;
-		c = old;
-	}
-	return dec;
-}
-
 /**
  * atomic_inc_short - increment of a short integer
  * @v: pointer to type int
-- cgit v1.1

From 45cac65b0fcd287ebb877b141d40ba9bbe8e5da7 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Mon, 8 Oct 2012 16:32:19 -0700
Subject: readahead: fault retry breaks mmap file read random detection

.fault can now retry, and the retry can break the .fault state machine.
In filemap_fault, if the page is missed, ra->mmap_miss is increased.  In
the second try, since the page is now in the page cache, ra->mmap_miss is
decreased.  Both happen within one fault, so random mmap file access can
no longer be detected.

Add a new flag to indicate that .fault has already been tried once.  In
the second try, skip the ra->mmap_miss decrease.  The filemap_fault state
machine is ok with it.

I only tested x86, didn't test other archs, but the change for other
archs looks obvious, but who knows :)

Signed-off-by: Shaohua Li
Cc: Rik van Riel
Cc: Wu Fengguang
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86/mm/fault.c | 1 +
 1 file changed, 1 insertion(+)
(limited to 'arch/x86')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a530b23..8e13ecb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1220,6 +1220,7 @@ good_area:
 			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
 			 * of starvation. */
 			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			flags |= FAULT_FLAG_TRIED;
 			goto retry;
 		}
 	}
-- cgit v1.1

From 027ef6c87853b0a9df53175063028edb4950d476 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli
Date: Mon, 8 Oct 2012 16:33:27 -0700
Subject: mm: thp: fix pmd_present for split_huge_page and PROT_NONE with THP

In many places !pmd_present has been converted to pmd_none.  For pmds
that's equivalent and pmd_none is quicker, so using pmd_none is better.

However (unless we delete pmd_present) we should provide an accurate
pmd_present too.  This will avoid the risk of code thinking the pmd is
non present because it's under __split_huge_page_map; see the
pmd_mknotpresent there and the comment above it.

If the page has been mprotected as PROT_NONE, it would also lead to a
pmd_present false negative in the same way as the race with
split_huge_page.

Because the PSE bit stays on at all times (both during split_huge_page
and when the _PAGE_PROTNONE bit gets set), we could check only the PSE
bit, but checking the PROTNONE bit too is still good, to remember that
pmd_present must always take PROT_NONE into account.

This explains a non-reproducible BUG_ON that was seldom reported on the
lists.  The same issue is in pmd_large; it would go wrong with both
PROT_NONE and if it races with split_huge_page.
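To make the three pmd states concrete, here is an illustrative, purely
hypothetical helper.  The flag names are the real x86 ones, but the function
exists only for this example and simply restates the check that the patch
below adds to pmd_present().

#include <asm/pgtable.h>

/*
 * States a THP pmd can be in while it still maps a huge page:
 *
 *   normal huge pmd:              _PAGE_PRESENT | _PAGE_PSE
 *   under __split_huge_page_map:                  _PAGE_PSE   (present temporarily cleared)
 *   after mprotect(PROT_NONE):   _PAGE_PROTNONE | _PAGE_PSE
 *
 * A _PAGE_PRESENT-only test would wrongly report the last two as "not
 * present", which is the bug being fixed here.
 */
static inline int pmd_maps_huge_page(pmd_t pmd)
{
	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
}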
Signed-off-by: Andrea Arcangeli
Acked-by: Rik van Riel
Cc: Johannes Weiner
Cc: Hugh Dickins
Cc: Mel Gorman
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/pgtable.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index fc99484..a1f780d 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -146,8 +146,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
 
 static inline int pmd_large(pmd_t pte)
 {
-	return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
-		(_PAGE_PSE | _PAGE_PRESENT);
+	return pmd_flags(pte) & _PAGE_PSE;
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -415,7 +414,13 @@ static inline int pte_hidden(pte_t pte)
 
 static inline int pmd_present(pmd_t pmd)
 {
-	return pmd_flags(pmd) & _PAGE_PRESENT;
+	/*
+	 * Checking for _PAGE_PSE is needed too because
+	 * split_huge_page will temporarily clear the present bit (but
+	 * the _PAGE_PSE flag will remain set at all times while the
+	 * _PAGE_PRESENT bit is clear).
+	 */
+	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
 }
 
 static inline int pmd_none(pmd_t pmd)
-- cgit v1.1

From b113da65785d5f3f9ff1451ec0fe43d6d76da25b Mon Sep 17 00:00:00 2001
From: David Miller
Date: Mon, 8 Oct 2012 16:34:25 -0700
Subject: mm: Add and use update_mmu_cache_pmd() in transparent huge page code.

The transparent huge page code passes a PMD pointer in as the third
argument of update_mmu_cache(), which expects a PTE pointer.

This never got noticed because X86 implements update_mmu_cache() as a
macro and thus we don't get any type checking, and X86 is the only
architecture which supports transparent huge pages currently.

Before other architectures can support transparent huge pages properly we
need to add a new interface which will take a PMD pointer as the third
argument rather than a PTE pointer.

[akpm@linux-foundation.org: implement update_mmu_cache_pmd() for s390]
Signed-off-by: David S. Miller
Cc: Andrea Arcangeli
Cc: Johannes Weiner
Cc: Gerald Schaefer
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/pgtable_32.h | 1 +
 arch/x86/include/asm/pgtable_64.h | 1 +
 2 files changed, 2 insertions(+)
(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 0c92113..8faa215 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -71,6 +71,7 @@ do { \
  * tables contain all the necessary information.
  */
 #define update_mmu_cache(vma, address, ptep) do { } while (0)
+#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 8251be0..47356f9 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -143,6 +143,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 #define pte_unmap(pte) ((void)(pte))/* NOP */
 
 #define update_mmu_cache(vma, address, ptep) do { } while (0)
+#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
 
 /* Encode and de-code a swap entry */
 #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
-- cgit v1.1
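To show where the hook added above sits, here is a simplified, hypothetical
sketch of a THP fault-path style caller.  It is modeled loosely on the shape
of the mm/huge_memory.c code but is not the exact upstream function, and the
function name is made up for this example.

#include <linux/mm.h>
#include <asm/pgtable.h>

/*
 * Hypothetical caller: install a freshly prepared huge page at 'haddr'
 * and tell the architecture about the new PMD mapping.
 */
static void thp_install_pmd(struct vm_area_struct *vma, unsigned long haddr,
			    pmd_t *pmd, struct page *page)
{
	pmd_t entry;

	entry = pmd_mkhuge(mk_pmd(page, vma->vm_page_prot));
	set_pmd_at(vma->vm_mm, haddr, pmd, entry);
	/*
	 * Previously the THP code called update_mmu_cache(vma, haddr, pmd)
	 * here, passing a pmd_t * where a pte_t * is expected; the new
	 * PMD-typed hook makes the call type-correct on all architectures.
	 */
	update_mmu_cache_pmd(vma, haddr, pmd);
}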