summaryrefslogtreecommitdiffstats
path: root/arch/x86/include/asm/xor_64.h
diff options
context:
space:
mode:
authorSuresh Siddha <suresh.b.siddha@intel.com>2012-08-24 14:13:00 -0700
committerH. Peter Anvin <hpa@linux.intel.com>2012-09-18 15:52:08 -0700
commit841e3604d35aa70d399146abdc526d8c89a2c2f5 (patch)
tree80d2266c21e5ae0b5f4097db3ee71888fc92bec1 /arch/x86/include/asm/xor_64.h
parent9c1c3fac53378c9782c18f80107965578d7b7167 (diff)
downloadop-kernel-dev-841e3604d35aa70d399146abdc526d8c89a2c2f5.zip
op-kernel-dev-841e3604d35aa70d399146abdc526d8c89a2c2f5.tar.gz
x86, fpu: always use kernel_fpu_begin/end() for in-kernel FPU usage
Use kernel_fpu_begin/end() instead of unconditionally accessing cr0 and saving/restoring just the few used xmm/ymm registers. This has some advantages like: * If the task's FPU state is already active, then kernel_fpu_begin() will just save the user-state, avoiding the read/write of cr0. In general, cr0 accesses are much slower. * Manual save/restore of xmm/ymm registers will affect the 'modified' and the 'init' optimizations brought in by the xsaveopt/xrstor infrastructure. * Forward compatibility with future vector register extensions will be a problem if the xmm/ymm registers are manually saved and restored (corrupting the extended state of those vector registers). With this patch, there was no significant difference in the xor throughput using AVX, measured during boot. Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Link: http://lkml.kernel.org/r/1345842782-24175-5-git-send-email-suresh.b.siddha@intel.com Cc: Jim Kukunas <james.t.kukunas@linux.intel.com> Cc: NeilBrown <neilb@suse.de> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'arch/x86/include/asm/xor_64.h')
-rw-r--r--arch/x86/include/asm/xor_64.h61
1 files changed, 9 insertions, 52 deletions
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index b9b2323..5fc06d0 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -34,41 +34,7 @@
* no advantages to be gotten from x86-64 here anyways.
*/
-typedef struct {
- unsigned long a, b;
-} __attribute__((aligned(16))) xmm_store_t;
-
-/* Doesn't use gcc to save the XMM registers, because there is no easy way to
- tell it to do a clts before the register saving. */
-#define XMMS_SAVE \
-do { \
- preempt_disable(); \
- asm volatile( \
- "movq %%cr0,%0 ;\n\t" \
- "clts ;\n\t" \
- "movups %%xmm0,(%1) ;\n\t" \
- "movups %%xmm1,0x10(%1) ;\n\t" \
- "movups %%xmm2,0x20(%1) ;\n\t" \
- "movups %%xmm3,0x30(%1) ;\n\t" \
- : "=&r" (cr0) \
- : "r" (xmm_save) \
- : "memory"); \
-} while (0)
-
-#define XMMS_RESTORE \
-do { \
- asm volatile( \
- "sfence ;\n\t" \
- "movups (%1),%%xmm0 ;\n\t" \
- "movups 0x10(%1),%%xmm1 ;\n\t" \
- "movups 0x20(%1),%%xmm2 ;\n\t" \
- "movups 0x30(%1),%%xmm3 ;\n\t" \
- "movq %0,%%cr0 ;\n\t" \
- : \
- : "r" (cr0), "r" (xmm_save) \
- : "memory"); \
- preempt_enable(); \
-} while (0)
+#include <asm/i387.h>
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
@@ -91,10 +57,8 @@ static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
unsigned int lines = bytes >> 8;
- unsigned long cr0;
- xmm_store_t xmm_save[4];
- XMMS_SAVE;
+ kernel_fpu_begin();
asm volatile(
#undef BLOCK
@@ -135,7 +99,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
: [inc] "r" (256UL)
: "memory");
- XMMS_RESTORE;
+ kernel_fpu_end();
}
static void
@@ -143,11 +107,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
unsigned int lines = bytes >> 8;
- xmm_store_t xmm_save[4];
- unsigned long cr0;
-
- XMMS_SAVE;
+ kernel_fpu_begin();
asm volatile(
#undef BLOCK
#define BLOCK(i) \
@@ -194,7 +155,7 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
: [inc] "r" (256UL)
: "memory");
- XMMS_RESTORE;
+ kernel_fpu_end();
}
static void
@@ -202,10 +163,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
unsigned int lines = bytes >> 8;
- xmm_store_t xmm_save[4];
- unsigned long cr0;
- XMMS_SAVE;
+ kernel_fpu_begin();
asm volatile(
#undef BLOCK
@@ -261,7 +220,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
: [inc] "r" (256UL)
: "memory" );
- XMMS_RESTORE;
+ kernel_fpu_end();
}
static void
@@ -269,10 +228,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
unsigned int lines = bytes >> 8;
- xmm_store_t xmm_save[4];
- unsigned long cr0;
- XMMS_SAVE;
+ kernel_fpu_begin();
asm volatile(
#undef BLOCK
@@ -336,7 +293,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
: [inc] "r" (256UL)
: "memory");
- XMMS_RESTORE;
+ kernel_fpu_end();
}
static struct xor_block_template xor_block_sse = {
OpenPOWER on IntegriCloud