summaryrefslogtreecommitdiff
path: root/arch/x86/include
diff options
context:
space:
mode:
authorJan Beulich <JBeulich@suse.com>2012-11-02 14:20:24 +0000
committerIngo Molnar <mingo@kernel.org>2013-01-25 09:23:50 +0100
commitf317820cb6ee3fb173319bf76e0e62437be78ad2 (patch)
treefc57358da4ba9f11a8d80e508d01e99c2c62c1f9 /arch/x86/include
parente8f6e3f8a14bae98197c6d9f280cd23d22eb1a33 (diff)
downloadlinux-3.10-f317820cb6ee3fb173319bf76e0e62437be78ad2.tar.gz
linux-3.10-f317820cb6ee3fb173319bf76e0e62437be78ad2.tar.bz2
linux-3.10-f317820cb6ee3fb173319bf76e0e62437be78ad2.zip
x86/xor: Add alternative SSE implementation only prefetching once per 64-byte line
On CPUs with 64-byte last level cache lines, this yields roughly 10% better performance, independent of CPU vendor or specific model (as far as I was able to test). Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/5093E4B802000078000A615E@nat28.tlf.novell.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/include')
-rw-r--r--arch/x86/include/asm/xor.h172
-rw-r--r--arch/x86/include/asm/xor_32.h23
-rw-r--r--arch/x86/include/asm/xor_64.h10
3 files changed, 187 insertions, 18 deletions
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index c661571ca0b..d8829751b3f 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -58,6 +58,14 @@
#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
+#define NOP(x)
+
+#define BLK64(pf, op, i) \
+ pf(i) \
+ op(i, 0) \
+ op(i + 1, 1) \
+ op(i + 2, 2) \
+ op(i + 3, 3)
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
@@ -111,6 +119,40 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
+xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
@@ -170,6 +212,43 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
+xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
@@ -236,6 +315,45 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
+xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(PF3, XO3, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1),
+ [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
@@ -308,12 +426,63 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
kernel_fpu_end();
}
+static void
+xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(PF3, XO3, i) \
+ BLK64(PF4, XO4, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " add %[inc], %[p5] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+ [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_sse_pf64 = {
+ .name = "prefetch64-sse",
+ .do_2 = xor_sse_2_pf64,
+ .do_3 = xor_sse_3_pf64,
+ .do_4 = xor_sse_4_pf64,
+ .do_5 = xor_sse_5_pf64,
+};
+
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
+#undef NOP
+#undef BLK64
#undef BLOCK
#undef XOR_CONSTANT_CONSTRAINT
@@ -324,4 +493,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
# include <asm/xor_64.h>
#endif
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+ AVX_SELECT(FASTEST)
+
#endif /* _ASM_X86_XOR_H */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index b85dc87f3cc..ce05722e3c6 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -543,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
/* Also try the generic routines. */
#include <asm-generic/xor.h>
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load into the L1 only depending on how the cpu
+ deals with a load to a line that is being prefetched. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_8regs_p); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_32regs_p); \
AVX_XOR_SPEED; \
- if (cpu_has_xmm) \
+ if (cpu_has_xmm) { \
xor_speed(&xor_block_pIII_sse); \
- if (cpu_has_mmx) { \
+ xor_speed(&xor_block_sse_pf64); \
+ } else if (cpu_has_mmx) { \
xor_speed(&xor_block_pII_mmx); \
xor_speed(&xor_block_p5_mmx); \
+ } else { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
} \
} while (0)
-/* We force the use of the SSE xor block because it can write around L2.
- We may also be able to load into the L1 only depending on how the cpu
- deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
- AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
-
#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1baf89dcc42..546f1e3b87c 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -13,17 +13,15 @@ static struct xor_block_template xor_block_sse = {
/* Also try the AVX routines */
#include <asm/xor_avx.h>
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load into the L1 only depending on how the cpu
+ deals with a load to a line that is being prefetched. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
AVX_XOR_SPEED; \
+ xor_speed(&xor_block_sse_pf64); \
xor_speed(&xor_block_sse); \
} while (0)
-/* We force the use of the SSE xor block because it can write around L2.
- We may also be able to load into the L1 only depending on how the cpu
- deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
- AVX_SELECT(&xor_block_sse)
-
#endif /* _ASM_X86_XOR_64_H */