summaryrefslogtreecommitdiff
path: root/arch/x86/lib
diff options
context:
space:
mode:
authorJan Beulich <JBeulich@suse.com>2012-01-26 15:55:32 +0000
committerIngo Molnar <mingo@elte.hu>2012-01-26 21:19:20 +0100
commit9d8e22777e66f420e46490e9fc6f8cb7e0e2222b (patch)
treedd0ec6122dda1409206dda70f6ae4fd3c9a2cd35 /arch/x86/lib
parent2ab560911a427fdc73bfd3a7d2944d8ee0ca6db8 (diff)
downloadlinux-3.10-9d8e22777e66f420e46490e9fc6f8cb7e0e2222b.tar.gz
linux-3.10-9d8e22777e66f420e46490e9fc6f8cb7e0e2222b.tar.bz2
linux-3.10-9d8e22777e66f420e46490e9fc6f8cb7e0e2222b.zip
x86-64: Handle byte-wise tail copying in memcpy() without a loop
While hard to measure, reducing the number of possibly/likely mis-predicted branches can generally be expected to be slightly better. Other than apparent at the first glance, this also doesn't grow the function size (the alignment gap to the next function just gets smaller). Signed-off-by: Jan Beulich <jbeulich@suse.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/lib')
-rw-r--r--arch/x86/lib/memcpy_64.S19
1 files changed, 10 insertions, 9 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 1235b04a9a6..1c273be7c97 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -164,18 +164,19 @@ ENTRY(memcpy)
retq
.p2align 4
.Lless_3bytes:
- cmpl $0, %edx
- je .Lend
+ subl $1, %edx
+ jb .Lend
/*
* Move data from 1 bytes to 3 bytes.
*/
-.Lloop_1:
- movb (%rsi), %r8b
- movb %r8b, (%rdi)
- incq %rdi
- incq %rsi
- decl %edx
- jnz .Lloop_1
+ movzbl (%rsi), %ecx
+ jz .Lstore_1byte
+ movzbq 1(%rsi), %r8
+ movzbq (%rsi, %rdx), %r9
+ movb %r8b, 1(%rdi)
+ movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+ movb %cl, (%rdi)
.Lend:
retq