summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Steve MacLean <sdmaclea@qti.qualcomm.com>, 2017-05-20 12:10:00 -0400
committer: Jan Kotas <jkotas@microsoft.com>, 2017-05-20 09:10:00 -0700
commit: f33807f6e5e3086d85e53969a9bbfff4bd7f631d (patch)
tree: cbc8ed7ff5110b90ddd5c71011c2fc556ae8b413
parent: 4f8be95166a30ea7c0b1d6aed4ef424ee47c425a (diff)
download: coreclr-f33807f6e5e3086d85e53969a9bbfff4bd7f631d.tar.gz
coreclr-f33807f6e5e3086d85e53969a9bbfff4bd7f631d.tar.bz2
coreclr-f33807f6e5e3086d85e53969a9bbfff4bd7f631d.zip
[Arm64/Win] Revise JIT_MemCpy (#11261)
* [Arm64/Win] Revise JIT_MemCpy: this is the Windows equivalent of #11143, which has already been merged.
* [Arm64/Win] Use b<cond> instead of the newer b.<cond>: it looks like the Windows arm64 assembler does not support the newer assembly mnemonic.
-rw-r--r-- src/vm/arm64/crthelpers.asm 176
1 files changed, 85 insertions, 91 deletions
diff --git a/src/vm/arm64/crthelpers.asm b/src/vm/arm64/crthelpers.asm
index 74a2186ed0..98ec982beb 100644
--- a/src/vm/arm64/crthelpers.asm
+++ b/src/vm/arm64/crthelpers.asm
@@ -160,64 +160,78 @@ JIT_MemSet_0xd8
; See comments above for JIT_MemSet
;void JIT_MemCpy(void *dst, const void *src, SIZE_T count)
-;{
+;
; // If not aligned then make it 8-byte aligned
; if(((uintptr_t)dst&0x7) != 0)
; {
-; if(((uintptr_t)dst&0x3) == 0)
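+; // Calculate alignment we can do without exceeding count (NOTE(review): the mask below only bounds align by the next power of two above count minus one, e.g. count == 2 with (dst & 0x7) == 3 yields align == 3 - confirm small-count behavior)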
+; // Use math to avoid introducing more unpredictable branches
+; // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
+; // Note logic will fail if count >= (1 << 61). But this exceeds max physical memory for arm64
+; uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
+;
+; if(align&0x1)
; {
-; *(UINT*)dst = *(UINT*)src;
-; dst = (UINT*)dst + 1;
-; src = (UINT*)src + 1;
-; count-=4;
+; *(uint8_t*)dst = *(uint8_t*)src;
+; dst = (uint8_t*)dst + 1;
+; src = (uint8_t*)src + 1;
+; count-=1;
; }
-; else if(((uintptr_t)dst&0x1) == 0)
+;
+; if(align&0x2)
; {
-; while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-; {
-; *(short*)dst = *(short*)src;
-; dst = (short*)dst + 1;
-; src = (short*)src + 1;
-; count-=2;
-; }
+; *(uint16_t*)dst = *(uint16_t*)src;
+; dst = (uint16_t*)dst + 1;
+; src = (uint16_t*)src + 1;
+; count-=2;
; }
-; else
+;
+; if(align&0x4)
; {
-; while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-; {
-; *(char*)dst = *(char*)src;
-; dst = (char*)dst + 1;
-; src = (char*)src + 1;
-; count--;
-; }
+; *(uint32_t*)dst = *(uint32_t*)src;
+; dst = (uint32_t*)dst + 1;
+; src = (uint32_t*)src + 1;
+; count-=4;
; }
; }
;
-; while(count >= 8)
+; count-=16;
+;
+; while(count >= 0)
; {
-; *(uintptr_t*)dst = *(uintptr_t*)src;
-; dst = (uintptr_t*)dst + 1;
-; src = (uintptr_t*)src + 1;
-; count-=8;
+; *(uint64_t*)dst = *(uint64_t*)src;
+; dst = (uint64_t*)dst + 1;
+; src = (uint64_t*)src + 1;
+; *(uint64_t*)dst = *(uint64_t*)src;
+; dst = (uint64_t*)dst + 1;
+; src = (uint64_t*)src + 1;
+; count-=16;
+; }
+;
+; if(count & 8)
+; {
+; *(uint64_t*)dst = *(uint64_t*)src;
+; dst = (uint64_t*)dst + 1;
+; src = (uint64_t*)src + 1;
; }
;
; if(count & 4)
; {
-; *(UINT*)dst = *(UINT*)src;
-; dst = (UINT*)dst + 1;
-; src = (UINT*)src + 1;
+; *(uint32_t*)dst = *(uint32_t*)src;
+; dst = (uint32_t*)dst + 1;
+; src = (uint32_t*)src + 1;
; }
;
; if(count & 2)
; {
-; *(short*)dst = *(short*)src;
-; dst = (short*)dst + 1;
-; src = (short*)src + 1;
+; *(uint16_t*)dst = *(uint16_t*)src;
+; dst = (uint16_t*)dst + 1;
+; src = (uint16_t*)src + 1;
; }
;
; if(count & 1)
; {
-; *(char*)dst = *(char*)src;
+; *(uint8_t*)dst = *(uint8_t*)src;
; }
;}
;
@@ -225,69 +239,49 @@ JIT_MemSet_0xd8
; Assembly code corresponding to above C++ method.
; See comments above for JIT_MemSet method
LEAF_ENTRY JIT_MemCpy
- and x8,x0,#7
- cbz x8,JIT_MemCpy_0x80
- and x8,x0,#3
- cbnz x8,JIT_MemCpy_0x2c
- ldr w8,[x1]
- str w8,[x0]
- add x0,x0,#4
- add x1,x1,#4
- mov x8,#-4
- add x2,x2,x8
- b JIT_MemCpy_0x80
+ ands x3, x0, #7
+ movn x4, #7
+ clz x5, x2
+ beq JIT_MemCpy_0xa8
+ lsr x4, x4, x5
+ and x3, x3, x4
+ tbz x3, #0, JIT_MemCpy_0x2c
+ ldrsb w8, [x1], #1
+ strb w8, [x0], #1
+ sub x2, x2, #1
JIT_MemCpy_0x2c
- cbz x2,JIT_MemCpy_0x80
- tbnz x0,#0,JIT_MemCpy_0x5c
-JIT_MemCpy_0x34
- and x8,x0,#7
- cbz x8,JIT_MemCpy_0x80
- ldrsh w8,[x1]
- strh w8,[x0]
- add x0,x0,#2
- add x1,x1,#2
- mov x8,#-2
- add x2,x2,x8
- cbnz x2,JIT_MemCpy_0x34
- b JIT_MemCpy_0x80
+ tbz x3, #1, JIT_MemCpy_0x5c
+ ldrsh w8, [x1], #2
+ strh w8, [x0], #2
+ sub x2, x2, #2
JIT_MemCpy_0x5c
- and x8,x0,#7
- cbz x8,JIT_MemCpy_0x80
- ldrsb w8,[x1]
- strb w8,[x0]
- add x0,x0,#1
- add x1,x1,#1
- mov x8,#-1
- add x2,x2,x8
- cbnz x2,JIT_MemCpy_0x5c
-JIT_MemCpy_0x80
- cmp x2,#8
- blo JIT_MemCpy_0xb4
- lsr x9,x2,#3
- mov x8,#-8
- madd x2,x9,x8,x2
+ tbz x3, #2, JIT_MemCpy_0xa8
+ ldr w8, [x1], #4
+ str w8, [x0], #4
+ sub x2, x2, #4
+ b JIT_MemCpy_0xa8
JIT_MemCpy_0xa0
- ldr x8,[x1],#8
- str x8,[x0],#8
- mov x8,#-1
- add x9,x9,x8
- cbnz x9,JIT_MemCpy_0xa0
+ ldp x8, x9, [x1], #16
+ stp x8, x9, [x0], #16
+JIT_MemCpy_0xa8
+ subs x2, x2, #16
+ bge JIT_MemCpy_0xa0
+JIT_MemCpy_0xb0
+ tbz x2, #3, JIT_MemCpy_0xb4
+ ldr x8, [x1], #8
+ str x8, [x0], #8
JIT_MemCpy_0xb4
- tbz x2,#2,JIT_MemCpy_0xc8
- ldr w8,[x1]
- str w8,[x0]
- add x0,x0,#4
- add x1,x1,#4
+ tbz x2, #2, JIT_MemCpy_0xc8
+ ldr w8, [x1], #4
+ str w8, [x0], #4
JIT_MemCpy_0xc8
- tbz x2,#1,JIT_MemCpy_0xdc
- ldrsh w8,[x1]
- strh w8,[x0]
- add x0,x0,#2
- add x1,x1,#2
+ tbz x2, #1, JIT_MemCpy_0xdc
+ ldrsh w8, [x1], #2
+ strh w8, [x0], #2
JIT_MemCpy_0xdc
- tbz x2,#0,JIT_MemCpy_0xe8
- ldrsb w8,[x1]
- strb w8,[x0]
+ tbz x2, #0, JIT_MemCpy_0xe8
+ ldrsb w8, [x1]
+ strb w8, [x0]
JIT_MemCpy_0xe8
ret lr
LEAF_END