diff options
author | Steve MacLean <sdmaclea@qti.qualcomm.com> | 2017-05-20 12:10:00 -0400 |
---|---|---|
committer | Jan Kotas <jkotas@microsoft.com> | 2017-05-20 09:10:00 -0700 |
commit | f33807f6e5e3086d85e53969a9bbfff4bd7f631d (patch) | |
tree | cbc8ed7ff5110b90ddd5c71011c2fc556ae8b413 | |
parent | 4f8be95166a30ea7c0b1d6aed4ef424ee47c425a (diff) | |
download | coreclr-f33807f6e5e3086d85e53969a9bbfff4bd7f631d.tar.gz coreclr-f33807f6e5e3086d85e53969a9bbfff4bd7f631d.tar.bz2 coreclr-f33807f6e5e3086d85e53969a9bbfff4bd7f631d.zip |
[Arm64/Win] Revise JIT_MemCpy (#11261)
* [Arm64/Win] Revise JIT_MemCpy
This is the Windows equivalent of #11143 which merged
* [Arm64/Win] Use b<cond> instead of newer b.<cond>
Looks like Windows arm64 assembler does not support the newer assembly mnemonic
-rw-r--r-- | src/vm/arm64/crthelpers.asm | 176 |
1 files changed, 85 insertions, 91 deletions
diff --git a/src/vm/arm64/crthelpers.asm b/src/vm/arm64/crthelpers.asm index 74a2186ed0..98ec982beb 100644 --- a/src/vm/arm64/crthelpers.asm +++ b/src/vm/arm64/crthelpers.asm @@ -160,64 +160,78 @@ JIT_MemSet_0xd8 ; See comments above for JIT_MemSet ;void JIT_MemCpy(void *dst, const void *src, SIZE_T count) -;{ +; ; // If not aligned then make it 8-byte aligned ; if(((uintptr_t)dst&0x7) != 0) ; { -; if(((uintptr_t)dst&0x3) == 0) +; // Calculate alignment we can do without exceeding count +; // Use math to avoid introducing more unpredictable branches +; // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0 +; // Note logic will fail if count >= (1 << 61). But this exceeds max physical memory for arm64 +; uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64)) +; +; if(align&0x1) ; { -; *(UINT*)dst = *(UINT*)src; -; dst = (UINT*)dst + 1; -; src = (UINT*)src + 1; -; count-=4; +; *(unit8_t*)dst = *(unit8_t*)src; +; dst = (unit8_t*)dst + 1; +; src = (unit8_t*)src + 1; +; count-=1; ; } -; else if(((uintptr_t)dst&0x1) == 0) +; +; if(align&0x2) ; { -; while(count > 0 && ((uintptr_t)dst&0x7) != 0) -; { -; *(short*)dst = *(short*)src; -; dst = (short*)dst + 1; -; src = (short*)src + 1; -; count-=2; -; } +; *(unit16_t*)dst = *(unit16_t*)src; +; dst = (unit16_t*)dst + 1; +; src = (unit16_t*)src + 1; +; count-=2; ; } -; else +; +; if(align&0x4) ; { -; while(count > 0 && ((uintptr_t)dst&0x7) != 0) -; { -; *(char*)dst = *(char*)src; -; dst = (char*)dst + 1; -; src = (char*)src + 1; -; count--; -; } +; *(unit32_t*)dst = *(unit32_t*)src; +; dst = (unit32_t*)dst + 1; +; src = (unit32_t*)src + 1; +; count-=4; ; } ; } ; -; while(count >= 8) +; count-=16; +; +; while(count >= 0) ; { -; *(uintptr_t*)dst = *(uintptr_t*)src; -; dst = (uintptr_t*)dst + 1; -; src = (uintptr_t*)src + 1; -; count-=8; +; *(unit64_t*)dst = *(unit64_t*)src; +; dst = (unit64_t*)dst + 1; +; src = (unit64_t*)src + 1; +; *(unit64_t*)dst = *(unit64_t*)src; +; dst = (unit64_t*)dst + 1; +; src = (unit64_t*)src + 1; +; count-=16; +; } +; +; if(count & 8) +; { +; *(unit64_t*)dst = *(unit64_t*)src; +; dst = (unit64_t*)dst + 1; +; src = (unit64_t*)src + 1; ; } ; ; if(count & 4) ; { -; *(UINT*)dst = *(UINT*)src; -; dst = (UINT*)dst + 1; -; src = (UINT*)src + 1; +; *(unit32_t*)dst = *(unit32_t*)src; +; dst = (unit32_t*)dst + 1; +; src = (unit32_t*)src + 1; ; } ; ; if(count & 2) ; { -; *(short*)dst = *(short*)src; -; dst = (short*)dst + 1; -; src = (short*)src + 1; +; *(unit16_t*)dst = *(unit16_t*)src; +; dst = (unit16_t*)dst + 1; +; src = (unit16_t*)src + 1; ; } ; ; if(count & 1) ; { -; *(char*)dst = *(char*)src; +; *(unit8_t*)dst = *(unit8_t*)src; ; } ;} ; @@ -225,69 +239,49 @@ JIT_MemSet_0xd8 ; Assembly code corresponding to above C++ method. ; See comments above for JIT_MemSet method LEAF_ENTRY JIT_MemCpy - and x8,x0,#7 - cbz x8,JIT_MemCpy_0x80 - and x8,x0,#3 - cbnz x8,JIT_MemCpy_0x2c - ldr w8,[x1] - str w8,[x0] - add x0,x0,#4 - add x1,x1,#4 - mov x8,#-4 - add x2,x2,x8 - b JIT_MemCpy_0x80 + ands x3, x0, #7 + movn x4, #7 + clz x5, x2 + beq JIT_MemCpy_0xa8 + lsr x4, x4, x5 + and x3, x3, x4 + tbz x3, #0, JIT_MemCpy_0x2c + ldrsb w8, [x1], #1 + strb w8, [x0], #1 + sub x2, x2, #1 JIT_MemCpy_0x2c - cbz x2,JIT_MemCpy_0x80 - tbnz x0,#0,JIT_MemCpy_0x5c -JIT_MemCpy_0x34 - and x8,x0,#7 - cbz x8,JIT_MemCpy_0x80 - ldrsh w8,[x1] - strh w8,[x0] - add x0,x0,#2 - add x1,x1,#2 - mov x8,#-2 - add x2,x2,x8 - cbnz x2,JIT_MemCpy_0x34 - b JIT_MemCpy_0x80 + tbz x3, #1, JIT_MemCpy_0x5c + ldrsh w8, [x1], #2 + strh w8, [x0], #2 + sub x2, x2, #2 JIT_MemCpy_0x5c - and x8,x0,#7 - cbz x8,JIT_MemCpy_0x80 - ldrsb w8,[x1] - strb w8,[x0] - add x0,x0,#1 - add x1,x1,#1 - mov x8,#-1 - add x2,x2,x8 - cbnz x2,JIT_MemCpy_0x5c -JIT_MemCpy_0x80 - cmp x2,#8 - blo JIT_MemCpy_0xb4 - lsr x9,x2,#3 - mov x8,#-8 - madd x2,x9,x8,x2 + tbz x3, #2, JIT_MemCpy_0xa8 + ldr w8, [x1], #4 + str w8, [x0], #4 + sub x2, x2, #4 + b JIT_MemCpy_0xa8 JIT_MemCpy_0xa0 - ldr x8,[x1],#8 - str x8,[x0],#8 - mov x8,#-1 - add x9,x9,x8 - cbnz x9,JIT_MemCpy_0xa0 + ldp x8, x9, [x1], #16 + stp x8, x9, [x0], #16 +JIT_MemCpy_0xa8 + subs x2, x2, #16 + bge JIT_MemCpy_0xa0 +JIT_MemCpy_0xb0 + tbz x2, #3, JIT_MemCpy_0xb4 + ldr x8, [x1], #8 + str x8, [x0], #8 JIT_MemCpy_0xb4 - tbz x2,#2,JIT_MemCpy_0xc8 - ldr w8,[x1] - str w8,[x0] - add x0,x0,#4 - add x1,x1,#4 + tbz x2, #2, JIT_MemCpy_0xc8 + ldr w8, [x1], #4 + str w8, [x0], #4 JIT_MemCpy_0xc8 - tbz x2,#1,JIT_MemCpy_0xdc - ldrsh w8,[x1] - strh w8,[x0] - add x0,x0,#2 - add x1,x1,#2 + tbz x2, #1, JIT_MemCpy_0xdc + ldrsh w8, [x1], #2 + strh w8, [x0], #2 JIT_MemCpy_0xdc - tbz x2,#0,JIT_MemCpy_0xe8 - ldrsb w8,[x1] - strb w8,[x0] + tbz x2, #0, JIT_MemCpy_0xe8 + ldrsb w8, [x1] + strb w8, [x0] JIT_MemCpy_0xe8 ret lr LEAF_END |