author     Steve MacLean <sdmaclea.qdt@qualcommdatacenter.com>  2018-04-13 12:46:52 -0400
committer  Jan Kotas <jkotas@microsoft.com>  2018-04-13 09:46:52 -0700
commit     bc28740cd5f0533655f347fc315f6a28836a7efe (patch)
tree       25b747509ff980adf10982a810449e7b3418e752
parent     bba18ae06d9f132914ec77234bfd1c3ab9100d1a (diff)
[Arm64/Linux] Use platform memset/memcpy (#17536)
Fixes buggy memset implementation
Use heavily optimized platform implementation
Follows amd64 & arm precedent
-rw-r--r--  src/vm/amd64/crthelpers.S |   2
-rw-r--r--  src/vm/arm64/crthelpers.S | 344
2 files changed, 16 insertions(+), 330 deletions(-)
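What the patch does, rendered as rough C for orientation (a sketch only: the real helpers are the assembly below, and the tail call to libc plus the End markers emitted by LEAF_END_MARKED have no portable C equivalent). Each helper returns immediately on a zero count, touches the first destination (and, for memcpy, source) byte so that a fault on a bad pointer surfaces inside the helper itself where the exception handling code can attribute it, then defers to the optimized platform routine:

    #include <string.h>
    #include <stddef.h>

    // Sketch of the new JIT_MemSet: probe the destination, then defer to memset.
    void JIT_MemSet(void *dst, int val, size_t count)
    {
        if (count == 0)
            return;
        *(volatile unsigned char *)dst = (unsigned char)val; // strb w1, [x0]: an AV
                                                             // lands inside JIT_MemSet
        memset(dst, val, count);                             // platform implementation
    }

    // Sketch of the new JIT_MemCpy: probe both pointers, then defer to memcpy.
    void JIT_MemCpy(void *dst, const void *src, size_t count)
    {
        if (count == 0)
            return;
        *(volatile unsigned char *)dst = 0;            // strb wzr, [x0]
        (void)*(volatile const unsigned char *)src;    // ldrb wzr, [x1]
        memcpy(dst, src, count);
    }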
diff --git a/src/vm/amd64/crthelpers.S b/src/vm/amd64/crthelpers.S
index 168359e192..a9b54e4d3b 100644
--- a/src/vm/amd64/crthelpers.S
+++ b/src/vm/amd64/crthelpers.S
@@ -8,7 +8,7 @@
// JIT_MemSet/JIT_MemCpy
//
-// It is IMPORANT that the exception handling code is able to find these guys
+// It is IMPORTANT that the exception handling code is able to find these guys
// on the stack, but on non-windows platforms we can just defer to the platform
// implementation.
//
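The IMPORTANT comment above is about fault attribution: LEAF_END_MARKED, used in both files, is described in this diff's comments as emitting an End marker symbol after each helper, which lets the exception personality routine decide whether a faulting PC belongs to a helper with a simple range test. A hypothetical sketch of that test, with the marker names taken from the comments in this diff (the actual CLR code differs):

    #include <stdint.h>

    // Start and End markers for each helper. JIT_MemSet_End / JIT_MemCpy_End are
    // the marker symbols described in the comments below (names assumed).
    extern void JIT_MemSet(void);
    extern void JIT_MemSet_End(void);
    extern void JIT_MemCpy(void);
    extern void JIT_MemCpy_End(void);

    // Nonzero when a fault at pc occurred inside one of the JIT memory helpers,
    // so it can be rethrown as the corresponding managed exception.
    static int FaultedInJitMemoryHelper(uintptr_t pc)
    {
        return (pc >= (uintptr_t)&JIT_MemSet && pc < (uintptr_t)&JIT_MemSet_End) ||
               (pc >= (uintptr_t)&JIT_MemCpy && pc < (uintptr_t)&JIT_MemCpy_End);
    }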
diff --git a/src/vm/arm64/crthelpers.S b/src/vm/arm64/crthelpers.S
index c8b108ca8f..c0317edf57 100644
--- a/src/vm/arm64/crthelpers.S
+++ b/src/vm/arm64/crthelpers.S
@@ -2,347 +2,33 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
-// ==++==
-//
-
#include "unixasmmacros.inc"
+// JIT_MemSet/JIT_MemCpy
//
-// ==--==
-
-// Calls to JIT_MemSet are emitted by the JIT for initialization of large structs.
-// We need to provide our own implementation of memset instead of using the one in the CRT, because the CRT implementation
-// does not guarantee that aligned 8/4/2-byte memory will be written atomically. This is required because members in a
-// struct can be read atomically, so their values should also be written atomically.
-//
-//
-//void JIT_MemSet(void *dst, int val, SIZE_T count)
-//
-// uint64_t valEx = (unsigned char)val;
-// valEx = valEx | valEx << 8;
-// valEx = valEx | valEx << 16;
-// valEx = valEx | valEx << 32;
-//
-// size_t dc_zva_size = 4ULL << DCZID_EL0.BS;
-//
-// uint64_t use_dc_zva = (val == 0) && !DCZID_EL0.p ? count / (2 * dc_zva_size) : 0; // ~Minimum size (assumes worst case alignment)
-//
-// // If not aligned then make it 8-byte aligned
-// if(((uint64_t)dst&0xf) != 0)
-// {
-// // Calculate alignment we can do without exceeding count
-// // Use math to avoid introducing more unpredictable branches
-// // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
-// Note: the logic will fail if count >= (1 << 61), but this exceeds the max physical memory for arm64
-// uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
-//
-// if(align&0x1)
-// {
-// *(uint8_t*)dst = (uint8_t)valEx;
-// dst = (uint8_t*)dst + 1;
-// count-=1;
-// }
-//
-// if(align&0x2)
-// {
-// *(uint16_t*)dst = (uint16_t)valEx;
-// dst = (uint16_t*)dst + 1;
-// count-=2;
-// }
-//
-// if(align&0x4)
-// {
-// *(uint32_t*)dst = (uint32_t)valEx;
-// dst = (uint32_t*)dst + 1;
-// count-=4;
-// }
-// }
-//
-// if(use_dc_zva)
-// {
-// // If not aligned then make it aligned to dc_zva_size
-// if(dst&0x8)
-// {
-// *(uint64_t*)dst = (uint64_t)valEx;
-// dst = (uint64_t*)dst + 1;
-// count-=8;
-// }
+// It is IMPORTANT that the exception handling code is able to find these guys
+// on the stack, but on non-windows platforms we can just defer to the platform
+// implementation.
//
-// while(dst & (dc_zva_size - 1))
-// {
-// *(uint64_t*)dst = valEx;
-// dst = (uint64_t*)dst + 1;
-// *(uint64_t*)dst = valEx;
-// dst = (uint64_t*)dst + 1;
-// count-=16;
-// }
-//
-// count -= dc_zva_size;
-//
-// while(count >= 0)
-// {
-// dc_zva(dst);
-// dst = (uint8_t*)dst + dc_zva_size;
-// count-=dc_zva_size;
-// }
-//
-// count += dc_zva_size;
-// }
-//
-// count-=16;
-//
-// while(count >= 0)
-// {
-// *(uint64_t*)dst = valEx;
-// dst = (uint64_t*)dst + 1;
-// *(uint64_t*)dst = valEx;
-// dst = (uint64_t*)dst + 1;
-// count-=16;
-// }
-//
-// if(count & 8)
-// {
-// *(uint64_t*)dst = valEx;
-// dst = (uint64_t*)dst + 1;
-// }
-//
-// if(count & 4)
-// {
-// *(uint32_t*)dst = (uint32_t)valEx;
-// dst = (uint32_t*)dst + 1;
-// }
-//
-// if(count & 2)
-// {
-// *(uint16_t*)dst = (uint16_t)valEx;
-// dst = (uint16_t*)dst + 1;
-// }
-//
-// if(count & 1)
-// {
-// *(uint8_t*)dst = (uint8_t)valEx;
-// }
-//
-//
-
-// Assembly code corresponding to the above C++ method. JIT_MemSet can AV, and the CLR exception personality routine
-// needs to determine whether the exception took place inside JIT_MemSet in order to throw the corresponding managed
-// exception. Determining this is slow if the method is implemented as a C++ method (using unwind info). By adding a
-// JIT_MemSet_End marker in the .asm file, it can easily be determined whether the exception happened in JIT_MemSet.
-// Therefore, JIT_MemSet has been written in assembly instead of as a C++ method.
-
LEAF_ENTRY JIT_MemSet, _TEXT
- ands w8, w1, #0xff
- mrs x3, DCZID_EL0 // x3 = DCZID_EL0
- mov x6, #4
- lsr x11, x2, #3 // x11 = count >> 3
-
- orr w8, w8, w8, lsl #8
- and x5, x3, #0xf // x5 = dczid_el0.bs
- csel x11, x11, xzr, eq // x11 = (val == 0) ? count >> 3 : 0
- tst x3, (1 << 4)
-
- orr w8, w8, w8, lsl #0x10
- csel x11, x11, xzr, eq // x11 = (val == 0) && !DCZID_EL0.p ? count >> 3 : 0
- ands x3, x0, #7 // x3 = dst & 7
- lsl x9, x6, x5 // x9 = size
-
- orr x8, x8, x8, lsl #0x20
- lsr x11, x11, x5 // x11 = (val == 0) && !DCZID_EL0.p ? count >> (3 + DCZID_EL0.bs) : 0
- sub x10, x9, #1 // x10 = mask
-
- b.eq LOCAL_LABEL(JIT_MemSet_0x80)
-
- movn x4, #7
- clz x5, x2
- lsr x4, x4, x5
- and x3, x3, x4
+ cbz x2, LOCAL_LABEL(JIT_MemSet_ret)
- tbz x3, #0, LOCAL_LABEL(JIT_MemSet_0x2c)
- strb w8, [x0], #1
- sub x2, x2, #1
-LOCAL_LABEL(JIT_MemSet_0x2c):
- tbz x3, #1, LOCAL_LABEL(JIT_MemSet_0x5c)
- strh w8, [x0], #2
- sub x2, x2, #2
-LOCAL_LABEL(JIT_MemSet_0x5c):
- tbz x3, #2, LOCAL_LABEL(JIT_MemSet_0x80)
- str w8, [x0], #4
- sub x2, x2, #4
-LOCAL_LABEL(JIT_MemSet_0x80):
- cbz x11, LOCAL_LABEL(JIT_MemSet_0x9c)
- tbz x0, #3, LOCAL_LABEL(JIT_MemSet_0x84)
- str x8, [x0], #8
- sub x2, x2, #8
+ strb w1, [x0]
- b LOCAL_LABEL(JIT_MemSet_0x85)
-LOCAL_LABEL(JIT_MemSet_0x84):
- stp x8, x8, [x0], #16
- sub x2, x2, #16
-LOCAL_LABEL(JIT_MemSet_0x85):
- tst x0, x10
- b.ne LOCAL_LABEL(JIT_MemSet_0x84)
+ b C_PLTFUNC(memset)
- b LOCAL_LABEL(JIT_MemSet_0x8a)
-LOCAL_LABEL(JIT_MemSet_0x88):
- dc zva, x0
- add x0, x0, x9
-LOCAL_LABEL(JIT_MemSet_0x8a):
- subs x2, x2, x9
- b.ge LOCAL_LABEL(JIT_MemSet_0x88)
-
-LOCAL_LABEL(JIT_MemSet_0x8c):
- add x2, x2, x9
-
-LOCAL_LABEL(JIT_MemSet_0x9c):
- b LOCAL_LABEL(JIT_MemSet_0xa8)
-LOCAL_LABEL(JIT_MemSet_0xa0):
- stp x8, x8, [x0], #16
-LOCAL_LABEL(JIT_MemSet_0xa8):
- subs x2, x2, #16
- b.ge LOCAL_LABEL(JIT_MemSet_0xa0)
-
-LOCAL_LABEL(JIT_MemSet_0xb0):
- tbz x2, #3, LOCAL_LABEL(JIT_MemSet_0xb4)
- str x8, [x0], #8
-LOCAL_LABEL(JIT_MemSet_0xb4):
- tbz x2, #2, LOCAL_LABEL(JIT_MemSet_0xc8)
- str w8, [x0], #4
-LOCAL_LABEL(JIT_MemSet_0xc8):
- tbz x2, #1, LOCAL_LABEL(JIT_MemSet_0xdc)
- strh w8, [x0], #2
-LOCAL_LABEL(JIT_MemSet_0xdc):
- tbz x2, #0, LOCAL_LABEL(JIT_MemSet_0xe8)
- strb w8, [x0]
-LOCAL_LABEL(JIT_MemSet_0xe8):
+LOCAL_LABEL(JIT_MemSet_ret):
ret lr
LEAF_END_MARKED JIT_MemSet, _TEXT
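For reference, the deleted fast path above sized its dc zva blocks from DCZID_EL0 exactly as the pseudocode's 4ULL << DCZID_EL0.BS line describes (mrs x3, DCZID_EL0; and x5, x3, #0xf; lsl x9, x6, x5). A minimal AArch64-only sketch of that derivation, as a hypothetical helper:

    #include <stdint.h>
    #include <stddef.h>

    // Derives the DC ZVA block size the way the removed code above did:
    // DCZID_EL0.BS (bits [3:0]) is log2 of the block size in 4-byte words,
    // and bit 4 (DZP) set means DC ZVA is prohibited.
    static size_t dc_zva_block_size(void)
    {
        uint64_t dczid;
        __asm__ volatile("mrs %0, dczid_el0" : "=r"(dczid));
        if (dczid & (1u << 4))              // DZP: zeroing prohibited
            return 0;
        return (size_t)4 << (dczid & 0xf);  // 4ULL << DCZID_EL0.BS
    }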
-// See comments above for JIT_MemSet
+LEAF_ENTRY JIT_MemCpy, _TEXT
+ cbz x2, LOCAL_LABEL(JIT_MemCpy_ret)
-//void JIT_MemCpy(void *dst, const void *src, SIZE_T count)
-//
-// // If not aligned then make it 8-byte aligned
-// if(((uintptr_t)dst&0x7) != 0)
-// {
-// // Calculate alignment we can do without exceeding count
-// // Use math to avoid introducing more unpredictable branches
-// // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
-// Note: the logic will fail if count >= (1 << 61), but this exceeds the max physical memory for arm64
-// uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
-//
-// if(align&0x1)
-// {
-// *(uint8_t*)dst = *(uint8_t*)src;
-// dst = (uint8_t*)dst + 1;
-// src = (uint8_t*)src + 1;
-// count-=1;
-// }
-//
-// if(align&0x2)
-// {
-// *(uint16_t*)dst = *(uint16_t*)src;
-// dst = (uint16_t*)dst + 1;
-// src = (uint16_t*)src + 1;
-// count-=2;
-// }
-//
-// if(align&0x4)
-// {
-// *(uint32_t*)dst = *(uint32_t*)src;
-// dst = (uint32_t*)dst + 1;
-// src = (uint32_t*)src + 1;
-// count-=4;
-// }
-// }
-//
-// count-=16;
-//
-// while(count >= 0)
-// {
-// *(uint64_t*)dst = *(uint64_t*)src;
-// dst = (uint64_t*)dst + 1;
-// src = (uint64_t*)src + 1;
-// *(uint64_t*)dst = *(uint64_t*)src;
-// dst = (uint64_t*)dst + 1;
-// src = (uint64_t*)src + 1;
-// count-=16;
-// }
-//
-// if(count & 8)
-// {
-// *(uint64_t*)dst = *(uint64_t*)src;
-// dst = (uint64_t*)dst + 1;
-// src = (uint64_t*)src + 1;
-// }
-//
-// if(count & 4)
-// {
-// *(uint32_t*)dst = *(uint32_t*)src;
-// dst = (uint32_t*)dst + 1;
-// src = (uint32_t*)src + 1;
-// }
-//
-// if(count & 2)
-// {
-// *(uint16_t*)dst = *(uint16_t*)src;
-// dst = (uint16_t*)dst + 1;
-// src = (uint16_t*)src + 1;
-// }
-//
-// if(count & 1)
-// {
-// *(uint8_t*)dst = *(uint8_t*)src;
-// }
-//
-//
+ strb wzr, [x0]
+ ldrb wzr, [x1]
-// Assembly code corresponding to the above C++ method.
-// See the comments above for the JIT_MemSet method.
-LEAF_ENTRY JIT_MemCpy, _TEXT
- ands x3, x0, #7
- movn x4, #7
- clz x5, x2
- b.eq LOCAL_LABEL(JIT_MemCpy_0xa8)
- lsr x4, x4, x5
- and x3, x3, x4
- tbz x3, #0, LOCAL_LABEL(JIT_MemCpy_0x2c)
- ldrsb w8, [x1], #1
- strb w8, [x0], #1
- sub x2, x2, #1
-LOCAL_LABEL(JIT_MemCpy_0x2c):
- tbz x3, #1, LOCAL_LABEL(JIT_MemCpy_0x5c)
- ldrsh w8, [x1], #2
- strh w8, [x0], #2
- sub x2, x2, #2
-LOCAL_LABEL(JIT_MemCpy_0x5c):
- tbz x3, #2, LOCAL_LABEL(JIT_MemCpy_0xa8)
- ldr w8, [x1], #4
- str w8, [x0], #4
- sub x2, x2, #4
- b LOCAL_LABEL(JIT_MemCpy_0xa8)
-LOCAL_LABEL(JIT_MemCpy_0xa0):
- ldp x8, x9, [x1], #16
- stp x8, x9, [x0], #16
-LOCAL_LABEL(JIT_MemCpy_0xa8):
- subs x2, x2, #16
- b.ge LOCAL_LABEL(JIT_MemCpy_0xa0)
-LOCAL_LABEL(JIT_MemCpy_0xb0):
- tbz x2, #3, LOCAL_LABEL(JIT_MemCpy_0xb4)
- ldr x8, [x1], #8
- str x8, [x0], #8
-LOCAL_LABEL(JIT_MemCpy_0xb4):
- tbz x2, #2, LOCAL_LABEL(JIT_MemCpy_0xc8)
- ldr w8, [x1], #4
- str w8, [x0], #4
-LOCAL_LABEL(JIT_MemCpy_0xc8):
- tbz x2, #1, LOCAL_LABEL(JIT_MemCpy_0xdc)
- ldrsh w8, [x1], #2
- strh w8, [x0], #2
-LOCAL_LABEL(JIT_MemCpy_0xdc):
- tbz x2, #0, LOCAL_LABEL(JIT_MemCpy_0xe8)
- ldrsb w8, [x1]
- strb w8, [x0]
-LOCAL_LABEL(JIT_MemCpy_0xe8):
+ b C_PLTFUNC(memcpy)
+
+LOCAL_LABEL(JIT_MemCpy_ret):
ret lr
LEAF_END_MARKED JIT_MemCpy, _TEXT
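A closing note on the guarantee the deleted comment described ("aligned 8/4/2-byte memory will be written atomically"): managed struct members can be read concurrently, so an initializing store must not tear. A minimal illustration of the hazard, with an invented type that is not from the patch; the new code simply defers this concern to the platform memset/memcpy:

    #include <stdint.h>

    struct Obj { uint64_t field; };    // invented type, for illustration only

    // Tearing hazard: eight 1-byte stores let a concurrent reader of the
    // aligned 8-byte field observe a half-written value.
    static void clear_bytewise(struct Obj *o)
    {
        unsigned char *p = (unsigned char *)&o->field;
        for (int i = 0; i < 8; i++)
            p[i] = 0;
    }

    // A single aligned 8-byte store (one str on arm64): a reader observes
    // either the old value or zero, never a mixture.
    static void clear_wordwise(struct Obj *o)
    {
        *(volatile uint64_t *)&o->field = 0;
    }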