author    Steve MacLean <sdmaclea.qdt@qualcommdatacenter.com>  2018-04-13 12:46:19 -0400
committer Jan Kotas <jkotas@microsoft.com>  2018-04-13 09:46:19 -0700
commit    bba18ae06d9f132914ec77234bfd1c3ab9100d1a (patch)
tree      977b636c343e5c0664fc5d3a8264c01bc36ecd29
parent    d81f619201a010fe92bcdaa0c09569f7390a69ae (diff)
[Arm64/Windows] Simplify JIT_MemSet & JIT_MemCpy (#17537)
JIT_MemSet alignment code was definitely broken for some unaligned cases. JIT_MemCpy likely had the same issue. Simplify the implementation to reduce the maintenance burden.
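For reference, a minimal C sketch of the fill loop the simplified JIT_MemSet assembly below implements (the name JIT_MemSet_sketch and the word-pointer casts are illustrative, mirroring the pseudocode style already in crthelpers.asm, and are not part of the patch):

#include <stdint.h>
#include <stddef.h>

void JIT_MemSet_sketch(void *dst, int val, size_t count)
{
    // Replicate the fill byte across all 8 bytes of a word.
    uint64_t valEx = (unsigned char)val;
    valEx |= valEx << 8;
    valEx |= valEx << 16;
    valEx |= valEx << 32;

    uint8_t *d = (uint8_t *)dst;

    // Bulk loop: 16 bytes per iteration (stp x1, x1, [x0], #16 in the asm).
    while (count >= 16) {
        ((uint64_t *)d)[0] = valEx;
        ((uint64_t *)d)[1] = valEx;
        d += 16;
        count -= 16;
    }

    // Tail: test each remaining size bit once, largest first, mirroring
    // the tbz chain in the asm.
    if (count & 8) { *(uint64_t *)d = valEx;           d += 8; }
    if (count & 4) { *(uint32_t *)d = (uint32_t)valEx; d += 4; }
    if (count & 2) { *(uint16_t *)d = (uint16_t)valEx; d += 2; }
    if (count & 1) { *d = (uint8_t)valEx; }
}

As in the assembly, the tail performs at most one store of each size, largest first, so a naturally aligned 8-, 4-, or 2-byte field is always covered by a single store, which ARMv8 guarantees to be single-copy atomic.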
-rw-r--r--  src/vm/arm64/crthelpers.asm | 259
1 file changed, 37 insertions(+), 222 deletions(-)
diff --git a/src/vm/arm64/crthelpers.asm b/src/vm/arm64/crthelpers.asm
index af0b672975..df282ee895 100644
--- a/src/vm/arm64/crthelpers.asm
+++ b/src/vm/arm64/crthelpers.asm
@@ -12,12 +12,6 @@
TEXTAREA
-; Calls to JIT_MemSet are emitted by the JIT for initialization of large structs.
-; We need to provide our own implementation of memset instead of using the one in the CRT, because the CRT implementation does not guarantee
-; that aligned 8/4/2-byte memory will be written atomically. This is required because members in a struct can be read atomically,
-; and their values should be written atomically.
-;
-;
;void JIT_MemSet(void *dst, int val, SIZE_T count)
;{
; uint64_t valEx = (unsigned char)val;
@@ -25,72 +19,6 @@
; valEx = valEx | valEx << 16;
; valEx = valEx | valEx << 32;
;
-; size_t dc_zva_size = 4ULL << DCZID_EL0.BS;
-;
-; uint64_t use_dc_zva = (val == 0) && !DCZID_EL0.p ? count / (2 * dc_zva_size) : 0; // ~Minimum size (assumes worst case alignment)
-;
-; // If not aligned then make it 8-byte aligned
-; if(((uint64_t)dst&0xf) != 0)
-; {
-; // Calculate alignment we can do without exceeding count
-; // Use math to avoid introducing more unpredictable branches
-; // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
-; Note logic will fail if count >= (1 << 61). But this exceeds max physical memory for arm64
-; uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
-;
-; if(align&0x1)
-; {
-; *(uint8_t*)dst = (uint8_t)valEx;
-; dst = (uint8_t*)dst + 1;
-; count-=1;
-; }
-;
-; if(align&0x2)
-; {
-; *(uint16_t*)dst = (uint16_t)valEx;
-; dst = (uint16_t*)dst + 1;
-; count-=2;
-; }
-;
-; if(align&0x4)
-; {
-; *(uint32_t*)dst = (uint32_t)valEx;
-; dst = (uint32_t*)dst + 1;
-; count-=4;
-; }
-; }
-;
-; if(use_dc_zva)
-; {
-; // If not aligned then make it aligned to dc_zva_size
-; if(dst&0x8)
-; {
-; *(uint64_t*)dst = (uint64_t)valEx;
-; dst = (uint64_t*)dst + 1;
-; count-=8;
-; }
-;
-; while(dst & (dc_zva_size - 1))
-; {
-; *(uint64_t*)dst = valEx;
-; dst = (uint64_t*)dst + 1;
-; *(uint64_t*)dst = valEx;
-; dst = (uint64_t*)dst + 1;
-; count-=16;
-; }
-;
-; count -= dc_zva_size;
-;
-; while(count >= 0)
-; {
-; dc_zva(dst);
-; dst = (uint8_t*)dst + dc_zva_size;
-; count-=dc_zva_size;
-; }
-;
-; count += dc_zva_size;
-; }
-;
; count-=16;
;
; while(count >= 0)
@@ -134,89 +62,30 @@
; as C++ method.
LEAF_ENTRY JIT_MemSet
- ands w8, w1, #0xff
- mrs x3, DCZID_EL0 ; x3 = DCZID_EL0
- mov x6, #4
- lsr x11, x2, #3 ; x11 = count >> 3
-
- orr w8, w8, w8, lsl #8
- and x5, x3, #0xf ; x5 = dczid_el0.bs
- cseleq x11, x11, xzr ; x11 = (val == 0) ? count >> 3 : 0
- tst x3, (1 << 4)
-
- orr w8, w8, w8, lsl #0x10
- cseleq x11, x11, xzr ; x11 = (val == 0) && !DCZID_EL0.p ? count >> 3 : 0
- ands x3, x0, #7 ; x3 = dst & 7
- lsl x9, x6, x5 ; x9 = size
-
- orr x8, x8, x8, lsl #0x20
- lsr x11, x11, x5 ; x11 = (val == 0) && !DCZID_EL0.p ? count >> (3 + DCZID_EL0.bs) : 0
- sub x10, x9, #1 ; x10 = mask
-
- beq JIT_MemSet_0x80
-
- movn x4, #7
- clz x5, x2
- lsr x4, x4, x5
- and x3, x3, x4
-
- tbz x3, #0, JIT_MemSet_0x2c
- strb w8, [x0], #1
- sub x2, x2, #1
-JIT_MemSet_0x2c
- tbz x3, #1, JIT_MemSet_0x5c
- strh w8, [x0], #2
- sub x2, x2, #2
-JIT_MemSet_0x5c
- tbz x3, #2, JIT_MemSet_0x80
- str w8, [x0], #4
- sub x2, x2, #4
-JIT_MemSet_0x80
- cbz x11, JIT_MemSet_0x9c
- tbz x0, #3, JIT_MemSet_0x84
- str x8, [x0], #8
- sub x2, x2, #8
-
- b JIT_MemSet_0x85
-JIT_MemSet_0x84
- stp x8, x8, [x0], #16
- sub x2, x2, #16
-JIT_MemSet_0x85
- tst x0, x10
- bne JIT_MemSet_0x84
-
- b JIT_MemSet_0x8a
-JIT_MemSet_0x88
- dc zva, x0
- add x0, x0, x9
-JIT_MemSet_0x8a
- subs x2, x2, x9
- bge JIT_MemSet_0x88
-
-JIT_MemSet_0x8c
- add x2, x2, x9
-
-JIT_MemSet_0x9c
- b JIT_MemSet_0xa8
-JIT_MemSet_0xa0
- stp x8, x8, [x0], #16
-JIT_MemSet_0xa8
+ ands w1, w1, #0xff
+ orr w1, w1, w1, lsl #8
+ orr w1, w1, w1, lsl #0x10
+ orr x1, x1, x1, lsl #0x20
+
+ b JIT_MemSet_bottom
+JIT_MemSet_top
+ stp x1, x1, [x0], #16
+JIT_MemSet_bottom
subs x2, x2, #16
- bge JIT_MemSet_0xa0
-
-JIT_MemSet_0xb0
- tbz x2, #3, JIT_MemSet_0xb4
- str x8, [x0], #8
-JIT_MemSet_0xb4
- tbz x2, #2, JIT_MemSet_0xc8
- str w8, [x0], #4
-JIT_MemSet_0xc8
- tbz x2, #1, JIT_MemSet_0xdc
- strh w8, [x0], #2
-JIT_MemSet_0xdc
- tbz x2, #0, JIT_MemSet_0xe8
- strb w8, [x0]
-JIT_MemSet_0xe8
+ bge JIT_MemSet_top
+
+ tbz x2, #3, JIT_MemSet_tbz4
+ str x1, [x0], #8
+JIT_MemSet_tbz4
+ tbz x2, #2, JIT_MemSet_tbz2
+ str w1, [x0], #4
+JIT_MemSet_tbz2
+ tbz x2, #1, JIT_MemSet_tbz1
+ strh w1, [x0], #2
+JIT_MemSet_tbz1
+ tbz x2, #0, JIT_MemSet_ret
+ strb w1, [x0]
+JIT_MemSet_ret
ret lr
LEAF_END
@@ -228,41 +97,7 @@ JIT_MemSet_0xe8
; See comments above for JIT_MemSet
;void JIT_MemCpy(void *dst, const void *src, SIZE_T count)
-;
-; // If not aligned then make it 8-byte aligned
-; if(((uintptr_t)dst&0x7) != 0)
-; {
-; // Calculate alignment we can do without exceeding count
-; // Use math to avoid introducing more unpredictable branches
-; // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
-; // Note logic will fail if count >= (1 << 61). But this exceeds max physical memory for arm64
-; uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
-;
-; if(align&0x1)
-; {
-; *(uint8_t*)dst = *(uint8_t*)src;
-; dst = (uint8_t*)dst + 1;
-; src = (uint8_t*)src + 1;
-; count-=1;
-; }
-;
-; if(align&0x2)
-; {
-; *(uint16_t*)dst = *(uint16_t*)src;
-; dst = (uint16_t*)dst + 1;
-; src = (uint16_t*)src + 1;
-; count-=2;
-; }
-;
-; if(align&0x4)
-; {
-; *(uint32_t*)dst = *(uint32_t*)src;
-; dst = (uint32_t*)dst + 1;
-; src = (uint32_t*)src + 1;
-; count-=4;
-; }
-; }
-;
+;{
; count-=16;
;
; while(count >= 0)
@@ -307,50 +142,30 @@ JIT_MemSet_0xe8
; Assembly code corresponding to above C++ method.
; See comments above for JIT_MemSet method
LEAF_ENTRY JIT_MemCpy
- ands x3, x0, #7
- movn x4, #7
- clz x5, x2
- beq JIT_MemCpy_0xa8
- lsr x4, x4, x5
- and x3, x3, x4
- tbz x3, #0, JIT_MemCpy_0x2c
- ldrsb w8, [x1], #1
- strb w8, [x0], #1
- sub x2, x2, #1
-JIT_MemCpy_0x2c
- tbz x3, #1, JIT_MemCpy_0x5c
- ldrsh w8, [x1], #2
- strh w8, [x0], #2
- sub x2, x2, #2
-JIT_MemCpy_0x5c
- tbz x3, #2, JIT_MemCpy_0xa8
- ldr w8, [x1], #4
- str w8, [x0], #4
- sub x2, x2, #4
- b JIT_MemCpy_0xa8
-JIT_MemCpy_0xa0
+ b JIT_MemCpy_bottom
+JIT_MemCpy_top
ldp x8, x9, [x1], #16
stp x8, x9, [x0], #16
-JIT_MemCpy_0xa8
+JIT_MemCpy_bottom
subs x2, x2, #16
- bge JIT_MemCpy_0xa0
-JIT_MemCpy_0xb0
- tbz x2, #3, JIT_MemCpy_0xb4
+ bge JIT_MemCpy_top
+
+ tbz x2, #3, JIT_MemCpy_tbz4
ldr x8, [x1], #8
str x8, [x0], #8
-JIT_MemCpy_0xb4
- tbz x2, #2, JIT_MemCpy_0xc8
+JIT_MemCpy_tbz4
+ tbz x2, #2, JIT_MemCpy_tbz2
ldr w8, [x1], #4
str w8, [x0], #4
-JIT_MemCpy_0xc8
- tbz x2, #1, JIT_MemCpy_0xdc
+JIT_MemCpy_tbz2
+ tbz x2, #1, JIT_MemCpy_tbz1
ldrsh w8, [x1], #2
strh w8, [x0], #2
-JIT_MemCpy_0xdc
- tbz x2, #0, JIT_MemCpy_0xe8
+JIT_MemCpy_tbz1
+ tbz x2, #0, JIT_MemCpy_ret
ldrsb w8, [x1]
strb w8, [x0]
-JIT_MemCpy_0xe8
+JIT_MemCpy_ret
ret lr
LEAF_END
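For symmetry, a matching C sketch of the simplified JIT_MemCpy (again illustrative; JIT_MemCpy_sketch is not a name from the patch):

#include <stdint.h>
#include <stddef.h>

void JIT_MemCpy_sketch(void *dst, const void *src, size_t count)
{
    uint8_t *d = (uint8_t *)dst;
    const uint8_t *s = (const uint8_t *)src;

    // Bulk loop: 16 bytes per iteration (ldp/stp pair in the asm).
    while (count >= 16) {
        uint64_t lo = ((const uint64_t *)s)[0];
        uint64_t hi = ((const uint64_t *)s)[1];
        ((uint64_t *)d)[0] = lo;
        ((uint64_t *)d)[1] = hi;
        d += 16;
        s += 16;
        count -= 16;
    }

    // Tail: at most one 8/4/2/1-byte copy each, mirroring the tbz chain,
    // so naturally aligned fields are moved with single loads and stores.
    if (count & 8) { *(uint64_t *)d = *(const uint64_t *)s; d += 8; s += 8; }
    if (count & 4) { *(uint32_t *)d = *(const uint32_t *)s; d += 4; s += 4; }
    if (count & 2) { *(uint16_t *)d = *(const uint16_t *)s; d += 2; s += 2; }
    if (count & 1) { *d = *s; }
}

Like the new assembly, this drops the old pre-alignment phase entirely: the 16-byte bulk loop plus the size-bit tail handles every count correctly regardless of destination alignment, which is exactly the simplification the commit message describes.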