diff options
author | Steve MacLean <sdmaclea.qdt@qualcommdatacenter.com> | 2018-04-13 12:46:19 -0400 |
---|---|---|
committer | Jan Kotas <jkotas@microsoft.com> | 2018-04-13 09:46:19 -0700 |
commit | bba18ae06d9f132914ec77234bfd1c3ab9100d1a (patch) | |
tree | 977b636c343e5c0664fc5d3a8264c01bc36ecd29 | |
parent | d81f619201a010fe92bcdaa0c09569f7390a69ae (diff) | |
download | coreclr-bba18ae06d9f132914ec77234bfd1c3ab9100d1a.tar.gz coreclr-bba18ae06d9f132914ec77234bfd1c3ab9100d1a.tar.bz2 coreclr-bba18ae06d9f132914ec77234bfd1c3ab9100d1a.zip |
[Arm64/Windows] Simplify JIT_MemSet & JIT_MemCpy (#17537)
JIT_MemSet alignment code was definitely broken for some
unaligned cases
JIT_MemCpy likely had the same issue
Simplify implementation to reduce maintenance burden
-rw-r--r-- | src/vm/arm64/crthelpers.asm | 259 |
1 files changed, 37 insertions, 222 deletions
diff --git a/src/vm/arm64/crthelpers.asm b/src/vm/arm64/crthelpers.asm index af0b672975..df282ee895 100644 --- a/src/vm/arm64/crthelpers.asm +++ b/src/vm/arm64/crthelpers.asm @@ -12,12 +12,6 @@ TEXTAREA -; Calls to JIT_MemSet is emitted by jit for initialization of large structs. -; We need to provide our own implementation of memset instead of using the ones in crt because crt implementation does not gurantee -; that aligned 8/4/2 - byte memory will be written atomically. This is required because members in a struct can be read atomically -; and their values should be written atomically. -; -; ;void JIT_MemSet(void *dst, int val, SIZE_T count) ;{ ; uint64_t valEx = (unsigned char)val; @@ -25,72 +19,6 @@ ; valEx = valEx | valEx << 16; ; valEx = valEx | valEx << 32; ; -; size_t dc_zva_size = 4ULL << DCZID_EL0.BS; -; -; uint64_t use_dc_zva = (val == 0) && !DCZID_EL0.p ? count / (2 * dc_zva_size) : 0; // ~Minimum size (assumes worst case alignment) -; -; // If not aligned then make it 8-byte aligned -; if(((uint64_t)dst&0xf) != 0) -; { -; // Calculate alignment we can do without exceeding count -; // Use math to avoid introducing more unpredictable branches -; // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0 -; // Note logic will fail is count >= (1 << 61). 
But this exceeds max physical memory for arm64 -; uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64)) -; -; if(align&0x1) -; { -; *(unit8_t*)dst = (unit8_t)valEx; -; dst = (unit8_t*)dst + 1; -; count-=1; -; } -; -; if(align&0x2) -; { -; *(unit16_t*)dst = (unit16_t)valEx; -; dst = (unit16_t*)dst + 1; -; count-=2; -; } -; -; if(align&0x4) -; { -; *(unit32_t*)dst = (unit32_t)valEx; -; dst = (unit32_t*)dst + 1; -; count-=4; -; } -; } -; -; if(use_dc_zva) -; { -; // If not aligned then make it aligned to dc_zva_size -; if(dst&0x8) -; { -; *(uint64_t*)dst = (uint64_t)valEx; -; dst = (uint64_t*)dst + 1; -; count-=8; -; } -; -; while(dst & (dc_zva_size - 1)) -; { -; *(uint64_t*)dst = valEx; -; dst = (uint64_t*)dst + 1; -; *(uint64_t*)dst = valEx; -; dst = (uint64_t*)dst + 1; -; count-=16; -; } -; -; count -= dc_zva_size; -; -; while(count >= 0) -; { -; dc_zva(dst); -; dst = (uint8_t*)dst + dc_zva_size; -; count-=dc_zva_size; -; } -; -; count += dc_zva_size; -; } -; ; count-=16; ; ; while(count >= 0) @@ -134,89 +62,30 @@ ; as C++ method. LEAF_ENTRY JIT_MemSet - ands w8, w1, #0xff - mrs x3, DCZID_EL0 ; x3 = DCZID_EL0 - mov x6, #4 - lsr x11, x2, #3 ; x11 = count >> 3 - - orr w8, w8, w8, lsl #8 - and x5, x3, #0xf ; x5 = dczid_el0.bs - cseleq x11, x11, xzr ; x11 = (val == 0) ? count >> 3 : 0 - tst x3, (1 << 4) - - orr w8, w8, w8, lsl #0x10 - cseleq x11, x11, xzr ; x11 = (val == 0) && !DCZID_EL0.p ? count >> 3 : 0 - ands x3, x0, #7 ; x3 = dst & 7 - lsl x9, x6, x5 ; x9 = size - - orr x8, x8, x8, lsl #0x20 - lsr x11, x11, x5 ; x11 = (val == 0) && !DCZID_EL0.p ? 
count >> (3 + DCZID_EL0.bs) : 0 - sub x10, x9, #1 ; x10 = mask - - beq JIT_MemSet_0x80 - - movn x4, #7 - clz x5, x2 - lsr x4, x4, x5 - and x3, x3, x4 - - tbz x3, #0, JIT_MemSet_0x2c - strb w8, [x0], #1 - sub x2, x2, #1 -JIT_MemSet_0x2c - tbz x3, #1, JIT_MemSet_0x5c - strh w8, [x0], #2 - sub x2, x2, #2 -JIT_MemSet_0x5c - tbz x3, #2, JIT_MemSet_0x80 - str w8, [x0], #4 - sub x2, x2, #4 -JIT_MemSet_0x80 - cbz x11, JIT_MemSet_0x9c - tbz x0, #3, JIT_MemSet_0x84 - str x8, [x0], #8 - sub x2, x2, #8 - - b JIT_MemSet_0x85 -JIT_MemSet_0x84 - stp x8, x8, [x0], #16 - sub x2, x2, #16 -JIT_MemSet_0x85 - tst x0, x10 - bne JIT_MemSet_0x84 - - b JIT_MemSet_0x8a -JIT_MemSet_0x88 - dc zva, x0 - add x0, x0, x9 -JIT_MemSet_0x8a - subs x2, x2, x9 - bge JIT_MemSet_0x88 - -JIT_MemSet_0x8c - add x2, x2, x9 - -JIT_MemSet_0x9c - b JIT_MemSet_0xa8 -JIT_MemSet_0xa0 - stp x8, x8, [x0], #16 -JIT_MemSet_0xa8 + ands w1, w1, #0xff + orr w1, w1, w1, lsl #8 + orr w1, w1, w1, lsl #0x10 + orr x1, x1, x1, lsl #0x20 + + b JIT_MemSet_bottom +JIT_MemSet_top + stp x1, x1, [x0], #16 +JIT_MemSet_bottom subs x2, x2, #16 - bge JIT_MemSet_0xa0 - -JIT_MemSet_0xb0 - tbz x2, #3, JIT_MemSet_0xb4 - str x8, [x0], #8 -JIT_MemSet_0xb4 - tbz x2, #2, JIT_MemSet_0xc8 - str w8, [x0], #4 -JIT_MemSet_0xc8 - tbz x2, #1, JIT_MemSet_0xdc - strh w8, [x0], #2 -JIT_MemSet_0xdc - tbz x2, #0, JIT_MemSet_0xe8 - strb w8, [x0] -JIT_MemSet_0xe8 + bge JIT_MemSet_top + + tbz x2, #3, JIT_MemSet_tbz4 + str x1, [x0], #8 +JIT_MemSet_tbz4 + tbz x2, #2, JIT_MemSet_tbz2 + str w1, [x0], #4 +JIT_MemSet_tbz2 + tbz x2, #1, JIT_MemSet_tbz1 + strh w1, [x0], #2 +JIT_MemSet_tbz1 + tbz x2, #0, JIT_MemSet_ret + strb w1, [x0] +JIT_MemSet_ret ret lr LEAF_END @@ -228,41 +97,7 @@ JIT_MemSet_0xe8 ; See comments above for JIT_MemSet ;void JIT_MemCpy(void *dst, const void *src, SIZE_T count) -; -; // If not aligned then make it 8-byte aligned -; if(((uintptr_t)dst&0x7) != 0) -; { -; // Calculate alignment we can do without exceeding count -; // Use math to avoid 
introducing more unpredictable branches -; // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0 -; // Note logic will fail if count >= (1 << 61). But this exceeds max physical memory for arm64 -; uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64)) -; -; if(align&0x1) -; { -; *(unit8_t*)dst = *(unit8_t*)src; -; dst = (unit8_t*)dst + 1; -; src = (unit8_t*)src + 1; -; count-=1; -; } -; -; if(align&0x2) -; { -; *(unit16_t*)dst = *(unit16_t*)src; -; dst = (unit16_t*)dst + 1; -; src = (unit16_t*)src + 1; -; count-=2; -; } -; -; if(align&0x4) -; { -; *(unit32_t*)dst = *(unit32_t*)src; -; dst = (unit32_t*)dst + 1; -; src = (unit32_t*)src + 1; -; count-=4; -; } -; } -; +;{ ; count-=16; ; ; while(count >= 0) @@ -307,50 +142,30 @@ JIT_MemSet_0xe8 ; Assembly code corresponding to above C++ method. ; See comments above for JIT_MemSet method LEAF_ENTRY JIT_MemCpy - ands x3, x0, #7 - movn x4, #7 - clz x5, x2 - beq JIT_MemCpy_0xa8 - lsr x4, x4, x5 - and x3, x3, x4 - tbz x3, #0, JIT_MemCpy_0x2c - ldrsb w8, [x1], #1 - strb w8, [x0], #1 - sub x2, x2, #1 -JIT_MemCpy_0x2c - tbz x3, #1, JIT_MemCpy_0x5c - ldrsh w8, [x1], #2 - strh w8, [x0], #2 - sub x2, x2, #2 -JIT_MemCpy_0x5c - tbz x3, #2, JIT_MemCpy_0xa8 - ldr w8, [x1], #4 - str w8, [x0], #4 - sub x2, x2, #4 - b JIT_MemCpy_0xa8 -JIT_MemCpy_0xa0 + b JIT_MemCpy_bottom +JIT_MemCpy_top ldp x8, x9, [x1], #16 stp x8, x9, [x0], #16 -JIT_MemCpy_0xa8 +JIT_MemCpy_bottom subs x2, x2, #16 - bge JIT_MemCpy_0xa0 -JIT_MemCpy_0xb0 - tbz x2, #3, JIT_MemCpy_0xb4 + bge JIT_MemCpy_top + + tbz x2, #3, JIT_MemCpy_tbz4 ldr x8, [x1], #8 str x8, [x0], #8 -JIT_MemCpy_0xb4 - tbz x2, #2, JIT_MemCpy_0xc8 +JIT_MemCpy_tbz4 + tbz x2, #2, JIT_MemCpy_tbz2 ldr w8, [x1], #4 str w8, [x0], #4 -JIT_MemCpy_0xc8 - tbz x2, #1, JIT_MemCpy_0xdc +JIT_MemCpy_tbz2 + tbz x2, #1, JIT_MemCpy_tbz1 ldrsh w8, [x1], #2 strh w8, [x0], #2 -JIT_MemCpy_0xdc - tbz x2, #0, JIT_MemCpy_0xe8 +JIT_MemCpy_tbz1 + tbz x2, #0, JIT_MemCpy_ret ldrsb 
w8, [x1] strb w8, [x0] -JIT_MemCpy_0xe8 +JIT_MemCpy_ret ret lr LEAF_END |