diff options
author | Steve MacLean <sdmaclea@qti.qualcomm.com> | 2017-05-24 14:31:44 -0400 |
---|---|---|
committer | Jan Vorlicek <janvorli@microsoft.com> | 2017-05-24 11:31:44 -0700 |
commit | 7bdf517d73dc8332e30af25deddb880a3b533794 (patch) | |
tree | 32f5d5fe01ccce5741631782484ce0239bf486ba /src | |
parent | 44285ef65b626db7954066ff596d6be07c7dd7a2 (diff) | |
download | coreclr-7bdf517d73dc8332e30af25deddb880a3b533794.tar.gz coreclr-7bdf517d73dc8332e30af25deddb880a3b533794.tar.bz2 coreclr-7bdf517d73dc8332e30af25deddb880a3b533794.zip |
[Arm64/Win] Revise JIT_MemSet (#11420)
* [Arm64/Win] Revise JIT_MemSet
This is the Windows equivalent of #11217, which has already been merged
* [Arm64/Win] Use csel<cond> for csel
The MS assembler expects the condition fused into the mnemonic, i.e. csel<cond> (for example cseleq), for the csel instruction
Diffstat (limited to 'src')
-rw-r--r-- | src/vm/arm64/crthelpers.asm | 246 |
1 files changed, 157 insertions, 89 deletions
diff --git a/src/vm/arm64/crthelpers.asm b/src/vm/arm64/crthelpers.asm index 98ec982beb..af0b672975 100644 --- a/src/vm/arm64/crthelpers.asm +++ b/src/vm/arm64/crthelpers.asm @@ -20,62 +20,109 @@ ; ;void JIT_MemSet(void *dst, int val, SIZE_T count) ;{ -; uintptr_t valEx = (unsigned char)val; +; uint64_t valEx = (unsigned char)val; ; valEx = valEx | valEx << 8; ; valEx = valEx | valEx << 16; ; valEx = valEx | valEx << 32; ; +; size_t dc_zva_size = 4ULL << DCZID_EL0.BS; +; +; uint64_t use_dc_zva = (val == 0) && !DCZID_EL0.p ? count / (2 * dc_zva_size) : 0; // ~Minimum size (assumes worst case alignment) +; ; // If not aligned then make it 8-byte aligned -; if(((uintptr_t)dst&0x7) != 0) +; if(((uint64_t)dst&0xf) != 0) ; { -; if(((uintptr_t)dst&0x3) == 0) +; // Calculate alignment we can do without exceeding count +; // Use math to avoid introducing more unpredictable branches +; // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0 +; // Note logic will fail is count >= (1 << 61). 
But this exceeds max physical memory for arm64 +; uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64)) +; +; if(align&0x1) ; { -; *(UINT*)dst = (UINT)valEx; -; dst = (UINT*)dst + 1; +; *(unit8_t*)dst = (unit8_t)valEx; +; dst = (unit8_t*)dst + 1; +; count-=1; +; } +; +; if(align&0x2) +; { +; *(unit16_t*)dst = (unit16_t)valEx; +; dst = (unit16_t*)dst + 1; +; count-=2; +; } +; +; if(align&0x4) +; { +; *(unit32_t*)dst = (unit32_t)valEx; +; dst = (unit32_t*)dst + 1; ; count-=4; ; } -; else if(((uintptr_t)dst&0x1) == 0) +; } +; +; if(use_dc_zva) +; { +; // If not aligned then make it aligned to dc_zva_size +; if(dst&0x8) +; { +; *(uint64_t*)dst = (uint64_t)valEx; +; dst = (uint64_t*)dst + 1; +; count-=8; +; } +; +; while(dst & (dc_zva_size - 1)) ; { -; while(count > 0 && ((uintptr_t)dst&0x7) != 0) -; { -; *(short*)dst = (short)valEx; -; dst = (short*)dst + 1; -; count-=2; -; } +; *(uint64_t*)dst = valEx; +; dst = (uint64_t*)dst + 1; +; *(uint64_t*)dst = valEx; +; dst = (uint64_t*)dst + 1; +; count-=16; ; } -; else +; +; count -= dc_zva_size; +; +; while(count >= 0) ; { -; while(count > 0 && ((uintptr_t)dst&0x7) != 0) -; { -; *(char*)dst = (char)valEx; -; dst = (char*)dst + 1; -; count--; -; } +; dc_zva(dst); +; dst = (uint8_t*)dst + dc_zva_size; +; count-=dc_zva_size; ; } +; +; count += dc_zva_size; +; } +; +; count-=16; +; +; while(count >= 0) +; { +; *(uint64_t*)dst = valEx; +; dst = (uint64_t*)dst + 1; +; *(uint64_t*)dst = valEx; +; dst = (uint64_t*)dst + 1; +; count-=16; ; } ; -; while(count >= 8) +; if(count & 8) ; { -; *(uintptr_t*)dst = valEx; -; dst = (uintptr_t*)dst + 1; -; count-=8; +; *(uint64_t*)dst = valEx; +; dst = (uint64_t*)dst + 1; ; } ; ; if(count & 4) ; { -; *(UINT*)dst = (UINT)valEx; -; dst = (UINT*)dst + 1; +; *(uint32_t*)dst = (uint32_t)valEx; +; dst = (uint32_t*)dst + 1; ; } ; ; if(count & 2) ; { -; *(short*)dst = (short)valEx; -; dst = (short*)dst + 1; +; *(uint16_t*)dst = (uint16_t)valEx; +; dst = (uint16_t*)dst + 1; 
; } ; ; if(count & 1) ; { -; *(char*)dst = (char)valEx; +; *(uint8_t*)dst = (uint8_t)valEx; ; } ;} ; @@ -87,68 +134,89 @@ ; as C++ method. LEAF_ENTRY JIT_MemSet - uxtb w8,w1 - sxtw x8,w8 - orr x8,x8,x8 lsl #8 - orr x8,x8,x8 lsl #0x10 - orr x9,x8,x8 lsl #0x20 - and x8,x0,#7 - cbz x8,JIT_MemSet_0x7c - and x8,x0,#3 - cbnz x8,JIT_MemSet_0x38 - str w9,[x0] - add x0,x0,#4 - mov x8,#-4 - add x2,x2,x8 - b JIT_MemSet_0x7c -JIT_MemSet_0x38 - cbz x2,JIT_MemSet_0x7c - tbnz x0,#0,JIT_MemSet_0x60 -JIT_MemSet_0x40 - and x8,x0,#7 - cbz x8,JIT_MemSet_0x7c - strh w9,[x0] - add x0,x0,#2 - mov x8,#-2 - add x2,x2,x8 - cbnz x2,JIT_MemSet_0x40 - b JIT_MemSet_0x7c -JIT_MemSet_0x60 - and x8,x0,#7 - cbz x8,JIT_MemSet_0x7c - strb w9,[x0] - add x0,x0,#1 - mov x8,#-1 - add x2,x2,x8 - cbnz x2,JIT_MemSet_0x60 -JIT_MemSet_0x7c - cmp x2,#8 - blo JIT_MemSet_0xb8 - lsr x8,x2,#3 - mov x11,x8 - mov x10,x0 - add x8,x10,x11 lsl #3 + ands w8, w1, #0xff + mrs x3, DCZID_EL0 ; x3 = DCZID_EL0 + mov x6, #4 + lsr x11, x2, #3 ; x11 = count >> 3 + + orr w8, w8, w8, lsl #8 + and x5, x3, #0xf ; x5 = dczid_el0.bs + cseleq x11, x11, xzr ; x11 = (val == 0) ? count >> 3 : 0 + tst x3, (1 << 4) + + orr w8, w8, w8, lsl #0x10 + cseleq x11, x11, xzr ; x11 = (val == 0) && !DCZID_EL0.p ? count >> 3 : 0 + ands x3, x0, #7 ; x3 = dst & 7 + lsl x9, x6, x5 ; x9 = size + + orr x8, x8, x8, lsl #0x20 + lsr x11, x11, x5 ; x11 = (val == 0) && !DCZID_EL0.p ? 
count >> (3 + DCZID_EL0.bs) : 0 + sub x10, x9, #1 ; x10 = mask + + beq JIT_MemSet_0x80 + + movn x4, #7 + clz x5, x2 + lsr x4, x4, x5 + and x3, x3, x4 + + tbz x3, #0, JIT_MemSet_0x2c + strb w8, [x0], #1 + sub x2, x2, #1 +JIT_MemSet_0x2c + tbz x3, #1, JIT_MemSet_0x5c + strh w8, [x0], #2 + sub x2, x2, #2 +JIT_MemSet_0x5c + tbz x3, #2, JIT_MemSet_0x80 + str w8, [x0], #4 + sub x2, x2, #4 +JIT_MemSet_0x80 + cbz x11, JIT_MemSet_0x9c + tbz x0, #3, JIT_MemSet_0x84 + str x8, [x0], #8 + sub x2, x2, #8 + + b JIT_MemSet_0x85 +JIT_MemSet_0x84 + stp x8, x8, [x0], #16 + sub x2, x2, #16 +JIT_MemSet_0x85 + tst x0, x10 + bne JIT_MemSet_0x84 + + b JIT_MemSet_0x8a +JIT_MemSet_0x88 + dc zva, x0 + add x0, x0, x9 +JIT_MemSet_0x8a + subs x2, x2, x9 + bge JIT_MemSet_0x88 + +JIT_MemSet_0x8c + add x2, x2, x9 + JIT_MemSet_0x9c - cmp x10,x8 - beq JIT_MemSet_0xac - str x9,[x10],#8 - b JIT_MemSet_0x9c -JIT_MemSet_0xac - mov x8,#-8 - madd x2,x11,x8,x2 - add x0,x0,x11 lsl #3 -JIT_MemSet_0xb8 - tbz x2,#2,JIT_MemSet_0xc4 - str w9,[x0] - add x0,x0,#4 -JIT_MemSet_0xc4 - tbz x2,#1,JIT_MemSet_0xd0 - strh w9,[x0] - add x0,x0,#2 -JIT_MemSet_0xd0 - tbz x2,#0,JIT_MemSet_0xd8 - strb w9,[x0] -JIT_MemSet_0xd8 + b JIT_MemSet_0xa8 +JIT_MemSet_0xa0 + stp x8, x8, [x0], #16 +JIT_MemSet_0xa8 + subs x2, x2, #16 + bge JIT_MemSet_0xa0 + +JIT_MemSet_0xb0 + tbz x2, #3, JIT_MemSet_0xb4 + str x8, [x0], #8 +JIT_MemSet_0xb4 + tbz x2, #2, JIT_MemSet_0xc8 + str w8, [x0], #4 +JIT_MemSet_0xc8 + tbz x2, #1, JIT_MemSet_0xdc + strh w8, [x0], #2 +JIT_MemSet_0xdc + tbz x2, #0, JIT_MemSet_0xe8 + strb w8, [x0] +JIT_MemSet_0xe8 ret lr LEAF_END |