summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorSteve MacLean <sdmaclea@qti.qualcomm.com>2017-05-24 14:31:44 -0400
committerJan Vorlicek <janvorli@microsoft.com>2017-05-24 11:31:44 -0700
commit7bdf517d73dc8332e30af25deddb880a3b533794 (patch)
tree32f5d5fe01ccce5741631782484ce0239bf486ba /src
parent44285ef65b626db7954066ff596d6be07c7dd7a2 (diff)
downloadcoreclr-7bdf517d73dc8332e30af25deddb880a3b533794.tar.gz
coreclr-7bdf517d73dc8332e30af25deddb880a3b533794.tar.bz2
coreclr-7bdf517d73dc8332e30af25deddb880a3b533794.zip
[Arm64/Win] Revise JIT_MemSet (#11420)
* [Arm64/Win] Revise JIT_MemSet This is the Windows equivalent of #11217 which merged * [Arm64/Win] Use csel<con> for csel MS Assembler expects csel<cond> for the csel instruction
Diffstat (limited to 'src')
-rw-r--r--src/vm/arm64/crthelpers.asm246
1 files changed, 157 insertions, 89 deletions
diff --git a/src/vm/arm64/crthelpers.asm b/src/vm/arm64/crthelpers.asm
index 98ec982beb..af0b672975 100644
--- a/src/vm/arm64/crthelpers.asm
+++ b/src/vm/arm64/crthelpers.asm
@@ -20,62 +20,109 @@
;
;void JIT_MemSet(void *dst, int val, SIZE_T count)
;{
-; uintptr_t valEx = (unsigned char)val;
+; uint64_t valEx = (unsigned char)val;
; valEx = valEx | valEx << 8;
; valEx = valEx | valEx << 16;
; valEx = valEx | valEx << 32;
;
+; size_t dc_zva_size = 4ULL << DCZID_EL0.BS;
+;
+; uint64_t use_dc_zva = (val == 0) && !DCZID_EL0.p ? count / (2 * dc_zva_size) : 0; // ~Minimum size (assumes worst case alignment)
+;
; // If not aligned then make it 8-byte aligned
-; if(((uintptr_t)dst&0x7) != 0)
+; if(((uint64_t)dst&0xf) != 0)
; {
-; if(((uintptr_t)dst&0x3) == 0)
+; // Calculate alignment we can do without exceeding count
+; // Use math to avoid introducing more unpredictable branches
+; // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
+; // Note logic will fail is count >= (1 << 61). But this exceeds max physical memory for arm64
+; uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
+;
+; if(align&0x1)
; {
-; *(UINT*)dst = (UINT)valEx;
-; dst = (UINT*)dst + 1;
+; *(unit8_t*)dst = (unit8_t)valEx;
+; dst = (unit8_t*)dst + 1;
+; count-=1;
+; }
+;
+; if(align&0x2)
+; {
+; *(unit16_t*)dst = (unit16_t)valEx;
+; dst = (unit16_t*)dst + 1;
+; count-=2;
+; }
+;
+; if(align&0x4)
+; {
+; *(unit32_t*)dst = (unit32_t)valEx;
+; dst = (unit32_t*)dst + 1;
; count-=4;
; }
-; else if(((uintptr_t)dst&0x1) == 0)
+; }
+;
+; if(use_dc_zva)
+; {
+; // If not aligned then make it aligned to dc_zva_size
+; if(dst&0x8)
+; {
+; *(uint64_t*)dst = (uint64_t)valEx;
+; dst = (uint64_t*)dst + 1;
+; count-=8;
+; }
+;
+; while(dst & (dc_zva_size - 1))
; {
-; while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-; {
-; *(short*)dst = (short)valEx;
-; dst = (short*)dst + 1;
-; count-=2;
-; }
+; *(uint64_t*)dst = valEx;
+; dst = (uint64_t*)dst + 1;
+; *(uint64_t*)dst = valEx;
+; dst = (uint64_t*)dst + 1;
+; count-=16;
; }
-; else
+;
+; count -= dc_zva_size;
+;
+; while(count >= 0)
; {
-; while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-; {
-; *(char*)dst = (char)valEx;
-; dst = (char*)dst + 1;
-; count--;
-; }
+; dc_zva(dst);
+; dst = (uint8_t*)dst + dc_zva_size;
+; count-=dc_zva_size;
; }
+;
+; count += dc_zva_size;
+; }
+;
+; count-=16;
+;
+; while(count >= 0)
+; {
+; *(uint64_t*)dst = valEx;
+; dst = (uint64_t*)dst + 1;
+; *(uint64_t*)dst = valEx;
+; dst = (uint64_t*)dst + 1;
+; count-=16;
; }
;
-; while(count >= 8)
+; if(count & 8)
; {
-; *(uintptr_t*)dst = valEx;
-; dst = (uintptr_t*)dst + 1;
-; count-=8;
+; *(uint64_t*)dst = valEx;
+; dst = (uint64_t*)dst + 1;
; }
;
; if(count & 4)
; {
-; *(UINT*)dst = (UINT)valEx;
-; dst = (UINT*)dst + 1;
+; *(uint32_t*)dst = (uint32_t)valEx;
+; dst = (uint32_t*)dst + 1;
; }
;
; if(count & 2)
; {
-; *(short*)dst = (short)valEx;
-; dst = (short*)dst + 1;
+; *(uint16_t*)dst = (uint16_t)valEx;
+; dst = (uint16_t*)dst + 1;
; }
;
; if(count & 1)
; {
-; *(char*)dst = (char)valEx;
+; *(uint8_t*)dst = (uint8_t)valEx;
; }
;}
;
@@ -87,68 +134,89 @@
; as C++ method.
LEAF_ENTRY JIT_MemSet
- uxtb w8,w1
- sxtw x8,w8
- orr x8,x8,x8 lsl #8
- orr x8,x8,x8 lsl #0x10
- orr x9,x8,x8 lsl #0x20
- and x8,x0,#7
- cbz x8,JIT_MemSet_0x7c
- and x8,x0,#3
- cbnz x8,JIT_MemSet_0x38
- str w9,[x0]
- add x0,x0,#4
- mov x8,#-4
- add x2,x2,x8
- b JIT_MemSet_0x7c
-JIT_MemSet_0x38
- cbz x2,JIT_MemSet_0x7c
- tbnz x0,#0,JIT_MemSet_0x60
-JIT_MemSet_0x40
- and x8,x0,#7
- cbz x8,JIT_MemSet_0x7c
- strh w9,[x0]
- add x0,x0,#2
- mov x8,#-2
- add x2,x2,x8
- cbnz x2,JIT_MemSet_0x40
- b JIT_MemSet_0x7c
-JIT_MemSet_0x60
- and x8,x0,#7
- cbz x8,JIT_MemSet_0x7c
- strb w9,[x0]
- add x0,x0,#1
- mov x8,#-1
- add x2,x2,x8
- cbnz x2,JIT_MemSet_0x60
-JIT_MemSet_0x7c
- cmp x2,#8
- blo JIT_MemSet_0xb8
- lsr x8,x2,#3
- mov x11,x8
- mov x10,x0
- add x8,x10,x11 lsl #3
+ ands w8, w1, #0xff
+ mrs x3, DCZID_EL0 ; x3 = DCZID_EL0
+ mov x6, #4
+ lsr x11, x2, #3 ; x11 = count >> 3
+
+ orr w8, w8, w8, lsl #8
+ and x5, x3, #0xf ; x5 = dczid_el0.bs
+ cseleq x11, x11, xzr ; x11 = (val == 0) ? count >> 3 : 0
+ tst x3, (1 << 4)
+
+ orr w8, w8, w8, lsl #0x10
+ cseleq x11, x11, xzr ; x11 = (val == 0) && !DCZID_EL0.p ? count >> 3 : 0
+ ands x3, x0, #7 ; x3 = dst & 7
+ lsl x9, x6, x5 ; x9 = size
+
+ orr x8, x8, x8, lsl #0x20
+ lsr x11, x11, x5 ; x11 = (val == 0) && !DCZID_EL0.p ? count >> (3 + DCZID_EL0.bs) : 0
+ sub x10, x9, #1 ; x10 = mask
+
+ beq JIT_MemSet_0x80
+
+ movn x4, #7
+ clz x5, x2
+ lsr x4, x4, x5
+ and x3, x3, x4
+
+ tbz x3, #0, JIT_MemSet_0x2c
+ strb w8, [x0], #1
+ sub x2, x2, #1
+JIT_MemSet_0x2c
+ tbz x3, #1, JIT_MemSet_0x5c
+ strh w8, [x0], #2
+ sub x2, x2, #2
+JIT_MemSet_0x5c
+ tbz x3, #2, JIT_MemSet_0x80
+ str w8, [x0], #4
+ sub x2, x2, #4
+JIT_MemSet_0x80
+ cbz x11, JIT_MemSet_0x9c
+ tbz x0, #3, JIT_MemSet_0x84
+ str x8, [x0], #8
+ sub x2, x2, #8
+
+ b JIT_MemSet_0x85
+JIT_MemSet_0x84
+ stp x8, x8, [x0], #16
+ sub x2, x2, #16
+JIT_MemSet_0x85
+ tst x0, x10
+ bne JIT_MemSet_0x84
+
+ b JIT_MemSet_0x8a
+JIT_MemSet_0x88
+ dc zva, x0
+ add x0, x0, x9
+JIT_MemSet_0x8a
+ subs x2, x2, x9
+ bge JIT_MemSet_0x88
+
+JIT_MemSet_0x8c
+ add x2, x2, x9
+
JIT_MemSet_0x9c
- cmp x10,x8
- beq JIT_MemSet_0xac
- str x9,[x10],#8
- b JIT_MemSet_0x9c
-JIT_MemSet_0xac
- mov x8,#-8
- madd x2,x11,x8,x2
- add x0,x0,x11 lsl #3
-JIT_MemSet_0xb8
- tbz x2,#2,JIT_MemSet_0xc4
- str w9,[x0]
- add x0,x0,#4
-JIT_MemSet_0xc4
- tbz x2,#1,JIT_MemSet_0xd0
- strh w9,[x0]
- add x0,x0,#2
-JIT_MemSet_0xd0
- tbz x2,#0,JIT_MemSet_0xd8
- strb w9,[x0]
-JIT_MemSet_0xd8
+ b JIT_MemSet_0xa8
+JIT_MemSet_0xa0
+ stp x8, x8, [x0], #16
+JIT_MemSet_0xa8
+ subs x2, x2, #16
+ bge JIT_MemSet_0xa0
+
+JIT_MemSet_0xb0
+ tbz x2, #3, JIT_MemSet_0xb4
+ str x8, [x0], #8
+JIT_MemSet_0xb4
+ tbz x2, #2, JIT_MemSet_0xc8
+ str w8, [x0], #4
+JIT_MemSet_0xc8
+ tbz x2, #1, JIT_MemSet_0xdc
+ strh w8, [x0], #2
+JIT_MemSet_0xdc
+ tbz x2, #0, JIT_MemSet_0xe8
+ strb w8, [x0]
+JIT_MemSet_0xe8
ret lr
LEAF_END