summaryrefslogtreecommitdiff
path: root/src/vm/arm64/crthelpers.S
diff options
context:
space:
mode:
Diffstat (limited to 'src/vm/arm64/crthelpers.S')
-rw-r--r--src/vm/arm64/crthelpers.S420
1 files changed, 241 insertions, 179 deletions
diff --git a/src/vm/arm64/crthelpers.S b/src/vm/arm64/crthelpers.S
index 36eb4ee7a3..c8b108ca8f 100644
--- a/src/vm/arm64/crthelpers.S
+++ b/src/vm/arm64/crthelpers.S
@@ -18,62 +18,109 @@
//
//void JIT_MemSet(void *dst, int val, SIZE_T count)
//
-// uintptr_t valEx = (unsigned char)val;
+// uint64_t valEx = (unsigned char)val;
// valEx = valEx | valEx << 8;
// valEx = valEx | valEx << 16;
// valEx = valEx | valEx << 32;
//
+// size_t dc_zva_size = 4ULL << DCZID_EL0.BS;
+//
+// uint64_t use_dc_zva = (val == 0) && !DCZID_EL0.p ? count / (2 * dc_zva_size) : 0; // ~Minimum size (assumes worst case alignment)
+//
// // If not aligned then make it 8-byte aligned
-// if(((uintptr_t)dst&0x7) != 0)
+// if(((uint64_t)dst&0x7) != 0)
// {
-// if(((uintptr_t)dst&0x3) == 0)
+// // Calculate alignment we can do without exceeding count
+// // Use math to avoid introducing more unpredictable branches
+// // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
+// // Note logic will fail if count >= (1 << 61). But this exceeds max physical memory for arm64
+// uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
+//
+// if(align&0x1)
// {
-// *(UINT*)dst = (UINT)valEx;
-// dst = (UINT*)dst + 1;
+// *(uint8_t*)dst = (uint8_t)valEx;
+// dst = (uint8_t*)dst + 1;
+// count-=1;
+// }
+//
+// if(align&0x2)
+// {
+// *(uint16_t*)dst = (uint16_t)valEx;
+// dst = (uint16_t*)dst + 1;
+// count-=2;
+// }
+//
+// if(align&0x4)
+// {
+// *(uint32_t*)dst = (uint32_t)valEx;
+// dst = (uint32_t*)dst + 1;
// count-=4;
// }
-// else if(((uintptr_t)dst&0x1) == 0)
+// }
+//
+// if(use_dc_zva)
+// {
+// // If not aligned then make it aligned to dc_zva_size
+// if(dst&0x8)
+// {
+// *(uint64_t*)dst = (uint64_t)valEx;
+// dst = (uint64_t*)dst + 1;
+// count-=8;
+// }
+//
+// while(dst & (dc_zva_size - 1))
// {
-// while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-// {
-// *(short*)dst = (short)valEx;
-// dst = (short*)dst + 1;
-// count-=2;
-// }
+// *(uint64_t*)dst = valEx;
+// dst = (uint64_t*)dst + 1;
+// *(uint64_t*)dst = valEx;
+// dst = (uint64_t*)dst + 1;
+// count-=16;
// }
-// else
+//
+// count -= dc_zva_size;
+//
+// while(count >= 0)
// {
-// while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-// {
-// *(char*)dst = (char)valEx;
-// dst = (char*)dst + 1;
-// count--;
-// }
+// dc_zva(dst);
+// dst = (uint8_t*)dst + dc_zva_size;
+// count-=dc_zva_size;
// }
+//
+// count += dc_zva_size;
// }
//
-// while(count >= 8)
+// count-=16;
+//
+// while(count >= 0)
// {
-// *(uintptr_t*)dst = valEx;
-// dst = (uintptr_t*)dst + 1;
-// count-=8;
+// *(uint64_t*)dst = valEx;
+// dst = (uint64_t*)dst + 1;
+// *(uint64_t*)dst = valEx;
+// dst = (uint64_t*)dst + 1;
+// count-=16;
+// }
+//
+// if(count & 8)
+// {
+// *(uint64_t*)dst = valEx;
+// dst = (uint64_t*)dst + 1;
// }
//
// if(count & 4)
// {
-// *(UINT*)dst = (UINT)valEx;
-// dst = (UINT*)dst + 1;
+// *(uint32_t*)dst = (uint32_t)valEx;
+// dst = (uint32_t*)dst + 1;
// }
//
// if(count & 2)
// {
-// *(short*)dst = (short)valEx;
-// dst = (short*)dst + 1;
+// *(uint16_t*)dst = (uint16_t)valEx;
+// dst = (uint16_t*)dst + 1;
// }
//
// if(count & 1)
// {
-// *(char*)dst = (char)valEx;
+// *(uint8_t*)dst = (uint8_t)valEx;
// }
//
//
@@ -85,68 +132,89 @@
// as C++ method.
LEAF_ENTRY JIT_MemSet, _TEXT
- uxtb w8,w1
- sxtw x8,w8
- orr x8,x8,x8, lsl #8
- orr x8,x8,x8, lsl #0x10
- orr x9,x8,x8, lsl #0x20
- and x8,x0,#7
- cbz x8,LOCAL_LABEL(JIT_MemSet_0x7c)
- and x8,x0,#3
- cbnz x8,LOCAL_LABEL(JIT_MemSet_0x38)
- str w9,[x0]
- add x0,x0,#4
- mov x8,#-4
- add x2,x2,x8
- b LOCAL_LABEL(JIT_MemSet_0x7c)
-LOCAL_LABEL(JIT_MemSet_0x38):
- cbz x2,LOCAL_LABEL(JIT_MemSet_0x7c)
- tbnz x0,#0,LOCAL_LABEL(JIT_MemSet_0x60)
-LOCAL_LABEL(JIT_MemSet_0x40):
- and x8,x0,#7
- cbz x8,LOCAL_LABEL(JIT_MemSet_0x7c)
- strh w9,[x0]
- add x0,x0,#2
- mov x8,#-2
- add x2,x2,x8
- cbnz x2,LOCAL_LABEL(JIT_MemSet_0x40)
- b LOCAL_LABEL(JIT_MemSet_0x7c)
-LOCAL_LABEL(JIT_MemSet_0x60):
- and x8,x0,#7
- cbz x8,LOCAL_LABEL(JIT_MemSet_0x7c)
- strb w9,[x0]
- add x0,x0,#1
- mov x8,#-1
- add x2,x2,x8
- cbnz x2,LOCAL_LABEL(JIT_MemSet_0x60)
-LOCAL_LABEL(JIT_MemSet_0x7c):
- cmp x2,#8
- blo LOCAL_LABEL(JIT_MemSet_0xb8)
- lsr x8,x2,#3
- mov x11,x8
- mov x10,x0
- add x8,x10,x11, lsl #3
+ // Register roles (only x0-x11 are used; the stack is not touched):
+ // x0 = dst cursor (advanced as bytes are written), w1 = val, x2 = remaining byte count
+ // x8 = low byte of val replicated into all eight byte lanes
+ // x9 = DC ZVA block size in bytes (4 << DCZID_EL0.BS), x10 = x9 - 1
+ // x11 = non-zero only when val == 0, DCZID_EL0 bit 4 is clear and count >= 2 * x9
+ // x3-x6 = scratch
+ //
+ // The setup below interleaves independent computations. Each csel / b.eq consumes
+ // flags produced by an earlier ands / tst, so no flag-setting instruction may be
+ // placed between a producer and its consumer.
+ ands w8, w1, #0xff // w8 = (unsigned char)val; Z set when val == 0
+ mrs x3, DCZID_EL0 // x3 = DCZID_EL0
+ mov x6, #4
+ lsr x11, x2, #3 // x11 = count >> 3

+ orr w8, w8, w8, lsl #8
+ and x5, x3, #0xf // x5 = dczid_el0.bs
+ csel x11, x11, xzr, eq // x11 = (val == 0) ? count >> 3 : 0
+ tst x3, (1 << 4)

+ orr w8, w8, w8, lsl #0x10
+ csel x11, x11, xzr, eq // x11 = (val == 0) && !DCZID_EL0.p ? count >> 3 : 0
+ ands x3, x0, #7 // x3 = dst & 7; Z set when dst is already 8-byte aligned
+ lsl x9, x6, x5 // x9 = size

+ orr x8, x8, x8, lsl #0x20
+ lsr x11, x11, x5 // x11 = (val == 0) && !DCZID_EL0.p ? count >> (3 + DCZID_EL0.bs) : 0
+ sub x10, x9, #1 // x10 = mask

+ b.eq LOCAL_LABEL(JIT_MemSet_0x80) // flags still come from "ands x3, x0, #7"

+ // x3 = number of bytes to store to reach 8-byte alignment, capped so that it never
+ // exceeds count. The distance to the next 8-byte boundary is (-dst) & 7, not dst & 7
+ // (e.g. dst % 8 == 1 needs 1 + 2 + 4 bytes).
+ // Cap: ((~15 >> clz(count)) >> 1) has low bits 0, 0, 1, 3, 7 for count 0, 1, 2-3, 4-7, >= 8.
+ // A register-specified lsr uses the shift amount modulo 64, so clz == 64 (count == 0)
+ // behaves as a shift by 0. The cap is only valid for count < (1 << 61).
+ // Only bits 0-2 of x3 are tested below, so its upper bits are left unmasked.
+ // This deliberately differs from the "(dst & 0x7) & (~7 >> clz)" formula in the
+ // pseudo-code header: that form can leave dst unaligned (the DC ZVA alignment loop
+ // below would then never terminate) and can store more than count bytes when count < 8.
+ movn x4, #15 // x4 = ~15
+ clz x5, x2
+ neg x3, x3 // low 3 bits = 8 - (dst & 7)
+ lsr x4, x4, x5
+ and x3, x3, x4, lsr #1

+ tbz x3, #0, LOCAL_LABEL(JIT_MemSet_0x2c)
+ strb w8, [x0], #1
+ sub x2, x2, #1
LOCAL_LABEL(JIT_MemSet_0x2c):
+ tbz x3, #1, LOCAL_LABEL(JIT_MemSet_0x5c)
+ strh w8, [x0], #2
+ sub x2, x2, #2
LOCAL_LABEL(JIT_MemSet_0x5c):
+ tbz x3, #2, LOCAL_LABEL(JIT_MemSet_0x80)
+ str w8, [x0], #4
+ sub x2, x2, #4
LOCAL_LABEL(JIT_MemSet_0x80):
+ // DC ZVA path, taken only when x11 != 0. That implies count >= 8, so the prologue
+ // above was not capped and x0 is 8-byte aligned here.
+ cbz x11, LOCAL_LABEL(JIT_MemSet_0x9c)
+ // Make x0 16-byte aligned, then go straight to the block-alignment test so that an
+ // already block-aligned dst is not filled with stp stores.
+ tbz x0, #3, LOCAL_LABEL(JIT_MemSet_0x85)
+ str x8, [x0], #8
+ sub x2, x2, #8

+ b LOCAL_LABEL(JIT_MemSet_0x85)
LOCAL_LABEL(JIT_MemSet_0x84):
+ stp x8, x8, [x0], #16
+ sub x2, x2, #16
LOCAL_LABEL(JIT_MemSet_0x85):
+ tst x0, x10 // loop until x0 is a multiple of the DC ZVA block size
+ b.ne LOCAL_LABEL(JIT_MemSet_0x84)

+ b LOCAL_LABEL(JIT_MemSet_0x8a)
LOCAL_LABEL(JIT_MemSet_0x88):
+ dc zva, x0 // zero one whole block at x0
+ add x0, x0, x9
LOCAL_LABEL(JIT_MemSet_0x8a):
+ subs x2, x2, x9 // x2 is biased by -size while looping (signed compare)
+ b.ge LOCAL_LABEL(JIT_MemSet_0x88)

LOCAL_LABEL(JIT_MemSet_0x8c):
+ add x2, x2, x9 // undo the bias: x2 = bytes still to write

LOCAL_LABEL(JIT_MemSet_0x9c):
- cmp x10,x8
- beq LOCAL_LABEL(JIT_MemSet_0xac)
- str x9,[x10],#8
- b LOCAL_LABEL(JIT_MemSet_0x9c)
-LOCAL_LABEL(JIT_MemSet_0xac):
- mov x8,#-8
- madd x2,x11,x8,x2
- add x0,x0,x11, lsl #3
-LOCAL_LABEL(JIT_MemSet_0xb8):
- tbz x2,#2,LOCAL_LABEL(JIT_MemSet_0xc4)
- str w9,[x0]
- add x0,x0,#4
-LOCAL_LABEL(JIT_MemSet_0xc4):
- tbz x2,#1,LOCAL_LABEL(JIT_MemSet_0xd0)
- strh w9,[x0]
- add x0,x0,#2
-LOCAL_LABEL(JIT_MemSet_0xd0):
- tbz x2,#0,LOCAL_LABEL(JIT_MemSet_0xd8)
- strb w9,[x0]
-LOCAL_LABEL(JIT_MemSet_0xd8):
+ // Main loop: 16 bytes per iteration while at least 16 bytes remain.
+ b LOCAL_LABEL(JIT_MemSet_0xa8)
LOCAL_LABEL(JIT_MemSet_0xa0):
+ stp x8, x8, [x0], #16
LOCAL_LABEL(JIT_MemSet_0xa8):
+ subs x2, x2, #16
+ b.ge LOCAL_LABEL(JIT_MemSet_0xa0)

+ // Tail: x2 = remaining - 16 here, and subtracting 16 leaves the low four bits
+ // unchanged, so bits 3..0 of x2 select the final 8/4/2/1-byte stores.
LOCAL_LABEL(JIT_MemSet_0xb0):
+ tbz x2, #3, LOCAL_LABEL(JIT_MemSet_0xb4)
+ str x8, [x0], #8
LOCAL_LABEL(JIT_MemSet_0xb4):
+ tbz x2, #2, LOCAL_LABEL(JIT_MemSet_0xc8)
+ str w8, [x0], #4
LOCAL_LABEL(JIT_MemSet_0xc8):
+ tbz x2, #1, LOCAL_LABEL(JIT_MemSet_0xdc)
+ strh w8, [x0], #2
LOCAL_LABEL(JIT_MemSet_0xdc):
+ tbz x2, #0, LOCAL_LABEL(JIT_MemSet_0xe8)
+ strb w8, [x0]
LOCAL_LABEL(JIT_MemSet_0xe8):
ret lr
LEAF_END_MARKED JIT_MemSet, _TEXT
@@ -157,60 +225,74 @@ LEAF_END_MARKED JIT_MemSet, _TEXT
// // If not aligned then make it 8-byte aligned
// if(((uintptr_t)dst&0x7) != 0)
// {
-// if(((uintptr_t)dst&0x3) == 0)
+// // Calculate alignment we can do without exceeding count
+// // Use math to avoid introducing more unpredictable branches
+// // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
+// // Note logic will fail if count >= (1 << 61). But this exceeds max physical memory for arm64
+// uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
+//
+// if(align&0x1)
// {
-// *(UINT*)dst = *(UINT*)src;
-// dst = (UINT*)dst + 1;
-// src = (UINT*)src + 1;
-// count-=4;
+// *(uint8_t*)dst = *(uint8_t*)src;
+// dst = (uint8_t*)dst + 1;
+// src = (uint8_t*)src + 1;
+// count-=1;
// }
-// else if(((uintptr_t)dst&0x1) == 0)
+//
+// if(align&0x2)
// {
-// while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-// {
-// *(short*)dst = *(short*)src;
-// dst = (short*)dst + 1;
-// src = (short*)src + 1;
-// count-=2;
-// }
+// *(uint16_t*)dst = *(uint16_t*)src;
+// dst = (uint16_t*)dst + 1;
+// src = (uint16_t*)src + 1;
+// count-=2;
// }
-// else
+//
+// if(align&0x4)
// {
-// while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-// {
-// *(char*)dst = *(char*)src;
-// dst = (char*)dst + 1;
-// src = (char*)src + 1;
-// count--;
-// }
+// *(uint32_t*)dst = *(uint32_t*)src;
+// dst = (uint32_t*)dst + 1;
+// src = (uint32_t*)src + 1;
+// count-=4;
// }
// }
//
-// while(count >= 8)
+// count-=16;
+//
+// while(count >= 0)
+// {
+// *(uint64_t*)dst = *(uint64_t*)src;
+// dst = (uint64_t*)dst + 1;
+// src = (uint64_t*)src + 1;
+// *(uint64_t*)dst = *(uint64_t*)src;
+// dst = (uint64_t*)dst + 1;
+// src = (uint64_t*)src + 1;
+// count-=16;
+// }
+//
+// if(count & 8)
// {
-// *(uintptr_t*)dst = *(uintptr_t*)src;
-// dst = (uintptr_t*)dst + 1;
-// src = (uintptr_t*)src + 1;
-// count-=8;
+// *(uint64_t*)dst = *(uint64_t*)src;
+// dst = (uint64_t*)dst + 1;
+// src = (uint64_t*)src + 1;
// }
//
// if(count & 4)
// {
-// *(UINT*)dst = *(UINT*)src;
-// dst = (UINT*)dst + 1;
-// src = (UINT*)src + 1;
+// *(uint32_t*)dst = *(uint32_t*)src;
+// dst = (uint32_t*)dst + 1;
+// src = (uint32_t*)src + 1;
// }
//
// if(count & 2)
// {
-// *(short*)dst = *(short*)src;
-// dst = (short*)dst + 1;
-// src = (short*)src + 1;
+// *(uint16_t*)dst = *(uint16_t*)src;
+// dst = (uint16_t*)dst + 1;
+// src = (uint16_t*)src + 1;
// }
//
// if(count & 1)
// {
-// *(char*)dst = *(char*)src;
+// *(uint8_t*)dst = *(uint8_t*)src;
// }
//
//
@@ -218,69 +300,49 @@ LEAF_END_MARKED JIT_MemSet, _TEXT
// Assembly code corresponding to above C++ method.
// See comments above for JIT_MemSet method
LEAF_ENTRY JIT_MemCpy, _TEXT
- and x8,x0,#7
- cbz x8,LOCAL_LABEL(JIT_MemCpy_0x80)
- and x8,x0,#3
- cbnz x8,LOCAL_LABEL(JIT_MemCpy_0x2c)
- ldr w8,[x1]
- str w8,[x0]
- add x0,x0,#4
- add x1,x1,#4
- mov x8,#-4
- add x2,x2,x8
- b LOCAL_LABEL(JIT_MemCpy_0x80)
+ // Register roles (only x0-x5, x8, x9 are used; the stack is not touched):
+ // x0 = dst cursor, x1 = src cursor, x2 = remaining byte count
+ // x3 = bytes to copy to bring dst to 8-byte alignment, x4/x5 = scratch, x8/x9 = data
+ // Only dst is aligned; the alignment of src is not examined.
+ //
+ // x3 is capped so that the prologue never copies more than count bytes. The distance
+ // to the next 8-byte boundary is (-dst) & 7, not dst & 7.
+ // Cap: ((~15 >> clz(count)) >> 1) has low bits 0, 0, 1, 3, 7 for count 0, 1, 2-3, 4-7, >= 8.
+ // A register-specified lsr uses the shift amount modulo 64, so clz == 64 (count == 0)
+ // behaves as a shift by 0. The cap is only valid for count < (1 << 61).
+ // Only bits 0-2 of x3 are tested below, so its upper bits are left unmasked.
+ // This deliberately differs from the "(dst & 0x7) & (~7 >> clz)" formula in the
+ // pseudo-code header, which can leave dst unaligned and can copy more than count
+ // bytes when count < 8.
+ ands x3, x0, #7 // x3 = dst & 7; Z set when dst is already 8-byte aligned
+ movn x4, #15 // x4 = ~15
+ clz x5, x2
+ b.eq LOCAL_LABEL(JIT_MemCpy_0xa8) // flags still come from the ands above
+ neg x3, x3 // low 3 bits = 8 - (dst & 7)
+ lsr x4, x4, x5
+ and x3, x3, x4, lsr #1
+ tbz x3, #0, LOCAL_LABEL(JIT_MemCpy_0x2c)
+ ldrb w8, [x1], #1
+ strb w8, [x0], #1
+ sub x2, x2, #1
LOCAL_LABEL(JIT_MemCpy_0x2c):
- cbz x2,LOCAL_LABEL(JIT_MemCpy_0x80)
- tbnz x0,#0,LOCAL_LABEL(JIT_MemCpy_0x5c)
-LOCAL_LABEL(JIT_MemCpy_0x34):
- and x8,x0,#7
- cbz x8,LOCAL_LABEL(JIT_MemCpy_0x80)
- ldrsh w8,[x1]
- strh w8,[x0]
- add x0,x0,#2
- add x1,x1,#2
- mov x8,#-2
- add x2,x2,x8
- cbnz x2,LOCAL_LABEL(JIT_MemCpy_0x34)
- b LOCAL_LABEL(JIT_MemCpy_0x80)
+ tbz x3, #1, LOCAL_LABEL(JIT_MemCpy_0x5c)
+ ldrh w8, [x1], #2
+ strh w8, [x0], #2
+ sub x2, x2, #2
LOCAL_LABEL(JIT_MemCpy_0x5c):
- and x8,x0,#7
- cbz x8,LOCAL_LABEL(JIT_MemCpy_0x80)
- ldrsb w8,[x1]
- strb w8,[x0]
- add x0,x0,#1
- add x1,x1,#1
- mov x8,#-1
- add x2,x2,x8
- cbnz x2,LOCAL_LABEL(JIT_MemCpy_0x5c)
-LOCAL_LABEL(JIT_MemCpy_0x80):
- cmp x2,#8
- blo LOCAL_LABEL(JIT_MemCpy_0xb4)
- lsr x9,x2,#3
- mov x8,#-8
- madd x2,x9,x8,x2
+ tbz x3, #2, LOCAL_LABEL(JIT_MemCpy_0xa8)
+ ldr w8, [x1], #4
+ str w8, [x0], #4
+ sub x2, x2, #4
+ b LOCAL_LABEL(JIT_MemCpy_0xa8)
LOCAL_LABEL(JIT_MemCpy_0xa0):
- ldr x8,[x1],#8
- str x8,[x0],#8
- mov x8,#-1
- add x9,x9,x8
- cbnz x9,LOCAL_LABEL(JIT_MemCpy_0xa0)
+ // Main loop: 16 bytes per iteration while at least 16 bytes remain.
+ ldp x8, x9, [x1], #16
+ stp x8, x9, [x0], #16
LOCAL_LABEL(JIT_MemCpy_0xa8):
+ subs x2, x2, #16 // signed compare: x2 goes negative when fewer than 16 bytes remain
+ b.ge LOCAL_LABEL(JIT_MemCpy_0xa0)
+ // Tail: x2 = remaining - 16 here, and subtracting 16 leaves the low four bits
+ // unchanged, so bits 3..0 of x2 select the final 8/4/2/1-byte copies.
LOCAL_LABEL(JIT_MemCpy_0xb0):
+ tbz x2, #3, LOCAL_LABEL(JIT_MemCpy_0xb4)
+ ldr x8, [x1], #8
+ str x8, [x0], #8
LOCAL_LABEL(JIT_MemCpy_0xb4):
- tbz x2,#2,LOCAL_LABEL(JIT_MemCpy_0xc8)
- ldr w8,[x1]
- str w8,[x0]
- add x0,x0,#4
- add x1,x1,#4
+ tbz x2, #2, LOCAL_LABEL(JIT_MemCpy_0xc8)
+ ldr w8, [x1], #4
+ str w8, [x0], #4
LOCAL_LABEL(JIT_MemCpy_0xc8):
- tbz x2,#1,LOCAL_LABEL(JIT_MemCpy_0xdc)
- ldrsh w8,[x1]
- strh w8,[x0]
- add x0,x0,#2
- add x1,x1,#2
+ tbz x2, #1, LOCAL_LABEL(JIT_MemCpy_0xdc)
+ ldrh w8, [x1], #2
+ strh w8, [x0], #2
LOCAL_LABEL(JIT_MemCpy_0xdc):
- tbz x2,#0,LOCAL_LABEL(JIT_MemCpy_0xe8)
- ldrsb w8,[x1]
- strb w8,[x0]
+ tbz x2, #0, LOCAL_LABEL(JIT_MemCpy_0xe8)
+ ldrb w8, [x1]
+ strb w8, [x0]
LOCAL_LABEL(JIT_MemCpy_0xe8):
ret lr
LEAF_END_MARKED JIT_MemCpy, _TEXT