diff options
Diffstat (limited to 'src/vm/arm/memcpy_crt.asm')
-rw-r--r-- | src/vm/arm/memcpy_crt.asm | 1001 |
1 file changed, 1001 insertions, 0 deletions
diff --git a/src/vm/arm/memcpy_crt.asm b/src/vm/arm/memcpy_crt.asm new file mode 100644 index 0000000000..5e3a97e3fa --- /dev/null +++ b/src/vm/arm/memcpy_crt.asm @@ -0,0 +1,1001 @@
; Licensed to the .NET Foundation under one or more agreements.
; The .NET Foundation licenses this file to you under the MIT license.
; See the LICENSE file in the project root for more information.

;

;

#include "ksarm.h"

#if !defined PF_ARM_EXTERNAL_CACHE_AVAILABLE
#define PF_ARM_EXTERNAL_CACHE_AVAILABLE 0x1a
#endif

#if !defined(_BOOTCRT_)

        DATAAREA

; Function-pointer cells used to dispatch the large-copy paths. Both start
; out pointing at __memcpy_decide, which selects the integer or NEON variant
; on first use and rewrites these cells so later calls jump there directly.
__memcpy_forward_large_func dcd __memcpy_decide
        EXPORT __memcpy_forward_large_func
__memcpy_reverse_large_func dcd __memcpy_decide
        EXPORT __memcpy_reverse_large_func

#endif

        AREA |.text|,ALIGN=5,CODE,READONLY

;
; void *memcpy(void *dst, const void *src, size_t length)
;
; Copy a block of memory in a forward direction.
; In: r0 = dst, r1 = src, r2 = length.  Returns dst in r0.
;

        ALIGN 32
        LEAF_ENTRY memcpy

        ALTERNATE_ENTRY __memcpy_forward_new

        pld     [r1]                            ; preload the first cache line
        cmp     r2, #16                         ; less than 16 bytes?
        mov     r3, r0                          ; copy via r3 so r0 survives as the return value
        bhs     CpyLrge                         ; 16 bytes or more: take the large-copy path

; Small copies (0..15 bytes): TBB dispatch into straight-line sequences
; that move the exact byte count with the minimum number of loads/stores.
; Each table entry is a halfword offset from CTable, hence the "/ 2".
CpySmal tbb     [pc, r2]                        ; branch to specialized bits for small copies
__SwitchTable1_Copy
CTable  dcb     (Copy0 - CTable) / 2            ; 0B
        dcb     (Copy1 - CTable) / 2            ; 1B
        dcb     (Copy2 - CTable) / 2            ; 2B
        dcb     (Copy3 - CTable) / 2            ; 3B
        dcb     (Copy4 - CTable) / 2            ; 4B
        dcb     (Copy5 - CTable) / 2            ; 5B
        dcb     (Copy6 - CTable) / 2            ; 6B
        dcb     (Copy7 - CTable) / 2            ; 7B
        dcb     (Copy8 - CTable) / 2            ; 8B
        dcb     (Copy9 - CTable) / 2            ; 9B
        dcb     (Copy10 - CTable) / 2           ; 10B
        dcb     (Copy11 - CTable) / 2           ; 11B
        dcb     (Copy12 - CTable) / 2           ; 12B
        dcb     (Copy13 - CTable) / 2           ; 13B
        dcb     (Copy14 - CTable) / 2           ; 14B
        dcb     (Copy15 - CTable) / 2           ; 15B
__SwitchTableEnd_Copy

Copy1   ldrb    r2, [r1]
        strb    r2, [r3]
Copy0   bx      lr

Copy2   ldrh    r2, [r1]
        strh    r2, [r3]
        bx      lr

Copy3   ldrh    r2, [r1]
        ldrb    r1, [r1, #2]
        strh    r2, [r3]
        strb    r1, [r3, #2]
        bx      lr

Copy4   ldr     r2, [r1]
        str     r2, [r3]
        bx      lr

Copy5   ldr     r2, [r1]
        ldrb    r1, [r1, #4]
        str     r2, [r3]
        strb    r1, [r3, #4]
        bx      lr

Copy6   ldr     r2, [r1]
        ldrh    r1, [r1, #4]
        str     r2, [r3]
        strh    r1, [r3, #4]
        bx      lr

Copy7   ldr     r12, [r1]
        ldrh    r2, [r1, #4]
        ldrb    r1, [r1, #6]
        str     r12, [r3]
        strh    r2, [r3, #4]
        strb    r1, [r3, #6]
        bx      lr

Copy8   ldr     r2, [r1]
        ldr     r1, [r1, #4]
        str     r2, [r3]
        str     r1, [r3, #4]
        bx      lr

Copy9   ldr     r12, [r1]
        ldr     r2, [r1, #4]
        ldrb    r1, [r1, #8]
        str     r12, [r3]
        str     r2, [r3, #4]
        strb    r1, [r3, #8]
        bx      lr

Copy10  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        ldrh    r1, [r1, #8]
        str     r12, [r3]
        str     r2, [r3, #4]
        strh    r1, [r3, #8]
        bx      lr

Copy11  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        str     r12, [r3]
        str     r2, [r3, #4]
        ldrh    r2, [r1, #8]
        ldrb    r1, [r1, #10]
        strh    r2, [r3, #8]
        strb    r1, [r3, #10]
        bx      lr

Copy12  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        ldr     r1, [r1, #8]
        str     r12, [r3]
        str     r2, [r3, #4]
        str     r1, [r3, #8]
        bx      lr

Copy13  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        str     r12, [r3]
        str     r2, [r3, #4]
        ldr     r2, [r1, #8]
        ldrb    r1, [r1, #12]
        str     r2, [r3, #8]
        strb    r1, [r3, #12]
        bx      lr

Copy14  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        str     r12, [r3]
        str     r2, [r3, #4]
        ldr     r2, [r1, #8]
        ldrh    r1, [r1, #12]
        str     r2, [r3, #8]
        strh    r1, [r3, #12]
        bx      lr

Copy15  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        str     r12, [r3]
        str     r2, [r3, #4]
        ldr     r12, [r1, #8]
        ldrh    r2, [r1, #12]
        ldrb    r1, [r1, #14]
        str     r12, [r3, #8]
        strh    r2, [r3, #12]
        strb    r1, [r3, #14]
        bx      lr

CpyLrge

#if defined(_BOOTCRT_)

        b       __memcpy_forward_large_integer  ; always use integer in boot code

#else

        eor     r12, r0, r1                     ; see if src/dst are equally aligned
        tst     r12, #3                         ; at least to a 4 byte boundary
        bne     __memcpy_forward_large_neon     ; if not, always use NEON
        mov32   r12, __memcpy_forward_large_func ; otherwise, load the large function pointer
        ldr     pc, [r12]                       ; and jump through it (r12 stays live: __memcpy_decide re-reads [r12])

#endif

        LEAF_END memcpy


;
; __memcpy_forward_large_integer (internal calling convention)
;
; Copy large (>= 16 bytes) blocks of memory in a forward direction,
; using integer registers only.
; In: r1 = src, r2 = length, r3 = dst (r0 preserved as the return value).
;

        ALIGN 32
        NESTED_ENTRY __memcpy_forward_large_integer_wrapper

__memcpy_forward_large_integer

        PROLOG_NOP lsls r12, r3, #31            ; C = bit 1, N = bit 0 of the destination
        PROLOG_PUSH {r4-r9, r11, lr}

;
; Align destination to a word boundary
;

        bpl     %F1                             ; dest already even? skip the byte copy
        ldrb    r4, [r1], #1                    ; fetch byte
        subs    r2, r2, #1                      ; decrement count
        strb    r4, [r3], #1                    ; store byte
        lsls    r12, r3, #31                    ; compute updated status
1
        bcc     %F2                             ; if already aligned, just skip ahead
        ldrh    r4, [r1], #2                    ; fetch halfword
        subs    r2, r2, #2                      ; decrement count
        strh    r4, [r3], #2                    ; store halfword
2
        tst     r1, #3                          ; is the source now word-aligned?
; __memcpy_forward_large_integer (continued): destination is now word-aligned.
        bne     %F20                            ; if not, we have to use the slow path

;
; Source is word-aligned; fast case
;

10
        subs    r2, r2, #32                     ; take 32 off the top
        blo     %F13                            ; if not enough, recover and do small copies
        subs    r2, r2, #32                     ; take off another 32
        pld     [r1, #32]                       ; pre-load one block ahead
        blo     %F12                            ; skip the loop if that's all we have
11
        pld     [r1, #64]                       ; prefetch ahead
        subs    r2, r2, #32                     ; count the bytes for this block
        ldm     r1!, {r4-r9, r12, lr}           ; load 32 bytes
        stm     r3!, {r4-r9, r12, lr}           ; store 32 bytes
        bhs     %B11                            ; keep going until we're done
12
        ldm     r1!, {r4-r9, r12, lr}           ; load 32 bytes
        stm     r3!, {r4-r9, r12, lr}           ; store 32 bytes
13
        adds    r2, r2, #(32 - 8)               ; recover original count, and pre-decrement
        blo     %F15                            ; if not enough remaining, skip this loop
14
        subs    r2, r2, #8                      ; decrement count
        ldrd    r4, r5, [r1], #8                ; fetch pair of words
        strd    r4, r5, [r3], #8                ; store pair of words
        bhs     %B14                            ; loop while we still have data remaining
15
        adds    r2, r2, #8                      ; recover final count (Z set iff nothing left)

        EPILOG_POP {r4-r9, r11, lr}
        EPILOG_NOP bne CpySmal                  ; if some left, continue with small
        EPILOG_RETURN                           ; else just return

;
; Source is not word-aligned; slow case
;

20
        subs    r2, r2, #64                     ; pre-decrement to simplify the loop
        blo     %23                             ; skip over the loop if we don't have enough
        pld     [r1, #32]                       ; pre-load one block ahead
21
        pld     [r1, #64]                       ; prefetch ahead
        ldr     r4, [r1, #0]                    ; load 32 bytes using single word loads
        ldr     r5, [r1, #4]                    ;  (source is not 8-byte aligned here,
        ldr     r6, [r1, #8]                    ;   so ldm/ldrd are avoided)
        ldr     r7, [r1, #12]                   ;
        ldr     r8, [r1, #16]                   ;
        ldr     r9, [r1, #20]                   ;
        ldr     r12, [r1, #24]                  ;
        ldr     lr, [r1, #28]                   ;
        adds    r1, r1, #32                     ; update pointer
        subs    r2, r2, #32                     ; count the bytes for this block
        stm     r3!, {r4-r9, r12, lr}           ; store 32 bytes (dest IS word-aligned)
        bhs     %B21                            ; keep going until we're done
23
        adds    r2, r2, #(64 - 8)               ; recover original count, and pre-decrement
        blo     %F25                            ; if not enough remaining, skip this loop
24
        ldr     r4, [r1]                        ; fetch pair of words
        ldr     r5, [r1, #4]                    ;
        adds    r1, r1, #8                      ; update pointer
        subs    r2, r2, #8                      ; decrement count
        strd    r4, r5, [r3], #8                ; store pair of words
        bhs     %B24                            ; loop while we still have data remaining
25
        adds    r2, r2, #8                      ; recover final count

        EPILOG_POP {r4-r9, r11, lr}
        EPILOG_NOP bne CpySmal                  ; if some left, continue with small
        EPILOG_RETURN                           ; else just return

        NESTED_END __memcpy_forward_large_integer


;
; __memcpy_forward_large_neon (internal calling convention)
;
; Copy large (>= 16 bytes) blocks of memory in a forward direction,
; using NEON registers.
; In: r1 = src, r2 = length, r3 = dst (r0 preserved as the return value).
;

#if !defined(_BOOTCRT_)

        ALIGN 32
        NESTED_ENTRY __memcpy_forward_large_neon_wrapper

__memcpy_forward_large_neon

        PROLOG_PUSH {r4-r5, r11, lr}

        subs    r2, r2, #32                     ; pre-decrement to simplify the loop
        blo     %F13                            ; skip over the loop if we don't have enough
        subs    r2, r2, #32                     ; pre-decrement to simplify the loop
        pld     [r1, #32]                       ; pre-load one block ahead
        blo     %F12                            ; skip over the loop if we don't have enough
11
        pld     [r1, #64]                       ; prefetch ahead
        subs    r2, r2, #32                     ; count the bytes for this block
        vld1.8  {d0-d3}, [r1]!                  ; load 32 bytes
        vst1.8  {d0-d3}, [r3]!                  ; store 32 bytes
        bhs     %B11                            ; keep going until we're done
12
        vld1.8  {d0-d3}, [r1]!                  ; load 32 bytes
        vst1.8  {d0-d3}, [r3]!
; store 32 bytes (comment for the trailing vst1.8 of __memcpy_forward_large_neon)
13
        adds    r2, r2, #(32 - 8)               ; recover original count, and pre-decrement
        blo     %F15                            ; if not enough remaining, skip this loop
14
        ldr     r4, [r1]                        ; fetch pair of words
        ldr     r5, [r1, #4]                    ;
        adds    r1, r1, #8                      ; update pointer
        str     r4, [r3]                        ; store pair of words
        str     r5, [r3, #4]                    ;
        adds    r3, r3, #8
        subs    r2, r2, #8                      ; decrement count
        bhs     %B14                            ; loop while we still have data remaining
15
        adds    r2, r2, #8                      ; recover final count (Z set iff nothing left)

        EPILOG_POP {r4-r5, r11, lr}
        EPILOG_NOP bne CpySmal                  ; if some left, continue with small
        EPILOG_RETURN                           ; else just return

        NESTED_END __memcpy_forward_large_neon

#endif


;
; void *memmove(void *dst, const void *src, size_t length)
;
; Copy a block of memory in a forward or reverse direction, ensuring that
; overlapping source/destination regions are copied correctly.
; In: r0 = dst, r1 = src, r2 = length.  Returns dst in r0.
;

        ALIGN 32
        LEAF_ENTRY memmove

        subs    r3, r0, r1                      ; compute dest - source
        cmp     r3, r2                          ; unsigned compare against size: dest below
                                                ;  source wraps to a huge value, so forward
                                                ;  copy is chosen whenever it is safe
        bhs     memcpy                          ; if no harmful overlap, just do memcpy

        ALTERNATE_ENTRY __memcpy_reverse_new

        cmp     r2, #16                         ; less than 16 bytes?
        pld     [r1]                            ; preload the first cache line
        bhs     MovLrge                         ; 16 bytes or more: take the large-copy path

; Small moves (0..15 bytes): each sequence performs all loads of a chunk
; before its stores, so the (dst > src) overlap cannot corrupt data.
MovSmal tbb     [pc, r2]                        ; branch to specialized bits for small copies
__SwitchTable1_Move
MTable  dcb     (Move0 - MTable) / 2            ; 0B
        dcb     (Move1 - MTable) / 2            ; 1B
        dcb     (Move2 - MTable) / 2            ; 2B
        dcb     (Move3 - MTable) / 2            ; 3B
        dcb     (Move4 - MTable) / 2            ; 4B
        dcb     (Move5 - MTable) / 2            ; 5B
        dcb     (Move6 - MTable) / 2            ; 6B
        dcb     (Move7 - MTable) / 2            ; 7B
        dcb     (Move8 - MTable) / 2            ; 8B
        dcb     (Move9 - MTable) / 2            ; 9B
        dcb     (Move10 - MTable) / 2           ; 10B
        dcb     (Move11 - MTable) / 2           ; 11B
        dcb     (Move12 - MTable) / 2           ; 12B
        dcb     (Move13 - MTable) / 2           ; 13B
        dcb     (Move14 - MTable) / 2           ; 14B
        dcb     (Move15 - MTable) / 2           ; 15B
__SwitchTableEnd_Move

Move1   ldrb    r2, [r1]
        strb    r2, [r0]
Move0   bx      lr

Move2   ldrh    r2, [r1]
        strh    r2, [r0]
        bx      lr

Move3   ldrh    r2, [r1]
        ldrb    r1, [r1, #2]
        strh    r2, [r0]
        strb    r1, [r0, #2]
        bx      lr

Move4   ldr     r2, [r1]
        str     r2, [r0]
        bx      lr

Move5   ldr     r2, [r1]
        ldrb    r1, [r1, #4]
        str     r2, [r0]
        strb    r1, [r0, #4]
        bx      lr

Move6   ldr     r2, [r1]
        ldrh    r1, [r1, #4]
        str     r2, [r0]
        strh    r1, [r0, #4]
        bx      lr

Move7   ldr     r3, [r1]
        ldrh    r2, [r1, #4]
        ldrb    r1, [r1, #6]
        str     r3, [r0]
        strh    r2, [r0, #4]
        strb    r1, [r0, #6]
        bx      lr

Move8   ldr     r2, [r1]
        ldr     r1, [r1, #4]
        str     r2, [r0]
        str     r1, [r0, #4]
        bx      lr

Move9   ldr     r3, [r1]
        ldr     r2, [r1, #4]
        ldrb    r1, [r1, #8]
        str     r3, [r0]
        str     r2, [r0, #4]
        strb    r1, [r0, #8]
        bx      lr

Move10  ldr     r3, [r1]
        ldr     r2, [r1, #4]
        ldrh    r1, [r1, #8]
        str     r3, [r0]
        str     r2, [r0, #4]
        strh    r1, [r0, #8]
        bx      lr

Move11  ldr     r12, [r1]
        ldr     r3, [r1, #4]
        ldrh    r2, [r1, #8]
        ldrb    r1, [r1, #10]
        str     r12, [r0]
        str     r3, [r0, #4]
        strh    r2, [r0, #8]
        strb    r1, [r0, #10]
        bx      lr

Move12  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        ldr     r1, [r1, #8]
        str     r12, [r0]
        str     r2, [r0, #4]
        str     r1, [r0, #8]
        bx      lr

Move13  ldr     r12, [r1]
        ldr     r3, [r1, #4]
        ldr     r2, [r1, #8]
        ldrb    r1, [r1, #12]
        str     r12, [r0]
        str     r3, [r0, #4]
        str     r2, [r0, #8]
        strb    r1, [r0, #12]
        bx      lr

Move14  ldr     r12, [r1]
        ldr     r3, [r1, #4]
        ldr     r2, [r1, #8]
        ldrh    r1, [r1, #12]
        str     r12, [r0]
        str     r3, [r0, #4]
        str     r2, [r0, #8]
        strh    r1, [r0, #12]
        bx      lr

; Move15 needs more than the available scratch registers, so it copies the
; 3-byte tail first; on this path dst > src, so those stores land above the
; head words that are still to be loaded.
Move15  ldrh    r3, [r1, #12]
        ldrb    r2, [r1, #14]
        strh    r3, [r0, #12]
        strb    r2, [r0, #14]
        ldr     r3, [r1]
        ldr     r2, [r1, #4]
        ldr     r1, [r1, #8]
        str     r3, [r0]
        str     r2, [r0, #4]
        str     r1, [r0, #8]
        bx      lr

MovLrge

#if defined(_BOOTCRT_)

        b       __memcpy_reverse_large_integer  ; always use integer in boot code

#else

        eor     r12, r0, r1                     ; see if src/dst are equally aligned
        tst     r12, #3                         ; at least to a 4 byte boundary
        bne     __memcpy_reverse_large_neon     ; if not, always use NEON
        mov32   r12, __memcpy_reverse_large_func ; otherwise, load the large function pointer
        ldr     pc, [r12]                       ; and jump through it (r12 stays live: __memcpy_decide re-reads [r12])

#endif

        LEAF_END memmove


;
; __memcpy_reverse_large_integer (internal calling convention)
;
; Copy large (>= 16 bytes) block of memory in a reverse direction,
; using integer registers only.
; In: r0 = dst, r1 = src, r2 = length; the region is copied from the end down.
;

        ALIGN 32
        NESTED_ENTRY __memcpy_reverse_large_integer_wrapper

__memcpy_reverse_large_integer

        PROLOG_NOP adds r3, r0, r2              ; advance destination to end
        PROLOG_NOP adds r1, r1, r2              ; advance source to end
        PROLOG_NOP lsls r12, r3, #31            ; C = bit 1, N = bit 0
        PROLOG_NOP pld [r1, #-32]               ; pre-load one block ahead
        PROLOG_PUSH {r4-r9, r11, lr}

;
; Align destination to a word boundary
;

        bpl     %F1                             ; dest end already even? skip the byte copy
        ldrb    r4, [r1, #-1]!                  ; fetch byte
        subs    r2, r2, #1                      ; decrement count
        strb    r4, [r3, #-1]!                  ; store byte
        lsls    r12, r3, #31                    ; compute updated status
1
        bcc     %F2                             ; if already aligned, just skip ahead
        ldrh    r4, [r1, #-2]!                  ; fetch halfword
        subs    r2, r2, #2                      ; decrement count
        strh    r4, [r3, #-2]!                  ; store halfword
2
        tst     r1, #3                          ; is the source now word-aligned?
; __memcpy_reverse_large_integer (continued): destination end is now word-aligned.
        bne     %F20                            ; if not, we have to use the slow path

;
; Source is word-aligned; fast case
;

10
        subs    r2, r2, #32                     ; pre-decrement to simplify the loop
        blo     %F13                            ; skip over the loop if we don't have enough
        subs    r2, r2, #32                     ; pre-decrement to simplify the loop
        pld     [r1, #-64]                      ; pre-load one block ahead
        blo     %F12                            ; skip over the loop if we don't have enough
11
        pld     [r1, #-96]                      ; prefetch ahead
        subs    r2, r2, #32                     ; count the bytes for this block
        ldmdb   r1!, {r4-r9, r12, lr}           ; load 32 bytes
        stmdb   r3!, {r4-r9, r12, lr}           ; store 32 bytes
        bhs     %B11                            ; keep going until we're done
12
        ldmdb   r1!, {r4-r9, r12, lr}           ; load 32 bytes
        stmdb   r3!, {r4-r9, r12, lr}           ; store 32 bytes
13
        adds    r2, r2, #(32 - 8)               ; recover original count, and pre-decrement
        blo     %F15                            ; if not enough remaining, skip this loop
14
        subs    r2, r2, #8                      ; decrement count
        ldrd    r4, r5, [r1, #-8]!              ; fetch pair of words
        strd    r4, r5, [r3, #-8]!              ; store pair of words
        bhs     %B14                            ; loop while we still have data remaining
15
        adds    r2, r2, #8                      ; determine final count
        subs    r1, r1, r2                      ; recover original source: the leftover head
                                                ;  is copied forward by MovSmal via r0/r1

        EPILOG_POP {r4-r9, r11, lr}
        EPILOG_NOP bne MovSmal                  ; if some left, continue with small
        EPILOG_RETURN                           ; else just return


;
; Source is not word-aligned; slow case
;

20
        subs    r2, r2, #64                     ; pre-decrement to simplify the loop
        blo     %F23                            ; skip over the loop if we don't have enough
        pld     [r1, #-64]                      ; pre-load one block ahead
21
        pld     [r1, #-96]                      ; prefetch ahead
        subs    r2, r2, #32                     ; count the bytes for this block
        ldr     r4, [r1, #-32]!                 ; load 32 bytes using single word loads
        ldr     r5, [r1, #4]                    ;  (source is not 8-byte aligned here)
        ldr     r6, [r1, #8]                    ;
        ldr     r7, [r1, #12]                   ;
        ldr     r8, [r1, #16]                   ;
        ldr     r9, [r1, #20]                   ;
        ldr     r12, [r1, #24]                  ;
        ldr     lr, [r1, #28]                   ;
        stmdb   r3!, {r4-r9, r12, lr}           ; store 32 bytes (dest IS word-aligned)
        bhs     %B21                            ; keep going until we're done
23
        adds    r2, r2, #(64 - 8)               ; recover original count, and pre-decrement
        blo     %F25                            ; if not enough remaining, skip this loop
24
        subs    r2, r2, #8                      ; decrement count
        ldr     r4, [r1, #-8]!                  ; fetch pair of words
        ldr     r5, [r1, #4]                    ;
        strd    r4, r5, [r3, #-8]!              ; store pair of words
        bhs     %B24                            ; loop while we still have data remaining
25
        adds    r2, r2, #8                      ; determine final count
        subs    r1, r1, r2                      ; recover original source (see fast case)

        EPILOG_POP {r4-r9, r11, lr}
        EPILOG_NOP bne MovSmal                  ; if some left, continue with small
        EPILOG_RETURN                           ; else just return

        NESTED_END __memcpy_reverse_large_integer


;
; __memcpy_reverse_large_neon (internal calling convention)
;
; Copy large (>= 16 bytes) block of memory in a reverse direction,
; using NEON registers.
; In: r0 = dst, r1 = src, r2 = length; the region is copied from the end down.
;

#if !defined(_BOOTCRT_)

        ALIGN 32
        NESTED_ENTRY __memcpy_reverse_large_neon_wrapper

__memcpy_reverse_large_neon

        PROLOG_NOP adds r3, r0, r2              ; advance destination to end
        PROLOG_NOP adds r1, r1, r2              ; advance source to end
        PROLOG_NOP lsls r12, r3, #31            ; C = bit 1, N = bit 0
        PROLOG_NOP pld [r1, #-32]               ; pre-load one block ahead
        PROLOG_PUSH {r4-r5, r11, lr}

;
; Align destination to a word boundary
;

        bpl     %F1                             ; dest end already even? skip the byte copy
        ldrb    r4, [r1, #-1]!                  ; fetch byte
        subs    r2, r2, #1                      ; decrement count
        strb    r4, [r3, #-1]!                  ; store byte
        lsls    r12, r3, #31                    ; compute updated status
1
        bcc     %F2                             ; if already aligned, just skip ahead
        ldrh    r4, [r1, #-2]!                  ; fetch halfword
        subs    r2, r2, #2                      ; decrement count
        strh    r4, [r3, #-2]!                  ; store halfword
2

;
; Perform main copy
;

        subs    r2, r2, #32                     ; pre-decrement to simplify the loop
        blo     %F13                            ; skip over the loop if we don't have enough
        subs    r2, r2, #32                     ; pre-decrement to simplify the loop
        pld     [r1, #-64]                      ; pre-load one block ahead
        blo     %F12                            ; skip over the loop if we don't have enough
11
        pld     [r1, #-96]                      ; prefetch ahead
        subs    r1, r1, #32                     ; step pointers back first: VLD1/VST1 have
        subs    r3, r3, #32                     ;  no pre-decrement addressing mode
        subs    r2, r2, #32                     ; count the bytes for this block (sets flags for bhs)
        vld1.8  {d0-d3}, [r1]                   ; load 32 bytes
        vst1.8  {d0-d3}, [r3]                   ; store 32 bytes
        bhs     %B11                            ; keep going until we're done
12
        subs    r1, r1, #32
        subs    r3, r3, #32
        vld1.8  {d0-d3}, [r1]                   ; load 32 bytes
        vst1.8  {d0-d3}, [r3]                   ; store 32 bytes
13
        adds    r2, r2, #(32 - 8)               ; recover original count, and pre-decrement
        blo     %F15                            ; if not enough remaining, skip this loop
14
        ldr     r4, [r1, #-8]!                  ; fetch pair of words
        ldr     r5, [r1, #4]                    ;
        subs    r2, r2, #8                      ; decrement count
        str     r4, [r3, #-8]!                  ; store pair of words
        str     r5, [r3, #4]
        bhs     %B14                            ; loop while we still have data remaining
15
        adds    r2, r2, #8                      ; determine final count
        subs    r1, r1, r2                      ; recover original source: the leftover head
                                                ;  is copied forward by MovSmal via r0/r1

        EPILOG_POP {r4-r5, r11, lr}
        EPILOG_NOP bne MovSmal                  ; if some left, continue with small
        EPILOG_RETURN                           ; else just return

        NESTED_END __memcpy_reverse_large_neon

#endif


;
; __memcpy_decide (internal calling convention)
;
; Determine whether to use integer or NEON for future memcpy's.
; Internal convention: r12 = address of the function-pointer cell the caller
; jumped through; this routine stores the chosen target there and re-jumps.
;

#if !defined(_BOOTCRT_)

        ALIGN 32
        NESTED_ENTRY __memcpy_decide_wrapper

__memcpy_decide

        PROLOG_PUSH {r4-r5, r11, lr}

        ;
        ; We want to use integer memcpy's on the A9, which has an external cache.
        ;
        ; First determine if we're in user or kernel mode. Reading CPSR
        ; from user mode will either return the proper 5 mode bits, or all 0s.
        ; Conveniently, user mode is 0x10, and there is no mode 0x00, so if
        ; we read CPSR and the low 4 bits are 0, that's good enough.
;
        mrs     r4, cpsr                        ; get CPSR
        ands    r4, r4, #0xf                    ; isolate the low 4 bits of the mode
        beq     %F1                             ; if 0, we're in user mode

        ;
        ; If we are in kernel mode, read the MIDR directly.
        ;

        CP_READ r4, CP15_MIDR                   ; read main ID register
        ubfx    r5, r4, #24, #8                 ; get implementer
        lsrs    r4, r4, #4                      ; shift off revision field
        cmp     r5, #0x41                       ; is implementer == ARM?
        bne     %F3                             ; if not, use NEON
        bfc     r4, #12, #20                    ; clear upper bits, leaving the part number
        ldr     r5, =0xc09                      ; A9 signature (Cortex-A9 part number)
        cmp     r4, r5                          ; is this an A9?
        bne     %F3                             ; if not, use NEON
        b       %F2                             ; otherwise, use integer

        ;
        ; If we are in user mode, check the "external cache available" flag
        ;
1
        ldr     r4, =MM_SHARED_USER_DATA_VA + UsProcessorFeatures + PF_ARM_EXTERNAL_CACHE_AVAILABLE
        ldrb    r4, [r4]                        ; get external cache bit
        cbz     r4, %F3                         ; if no external cache, do NEON

        ;
        ; Register for integer functions
        ;
2
        ldr     r4, =__memcpy_forward_large_integer ; select integer functions
        ldr     r5, =__memcpy_forward_large_func ;
        str     r4, [r5]                        ;
        ldr     r4, =__memcpy_reverse_large_integer ; select integer functions
        ldr     r5, =__memcpy_reverse_large_func ;
        str     r4, [r5]                        ;
        b       %F4

        ;
        ; Register for NEON functions
        ;
3
        ldr     r4, =__memcpy_forward_large_neon ; select NEON functions
        ldr     r5, =__memcpy_forward_large_func ;
        str     r4, [r5]                        ;
        ldr     r4, =__memcpy_reverse_large_neon ; select NEON functions
        ldr     r5, =__memcpy_reverse_large_func ;
        str     r4, [r5]                        ;
4
        EPILOG_POP {r4-r5, r11, lr}             ; restore saved registers
        EPILOG_NOP ldr pc, [r12]                ; r12 still addresses the caller's function-
                                                ;  pointer cell, now updated: jump to target

        NESTED_END __memcpy_decide

#endif


;
; void _memcpy_strict_align(void *dst, const void *src, size_t length)
;
; Copy a block of memory in a forward direction, only performing naturally-aligned
; accesses.
;
; In: r0 = dst, r1 = src, r2 = length.  r12 carries bytes that have been
; loaded but not yet stored when src/dst alignments differ.
;

        ALIGN 32
        LEAF_ENTRY _memcpy_strict_align

;
; Verify alignment between source and destination
;

        sub     r3, r0, r1                      ; get relative alignment of source and destination
        cbz     r2, CopyExit                    ; exit if 0 count
        ands    r3, r3, #3                      ; check DWORD alignment
        bne     CopyMisalignedHalf              ; misaligned

;
; Source and destination are equally aligned: just align the
; destination and the source will end up aligned as well
;

        tst     r0, #3                          ; dword aligned at the dest?
        beq     WordAligned_0                   ; if so, skip ahead
        tst     r0, #1                          ; halfword aligned at the dest?
        beq     HalfAligned_0                   ; if so, skip ahead

        subs    r2, r2, #1                      ; decrement count
        ldrb    r3, [r1], #1                    ; fetch byte
        strb    r3, [r0], #1                    ; store it
        beq     CopyExit                        ; stop if done
        tst     r0, #3                          ; word aligned now?
        beq     WordAligned_0                   ; if so, skip ahead

HalfAligned_0
        cmp     r2, #2                          ; do we have at least 2 bytes left?
        blo     CopyFinalBytes                  ; if not, copy bytes
        subs    r2, r2, #2                      ; decrement count
        ldrh    r3, [r1], #2                    ; fetch halfword
        strh    r3, [r0], #2                    ; store it
        beq     CopyExit                        ; stop if done

WordAligned_0
        subs    r2, r2, #4                      ; at least 4 bytes remaining?
        blt     WordLoopEnd_0                   ; if not, skip the main loop
WordLoop_0
        subs    r2, r2, #4                      ; decrement count
        ldr     r3, [r1], #4                    ; fetch word
        str     r3, [r0], #4                    ; store it
        bge     WordLoop_0                      ; loop while whole words remain
WordLoopEnd_0
        adds    r2, r2, #4                      ; recover the extra 4 we subtracted
        beq     CopyExit                        ; stop if that's everything

CopyFinalHalfwords
        subs    r2, r2, #2                      ; at least 2 bytes remaining?
        blt     CopyFinalHalfwordsEnd           ; if not, skip this
CopyFinalHalfwordsLoop
        subs    r2, r2, #2                      ; decrement count
        ldrh    r3, [r1], #2                    ; fetch halfword
        strh    r3, [r0], #2                    ; store it
        bge     CopyFinalHalfwordsLoop          ; loop until done
CopyFinalHalfwordsEnd
        adds    r2, r2, #2                      ; recover the extra 2 we subtracted
        beq     CopyExit                        ; stop if that's everything

CopyFinalBytes
        subs    r2, r2, #1                      ; decrement count
        ldrb    r3, [r1], #1                    ; fetch byte
        strb    r3, [r0], #1                    ; store it
        bne     CopyFinalBytes                  ; loop until done
CopyExit
        bx      lr                              ; return


;
; Source and destination are misaligned by 2 bytes
;

CopyMisalignedHalf
        cmp     r3, #2                          ; misaligned by a halfword?
        bne     CopyMisalignedByte              ; if not, skip

        tst     r0, #3                          ; dword aligned at the dest?
        beq     WordAligned_2                   ; if so, skip ahead
        tst     r0, #1                          ; halfword aligned at the dest?
        beq     HalfAligned_2                   ; if so, skip ahead

        subs    r2, r2, #1                      ; decrement count
        ldrb    r3, [r1], #1                    ; fetch byte
        strb    r3, [r0], #1                    ; store it
        beq     CopyExit                        ; stop if done
        tst     r0, #3                          ; word aligned now?
        beq     WordAligned_2                   ; if so, skip ahead

HalfAligned_2
        cmp     r2, #2                          ; do we have at least 2 bytes left?
        blo     CopyFinalBytes                  ; if not, copy bytes
        subs    r2, r2, #2                      ; decrement count
        ldrh    r3, [r1], #2                    ; fetch halfword
        strh    r3, [r0], #2                    ; store it
        beq     CopyExit                        ; stop if done

WordAligned_2
        subs    r2, r2, #6                      ; at least 6 bytes remaining?
        blt     WordLoopEnd_2                   ; if not, skip the main loop
        ldrh    r12, [r1], #2                   ; preload a halfword of source
        subs    r2, r2, #2                      ; count these 2 bytes
WordLoop_2
        subs    r2, r2, #4                      ; decrement count
        ldr     r3, [r1], #4                    ; fetch word
        orr     r12, r12, r3, lsl #16           ; copy low 16 bits to upper 16 of r12
        str     r12, [r0], #4                   ; store it
        lsr     r12, r3, #16                    ; copy upper 16 bits to lower 16 of r12
        bge     WordLoop_2                      ; loop while whole words remain
        strh    r12, [r0], #2                   ; flush the pending halfword to the dest
WordLoopEnd_2
        adds    r2, r2, #6                      ; recover the extra 6 we subtracted
        beq     CopyExit                        ; stop if that's everything
        b       CopyFinalHalfwords              ; otherwise, copy remainder


;
; Source and destination are misaligned by 1 byte
;

CopyMisalignedByte
        cmp     r3, #1                          ; misaligned by a byte?
        bne     CopyMisalignedByte3             ; if not, skip

        tst     r0, #3                          ; dword aligned at the dest?
        beq     WordAligned_1                   ; if so, skip ahead
ByteAlign_1
        subs    r2, r2, #1                      ; decrement count
        ldrb    r3, [r1], #1                    ; fetch byte
        strb    r3, [r0], #1                    ; store it
        beq     CopyExit                        ; stop if done
        tst     r0, #3                          ; word aligned now?
        bne     ByteAlign_1                     ; if not, keep copying bytes

WordAligned_1
        subs    r2, r2, #5                      ; at least 5 bytes remaining?
        blt     WordLoopEnd_1                   ; if not, skip the main loop
        ldrb    r12, [r1], #1                   ; preload a byte of source
        subs    r2, r2, #1                      ; count this byte
WordLoop_1
        subs    r2, r2, #4                      ; decrement count
        ldr     r3, [r1], #4                    ; fetch word
        orr     r12, r12, r3, lsl #8            ; copy low 24 bits to upper 24 of r12
        str     r12, [r0], #4                   ; store it
        lsr     r12, r3, #24                    ; copy upper 8 bits to lower 8 of r12
        bge     WordLoop_1                      ; loop while whole words remain
        strb    r12, [r0], #1                   ; flush the pending byte to the dest
WordLoopEnd_1
        adds    r2, r2, #5                      ; recover the extra 5 we subtracted
        beq     CopyExit                        ; stop if that's everything
        b       CopyFinalBytes                  ; otherwise, copy remainder


;
; Source and destination are misaligned by 3 bytes
;

CopyMisalignedByte3
        tst     r0, #3                          ; dword aligned at the dest?
        beq     WordAligned_3                   ; if so, skip ahead
ByteAlign_3
        subs    r2, r2, #1                      ; decrement count
        ldrb    r3, [r1], #1                    ; fetch byte
        strb    r3, [r0], #1                    ; store it
        beq     CopyExit                        ; stop if done
        tst     r0, #3                          ; word aligned now?
        bne     ByteAlign_3                     ; if not, keep copying bytes

WordAligned_3
        subs    r2, r2, #7                      ; at least 7 bytes remaining?
        blt     WordLoopEnd_3                   ; if not, skip the main loop
        ldrb    r12, [r1], #1                   ; preload a byte of source
        ldrh    r3, [r1], #2                    ; preload a halfword of source
        orr     r12, r12, r3, lsl #8            ; OR in the halfword
        subs    r2, r2, #3                      ; count these 3 bytes
WordLoop_3
        subs    r2, r2, #4                      ; decrement count
        ldr     r3, [r1], #4                    ; fetch word
        orr     r12, r12, r3, lsl #24           ; copy low 8 bits to upper 8 of r12
        str     r12, [r0], #4                   ; store it
        lsr     r12, r3, #8                     ; copy upper 24 bits to lower 24 of r12
        bge     WordLoop_3                      ; loop while whole words remain
        strh    r12, [r0], #2                   ; flush the pending halfword to the dest
        lsr     r12, r12, #16                   ; down to the final byte
        strb    r12, [r0], #1                   ; flush the pending byte to the dest
WordLoopEnd_3
        adds    r2, r2, #7                      ; recover the extra 7 we subtracted
        beq     CopyExit                        ; stop if that's everything
        b       CopyFinalBytes                  ; otherwise, copy remainder

        LEAF_END _memcpy_strict_align

        END