; Licensed to the .NET Foundation under one or more agreements.
; The .NET Foundation licenses this file to you under the MIT license.
; See the LICENSE file in the project root for more information.
;

#include "ksarm.h"

#if !defined PF_ARM_EXTERNAL_CACHE_AVAILABLE
#define PF_ARM_EXTERNAL_CACHE_AVAILABLE 0x1a
#endif

#if !defined(_BOOTCRT_)

        DATAAREA

__memcpy_forward_large_func dcd __memcpy_decide
        EXPORT __memcpy_forward_large_func

__memcpy_reverse_large_func dcd __memcpy_decide
        EXPORT __memcpy_reverse_large_func

#endif

        AREA |.text|,ALIGN=5,CODE,READONLY

;
; void *memcpy(void *dst, const void *src, size_t length)
;
; Copy a block of memory in a forward direction.
;

        ALIGN 32
        LEAF_ENTRY memcpy

        ALTERNATE_ENTRY __memcpy_forward_new

        pld     [r1]                                ; preload the first cache line
        cmp     r2, #16                             ; less than 16 bytes?
        mov     r3, r0                              ; use r3 as our destination
        bhs     CpyLrge                             ; if 16 or more, take the large copy path

CpySmal tbb     [pc, r2]                            ; branch to specialized bits for small copies

__SwitchTable1_Copy
CTable
        dcb     (Copy0 - CTable) / 2                ; 0B
        dcb     (Copy1 - CTable) / 2                ; 1B
        dcb     (Copy2 - CTable) / 2                ; 2B
        dcb     (Copy3 - CTable) / 2                ; 3B
        dcb     (Copy4 - CTable) / 2                ; 4B
        dcb     (Copy5 - CTable) / 2                ; 5B
        dcb     (Copy6 - CTable) / 2                ; 6B
        dcb     (Copy7 - CTable) / 2                ; 7B
        dcb     (Copy8 - CTable) / 2                ; 8B
        dcb     (Copy9 - CTable) / 2                ; 9B
        dcb     (Copy10 - CTable) / 2               ; 10B
        dcb     (Copy11 - CTable) / 2               ; 11B
        dcb     (Copy12 - CTable) / 2               ; 12B
        dcb     (Copy13 - CTable) / 2               ; 13B
        dcb     (Copy14 - CTable) / 2               ; 14B
        dcb     (Copy15 - CTable) / 2               ; 15B
__SwitchTableEnd_Copy

Copy1   ldrb    r2, [r1]
        strb    r2, [r3]
Copy0   bx      lr

Copy2   ldrh    r2, [r1]
        strh    r2, [r3]
        bx      lr

Copy3   ldrh    r2, [r1]
        ldrb    r1, [r1, #2]
        strh    r2, [r3]
        strb    r1, [r3, #2]
        bx      lr

Copy4   ldr     r2, [r1]
        str     r2, [r3]
        bx      lr

Copy5   ldr     r2, [r1]
        ldrb    r1, [r1, #4]
        str     r2, [r3]
        strb    r1, [r3, #4]
        bx      lr

Copy6   ldr     r2, [r1]
        ldrh    r1, [r1, #4]
        str     r2, [r3]
        strh    r1, [r3, #4]
        bx      lr

Copy7   ldr     r12, [r1]
        ldrh    r2, [r1, #4]
        ldrb    r1, [r1, #6]
        str     r12, [r3]
        strh    r2, [r3, #4]
        strb    r1, [r3, #6]
        bx      lr

Copy8   ldr     r2, [r1]
        ldr     r1, [r1, #4]
        str     r2, [r3]
        str     r1, [r3, #4]
        bx      lr

Copy9   ldr     r12, [r1]
        ldr     r2, [r1, #4]
        ldrb    r1, [r1, #8]
        str     r12, [r3]
        str     r2, [r3, #4]
        strb    r1, [r3, #8]
        bx      lr

Copy10  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        ldrh    r1, [r1, #8]
        str     r12, [r3]
        str     r2, [r3, #4]
        strh    r1, [r3, #8]
        bx      lr

Copy11  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        str     r12, [r3]
        str     r2, [r3, #4]
        ldrh    r2, [r1, #8]
        ldrb    r1, [r1, #10]
        strh    r2, [r3, #8]
        strb    r1, [r3, #10]
        bx      lr

Copy12  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        ldr     r1, [r1, #8]
        str     r12, [r3]
        str     r2, [r3, #4]
        str     r1, [r3, #8]
        bx      lr

Copy13  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        str     r12, [r3]
        str     r2, [r3, #4]
        ldr     r2, [r1, #8]
        ldrb    r1, [r1, #12]
        str     r2, [r3, #8]
        strb    r1, [r3, #12]
        bx      lr

Copy14  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        str     r12, [r3]
        str     r2, [r3, #4]
        ldr     r2, [r1, #8]
        ldrh    r1, [r1, #12]
        str     r2, [r3, #8]
        strh    r1, [r3, #12]
        bx      lr

Copy15  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        str     r12, [r3]
        str     r2, [r3, #4]
        ldr     r12, [r1, #8]
        ldrh    r2, [r1, #12]
        ldrb    r1, [r1, #14]
        str     r12, [r3, #8]
        strh    r2, [r3, #12]
        strb    r1, [r3, #14]
        bx      lr

CpyLrge
#if defined(_BOOTCRT_)
        b       __memcpy_forward_large_integer      ; always use integer in boot code
#else
        eor     r12, r0, r1                         ; see if src/dst are equally aligned
        tst     r12, #3                             ; at least to a 4 byte boundary
        bne     __memcpy_forward_large_neon         ; if not, always use NEON
        mov32   r12, __memcpy_forward_large_func    ; otherwise, load the large function pointer
        ldr     pc, [r12]                           ; and call it
#endif

        LEAF_END memcpy

;
; __memcpy_forward_large_integer (internal calling convention)
;
; Copy large (>= 16 bytes) blocks of memory in a forward direction,
; using integer registers only.
;
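;
; For orientation, the structure of this routine roughly matches the C
; sketch below (illustrative only; it is not assembled, and the helper
; name fwd_large_integer is invented for exposition):
;
;   static void *fwd_large_integer(void *dst, const void *src, size_t n)
;   {
;       unsigned char *d = dst;
;       const unsigned char *s = src;
;       if (((uintptr_t)d & 1) != 0) { *d++ = *s++; n -= 1; }  /* byte-align dest */
;       if (((uintptr_t)d & 2) != 0) {                         /* halfword-align dest */
;           memcpy(d, s, 2); d += 2; s += 2; n -= 2;
;       }
;       for (; n >= 32; n -= 32, d += 32, s += 32)             /* ldm/stm blocks */
;           memcpy(d, s, 32);
;       for (; n >= 8; n -= 8, d += 8, s += 8)                 /* ldrd/strd pairs */
;           memcpy(d, s, 8);
;       while (n-- > 0)                                        /* tail, like CpySmal */
;           *d++ = *s++;
;       return dst;
;   }
;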
        ALIGN 32
        NESTED_ENTRY __memcpy_forward_large_integer_wrapper

__memcpy_forward_large_integer
        PROLOG_NOP lsls r12, r3, #31                ; C = bit 1, N = bit 0
        PROLOG_PUSH {r4-r9, r11, lr}

;
; Align destination to a word boundary
;

        bpl     %F1
        ldrb    r4, [r1], #1                        ; fetch byte
        subs    r2, r2, #1                          ; decrement count
        strb    r4, [r3], #1                        ; store byte
        lsls    r12, r3, #31                        ; compute updated status
1
        bcc     %F2                                 ; if already aligned, just skip ahead
        ldrh    r4, [r1], #2                        ; fetch halfword
        subs    r2, r2, #2                          ; decrement count
        strh    r4, [r3], #2                        ; store halfword
2
        tst     r1, #3                              ; is the source now word-aligned?
        bne     %F20                                ; if not, we have to use the slow path

;
; Source is word-aligned; fast case
;

10
        subs    r2, r2, #32                         ; take 32 off the top
        blo     %F13                                ; if not enough, recover and do small copies
        subs    r2, r2, #32                         ; take off another 32
        pld     [r1, #32]                           ; pre-load one block ahead
        blo     %F12                                ; skip the loop if that's all we have
11
        pld     [r1, #64]                           ; prefetch ahead
        subs    r2, r2, #32                         ; count the bytes for this block
        ldm     r1!, {r4-r9, r12, lr}               ; load 32 bytes
        stm     r3!, {r4-r9, r12, lr}               ; store 32 bytes
        bhs     %B11                                ; keep going until we're done
12
        ldm     r1!, {r4-r9, r12, lr}               ; load 32 bytes
        stm     r3!, {r4-r9, r12, lr}               ; store 32 bytes
13
        adds    r2, r2, #(32 - 8)                   ; recover original count, and pre-decrement
        blo     %F15                                ; if not enough remaining, skip this loop
14
        subs    r2, r2, #8                          ; decrement count
        ldrd    r4, r5, [r1], #8                    ; fetch pair of words
        strd    r4, r5, [r3], #8                    ; store pair of words
        bhs     %B14                                ; loop while we still have data remaining
15
        adds    r2, r2, #8                          ; recover final count
        EPILOG_POP {r4-r9, r11, lr}
        EPILOG_NOP bne CpySmal                      ; if some left, continue with small
        EPILOG_RETURN                               ; else just return

;
; Source is not word-aligned; slow case
;

20
        subs    r2, r2, #64                         ; pre-decrement to simplify the loop
        blo     %F23                                ; skip over the loop if we don't have enough
        pld     [r1, #32]                           ; pre-load one block ahead
21
        pld     [r1, #64]                           ; prefetch ahead
        ldr     r4, [r1, #0]                        ; load 32 bytes
        ldr     r5, [r1, #4]                        ;
        ldr     r6, [r1, #8]                        ;
        ldr     r7, [r1, #12]                       ;
        ldr     r8, [r1, #16]                       ;
        ldr     r9, [r1, #20]                       ;
        ldr     r12, [r1, #24]                      ;
        ldr     lr, [r1, #28]                       ;
        adds    r1, r1, #32                         ; update pointer
        subs    r2, r2, #32                         ; count the bytes for this block
        stm     r3!, {r4-r9, r12, lr}               ; store 32 bytes
        bhs     %B21                                ; keep going until we're done
23
        adds    r2, r2, #(64 - 8)                   ; recover original count, and pre-decrement
        blo     %F25                                ; if not enough remaining, skip this loop
24
        ldr     r4, [r1]                            ; fetch pair of words
        ldr     r5, [r1, #4]                        ;
        adds    r1, r1, #8                          ; update pointer
        subs    r2, r2, #8                          ; decrement count
        strd    r4, r5, [r3], #8                    ; store pair of words
        bhs     %B24                                ; loop while we still have data remaining
25
        adds    r2, r2, #8                          ; recover final count
        EPILOG_POP {r4-r9, r11, lr}
        EPILOG_NOP bne CpySmal                      ; if some left, continue with small
        EPILOG_RETURN                               ; else just return

        NESTED_END __memcpy_forward_large_integer_wrapper

;
; __memcpy_forward_large_neon (internal calling convention)
;
; Copy large (>= 16 bytes) blocks of memory in a forward direction,
; using NEON registers.
;
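;
; The 32-byte block loop below corresponds to this NEON-intrinsics sketch
; (illustrative only; not assembled, and the helper name is invented):
;
;   #include <arm_neon.h>
;
;   static void fwd_neon_blocks(uint8_t *d, const uint8_t *s, size_t n)
;   {
;       for (; n >= 32; n -= 32, d += 32, s += 32) {
;           uint8x16_t lo = vld1q_u8(s);            /* vld1.8 {d0-d3}, [r1]! */
;           uint8x16_t hi = vld1q_u8(s + 16);
;           vst1q_u8(d, lo);                        /* vst1.8 {d0-d3}, [r3]! */
;           vst1q_u8(d + 16, hi);
;       }
;   }
;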
#if !defined(_BOOTCRT_)

        ALIGN 32
        NESTED_ENTRY __memcpy_forward_large_neon_wrapper

__memcpy_forward_large_neon
        PROLOG_PUSH {r4-r5, r11, lr}

        subs    r2, r2, #32                         ; pre-decrement to simplify the loop
        blo     %F13                                ; skip over the loop if we don't have enough
        subs    r2, r2, #32                         ; pre-decrement to simplify the loop
        pld     [r1, #32]                           ; pre-load one block ahead
        blo     %F12                                ; skip over the loop if we don't have enough
11
        pld     [r1, #64]                           ; prefetch ahead
        subs    r2, r2, #32                         ; count the bytes for this block
        vld1.8  {d0-d3}, [r1]!                      ; load 32 bytes
        vst1.8  {d0-d3}, [r3]!                      ; store 32 bytes
        bhs     %B11                                ; keep going until we're done
12
        vld1.8  {d0-d3}, [r1]!                      ; load 32 bytes
        vst1.8  {d0-d3}, [r3]!                      ; store 32 bytes
13
        adds    r2, r2, #(32 - 8)                   ; recover original count, and pre-decrement
        blo     %F15                                ; if not enough remaining, skip this loop
14
        ldr     r4, [r1]                            ; fetch pair of words
        ldr     r5, [r1, #4]                        ;
        adds    r1, r1, #8                          ; update pointer
        str     r4, [r3]                            ; store pair of words
        str     r5, [r3, #4]                        ;
        adds    r3, r3, #8                          ; update pointer
        subs    r2, r2, #8                          ; decrement count
        bhs     %B14                                ; loop while we still have data remaining
15
        adds    r2, r2, #8                          ; recover final count
        EPILOG_POP {r4-r5, r11, lr}
        EPILOG_NOP bne CpySmal                      ; if some left, continue with small
        EPILOG_RETURN                               ; else just return

        NESTED_END __memcpy_forward_large_neon_wrapper

#endif

;
; void *memmove(void *dst, const void *src, size_t length)
;
; Copy a block of memory in a forward or reverse direction, ensuring that
; overlapping source/destination regions are copied correctly.
;

        ALIGN 32
        LEAF_ENTRY memmove

        subs    r3, r0, r1                          ; compute dest - source
        cmp     r3, r2                              ; compare against size
        bhs     memcpy                              ; if no overlap, we can just do memcpy

        ALTERNATE_ENTRY __memcpy_reverse_new

        cmp     r2, #16                             ; less than 16 bytes?
        pld     [r1]                                ; preload the first cache line
        bhs     MovLrge                             ; if 16 or more, take the large move path

MovSmal tbb     [pc, r2]                            ; branch to specialized bits for small copies

__SwitchTable1_Move
MTable
        dcb     (Move0 - MTable) / 2                ; 0B
        dcb     (Move1 - MTable) / 2                ; 1B
        dcb     (Move2 - MTable) / 2                ; 2B
        dcb     (Move3 - MTable) / 2                ; 3B
        dcb     (Move4 - MTable) / 2                ; 4B
        dcb     (Move5 - MTable) / 2                ; 5B
        dcb     (Move6 - MTable) / 2                ; 6B
        dcb     (Move7 - MTable) / 2                ; 7B
        dcb     (Move8 - MTable) / 2                ; 8B
        dcb     (Move9 - MTable) / 2                ; 9B
        dcb     (Move10 - MTable) / 2               ; 10B
        dcb     (Move11 - MTable) / 2               ; 11B
        dcb     (Move12 - MTable) / 2               ; 12B
        dcb     (Move13 - MTable) / 2               ; 13B
        dcb     (Move14 - MTable) / 2               ; 14B
        dcb     (Move15 - MTable) / 2               ; 15B
__SwitchTableEnd_Move

Move1   ldrb    r2, [r1]
        strb    r2, [r0]
Move0   bx      lr

Move2   ldrh    r2, [r1]
        strh    r2, [r0]
        bx      lr

Move3   ldrh    r2, [r1]
        ldrb    r1, [r1, #2]
        strh    r2, [r0]
        strb    r1, [r0, #2]
        bx      lr

Move4   ldr     r2, [r1]
        str     r2, [r0]
        bx      lr

Move5   ldr     r2, [r1]
        ldrb    r1, [r1, #4]
        str     r2, [r0]
        strb    r1, [r0, #4]
        bx      lr

Move6   ldr     r2, [r1]
        ldrh    r1, [r1, #4]
        str     r2, [r0]
        strh    r1, [r0, #4]
        bx      lr

Move7   ldr     r3, [r1]
        ldrh    r2, [r1, #4]
        ldrb    r1, [r1, #6]
        str     r3, [r0]
        strh    r2, [r0, #4]
        strb    r1, [r0, #6]
        bx      lr

Move8   ldr     r2, [r1]
        ldr     r1, [r1, #4]
        str     r2, [r0]
        str     r1, [r0, #4]
        bx      lr

Move9   ldr     r3, [r1]
        ldr     r2, [r1, #4]
        ldrb    r1, [r1, #8]
        str     r3, [r0]
        str     r2, [r0, #4]
        strb    r1, [r0, #8]
        bx      lr

Move10  ldr     r3, [r1]
        ldr     r2, [r1, #4]
        ldrh    r1, [r1, #8]
        str     r3, [r0]
        str     r2, [r0, #4]
        strh    r1, [r0, #8]
        bx      lr

Move11  ldr     r12, [r1]
        ldr     r3, [r1, #4]
        ldrh    r2, [r1, #8]
        ldrb    r1, [r1, #10]
        str     r12, [r0]
        str     r3, [r0, #4]
        strh    r2, [r0, #8]
        strb    r1, [r0, #10]
        bx      lr

Move12  ldr     r12, [r1]
        ldr     r2, [r1, #4]
        ldr     r1, [r1, #8]
        str     r12, [r0]
        str     r2, [r0, #4]
        str     r1, [r0, #8]
        bx      lr

Move13  ldr     r12, [r1]
        ldr     r3, [r1, #4]
        ldr     r2, [r1, #8]
        ldrb    r1, [r1, #12]
        str     r12, [r0]
        str     r3, [r0, #4]
        str     r2, [r0, #8]
        strb    r1, [r0, #12]
        bx      lr

Move14  ldr     r12, [r1]
        ldr     r3, [r1, #4]
        ldr     r2, [r1, #8]
        ldrh    r1, [r1, #12]
        str     r12, [r0]
        str     r3, [r0, #4]
        str     r2, [r0, #8]
        strh    r1, [r0, #12]
        bx      lr

Move15  ldrh    r3, [r1, #12]
        ldrb    r2, [r1, #14]
        strh    r3, [r0, #12]
        strb    r2, [r0, #14]
        ldr     r3, [r1]
        ldr     r2, [r1, #4]
        ldr     r1, [r1, #8]
        str     r3, [r0]
        str     r2, [r0, #4]
        str     r1, [r0, #8]
        bx      lr

MovLrge
#if defined(_BOOTCRT_)
        b       __memcpy_reverse_large_integer      ; always use integer in boot code
#else
        eor     r12, r0, r1                         ; see if src/dst are equally aligned
        tst     r12, #3                             ; at least to a 4 byte boundary
        bne     __memcpy_reverse_large_neon         ; if not, always use NEON
        mov32   r12, __memcpy_reverse_large_func    ; otherwise, load the large function pointer
        ldr     pc, [r12]                           ; and call it
#endif

        LEAF_END memmove
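;
; The overlap test at the top of memmove relies on unsigned wraparound:
; one subtract and one compare decide whether a forward copy is safe.
; Equivalent C sketch (illustrative only; not assembled):
;
;   int forward_copy_is_safe(const char *dst, const char *src, size_t n)
;   {
;       /* dst - src wraps modulo 2^32; if the result is >= n, dst does
;          not fall within the n source bytes, so memcpy is safe. */
;       return (uintptr_t)(dst - src) >= n;
;   }
;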
;
; __memcpy_reverse_large_integer (internal calling convention)
;
; Copy large (>= 16 bytes) blocks of memory in a reverse direction,
; using integer registers only.
;

        ALIGN 32
        NESTED_ENTRY __memcpy_reverse_large_integer_wrapper

__memcpy_reverse_large_integer
        PROLOG_NOP adds r3, r0, r2                  ; advance destination to end
        PROLOG_NOP adds r1, r1, r2                  ; advance source to end
        PROLOG_NOP lsls r12, r3, #31                ; C = bit 1, N = bit 0
        PROLOG_NOP pld [r1, #-32]                   ; pre-load one block ahead
        PROLOG_PUSH {r4-r9, r11, lr}

;
; Align destination to a word boundary
;

        bpl     %F1
        ldrb    r4, [r1, #-1]!                      ; fetch byte
        subs    r2, r2, #1                          ; decrement count
        strb    r4, [r3, #-1]!                      ; store byte
        lsls    r12, r3, #31                        ; compute updated status
1
        bcc     %F2                                 ; if already aligned, just skip ahead
        ldrh    r4, [r1, #-2]!                      ; fetch halfword
        subs    r2, r2, #2                          ; decrement count
        strh    r4, [r3, #-2]!                      ; store halfword
2
        tst     r1, #3                              ; is the source now word-aligned?
        bne     %F20                                ; if not, we have to use the slow path

;
; Source is word-aligned; fast case
;

10
        subs    r2, r2, #32                         ; pre-decrement to simplify the loop
        blo     %F13                                ; skip over the loop if we don't have enough
        subs    r2, r2, #32                         ; pre-decrement to simplify the loop
        pld     [r1, #-64]                          ; pre-load one block ahead
        blo     %F12                                ; skip over the loop if we don't have enough
11
        pld     [r1, #-96]                          ; prefetch ahead
        subs    r2, r2, #32                         ; count the bytes for this block
        ldmdb   r1!, {r4-r9, r12, lr}               ; load 32 bytes
        stmdb   r3!, {r4-r9, r12, lr}               ; store 32 bytes
        bhs     %B11                                ; keep going until we're done
12
        ldmdb   r1!, {r4-r9, r12, lr}               ; load 32 bytes
        stmdb   r3!, {r4-r9, r12, lr}               ; store 32 bytes
13
        adds    r2, r2, #(32 - 8)                   ; recover original count, and pre-decrement
        blo     %F15                                ; if not enough remaining, skip this loop
14
        subs    r2, r2, #8                          ; decrement count
        ldrd    r4, r5, [r1, #-8]!                  ; fetch pair of words
        strd    r4, r5, [r3, #-8]!                  ; store pair of words
        bhs     %B14                                ; loop while we still have data remaining
15
        adds    r2, r2, #8                          ; determine final count
        subs    r1, r1, r2                          ; recover original source
        EPILOG_POP {r4-r9, r11, lr}
        EPILOG_NOP bne MovSmal                      ; if some left, continue with small
        EPILOG_RETURN                               ; else just return

;
; Source is not word-aligned; slow case
;

20
        subs    r2, r2, #64                         ; pre-decrement to simplify the loop
        blo     %F23                                ; skip over the loop if we don't have enough
        pld     [r1, #-64]                          ; pre-load one block ahead
21
        pld     [r1, #-96]                          ; prefetch ahead
        subs    r2, r2, #32                         ; count the bytes for this block
        ldr     r4, [r1, #-32]!                     ; load 32 bytes
        ldr     r5, [r1, #4]                        ;
        ldr     r6, [r1, #8]                        ;
        ldr     r7, [r1, #12]                       ;
        ldr     r8, [r1, #16]                       ;
        ldr     r9, [r1, #20]                       ;
        ldr     r12, [r1, #24]                      ;
        ldr     lr, [r1, #28]                       ;
        stmdb   r3!, {r4-r9, r12, lr}               ; store 32 bytes
        bhs     %B21                                ; keep going until we're done
23
        adds    r2, r2, #(64 - 8)                   ; recover original count, and pre-decrement
        blo     %F25                                ; if not enough remaining, skip this loop
24
        subs    r2, r2, #8                          ; decrement count
        ldr     r4, [r1, #-8]!                      ; fetch pair of words
        ldr     r5, [r1, #4]                        ;
        strd    r4, r5, [r3, #-8]!                  ; store pair of words
        bhs     %B24                                ; loop while we still have data remaining
25
        adds    r2, r2, #8                          ; determine final count
        subs    r1, r1, r2                          ; recover original source
        EPILOG_POP {r4-r9, r11, lr}
        EPILOG_NOP bne MovSmal                      ; if some left, continue with small
        EPILOG_RETURN                               ; else just return

        NESTED_END __memcpy_reverse_large_integer_wrapper

;
; __memcpy_reverse_large_neon (internal calling convention)
;
; Copy large (>= 16 bytes) blocks of memory in a reverse direction,
; using NEON registers.
;
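;
; Like the integer version above, this routine walks backward: both
; pointers are first advanced to the end of their buffers, then the copy
; proceeds downward, which keeps overlapping dst > src moves correct.
; C sketch (illustrative only; not assembled, helper name invented):
;
;   static void rev_copy(unsigned char *d, const unsigned char *s, size_t n)
;   {
;       d += n;                     /* advance destination to end */
;       s += n;                     /* advance source to end */
;       while (n-- > 0)
;           *--d = *--s;            /* pre-decrement, like the [r1, #-1]! forms */
;   }
;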
#if !defined(_BOOTCRT_)

        ALIGN 32
        NESTED_ENTRY __memcpy_reverse_large_neon_wrapper

__memcpy_reverse_large_neon
        PROLOG_NOP adds r3, r0, r2                  ; advance destination to end
        PROLOG_NOP adds r1, r1, r2                  ; advance source to end
        PROLOG_NOP lsls r12, r3, #31                ; C = bit 1, N = bit 0
        PROLOG_NOP pld [r1, #-32]                   ; pre-load one block ahead
        PROLOG_PUSH {r4-r5, r11, lr}

;
; Align destination to a word boundary
;

        bpl     %F1
        ldrb    r4, [r1, #-1]!                      ; fetch byte
        subs    r2, r2, #1                          ; decrement count
        strb    r4, [r3, #-1]!                      ; store byte
        lsls    r12, r3, #31                        ; compute updated status
1
        bcc     %F2                                 ; if already aligned, just skip ahead
        ldrh    r4, [r1, #-2]!                      ; fetch halfword
        subs    r2, r2, #2                          ; decrement count
        strh    r4, [r3, #-2]!                      ; store halfword
2

;
; Perform main copy
;

        subs    r2, r2, #32                         ; pre-decrement to simplify the loop
        blo     %F13                                ; skip over the loop if we don't have enough
        subs    r2, r2, #32                         ; pre-decrement to simplify the loop
        pld     [r1, #-64]                          ; pre-load one block ahead
        blo     %F12                                ; skip over the loop if we don't have enough
11
        pld     [r1, #-96]                          ; prefetch ahead
        subs    r1, r1, #32                         ; back up source pointer
        subs    r3, r3, #32                         ; back up destination pointer
        subs    r2, r2, #32                         ; count the bytes for this block
        vld1.8  {d0-d3}, [r1]                       ; load 32 bytes
        vst1.8  {d0-d3}, [r3]                       ; store 32 bytes
        bhs     %B11                                ; keep going until we're done
12
        subs    r1, r1, #32                         ; back up source pointer
        subs    r3, r3, #32                         ; back up destination pointer
        vld1.8  {d0-d3}, [r1]                       ; load 32 bytes
        vst1.8  {d0-d3}, [r3]                       ; store 32 bytes
13
        adds    r2, r2, #(32 - 8)                   ; recover original count, and pre-decrement
        blo     %F15                                ; if not enough remaining, skip this loop
14
        ldr     r4, [r1, #-8]!                      ; fetch pair of words
        ldr     r5, [r1, #4]                        ;
        subs    r2, r2, #8                          ; decrement count
        str     r4, [r3, #-8]!                      ; store pair of words
        str     r5, [r3, #4]                        ;
        bhs     %B14                                ; loop while we still have data remaining
15
        adds    r2, r2, #8                          ; determine final count
        subs    r1, r1, r2                          ; recover original source
        EPILOG_POP {r4-r5, r11, lr}
        EPILOG_NOP bne MovSmal                      ; if some left, continue with small
        EPILOG_RETURN                               ; else just return

        NESTED_END __memcpy_reverse_large_neon_wrapper

#endif

;
; __memcpy_decide (internal calling convention)
;
; Determine whether to use integer or NEON for future memcpy's.
;
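;
; Conceptually this resolver behaves like the C sketch below (illustrative
; only; not assembled; cpu_prefers_integer_copies stands in for the
; CPSR/MIDR/user-shared-data probes that follow, and the helper names are
; invented):
;
;   typedef void *(*large_copy_fn)(void *, const void *, size_t);
;
;   extern large_copy_fn __memcpy_forward_large_func;
;   extern large_copy_fn __memcpy_reverse_large_func;
;
;   static void memcpy_decide(void)
;   {
;       int integer = cpu_prefers_integer_copies();   /* hypothetical probe */
;       __memcpy_forward_large_func = integer ? fwd_large_integer : fwd_large_neon;
;       __memcpy_reverse_large_func = integer ? rev_large_integer : rev_large_neon;
;       /* later large copies jump straight through the updated pointers */
;   }
;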
#if !defined(_BOOTCRT_)

        ALIGN 32
        NESTED_ENTRY __memcpy_decide_wrapper

__memcpy_decide
        PROLOG_PUSH {r4-r5, r11, lr}

;
; We want to use integer memcpy's on the A9, which has an external cache.
;
; First determine if we're in user or kernel mode. Reading CPSR
; from user mode will either return the proper 5 mode bits, or all 0s.
; Conveniently, user mode is 0x10, and there is no mode 0x00, so if
; we read CPSR and the low 4 bits are 0, that's good enough.
;

        mrs     r4, cpsr                            ; get CPSR
        ands    r4, r4, #0xf                        ; isolate the low 4 bits of the mode
        beq     %F1                                 ; if 0, we're in user mode

;
; If we are in kernel mode, read the MIDR directly.
;

        CP_READ r4, CP15_MIDR                       ; read main ID register
        ubfx    r5, r4, #24, #8                     ; get implementer
        lsrs    r4, r4, #4                          ; shift off revision field
        cmp     r5, #0x41                           ; is implementer == ARM?
        bne     %F3                                 ; if not, use NEON
        bfc     r4, #12, #20                        ; clear upper bits
        ldr     r5, =0xc09                          ; A9 signature
        cmp     r4, r5                              ; is this an A9?
        bne     %F3                                 ; if not, use NEON
        b       %F2                                 ; otherwise, use integer

;
; If we are in user mode, check the "external cache available" flag
;

1
        ldr     r4, =MM_SHARED_USER_DATA_VA + UsProcessorFeatures + PF_ARM_EXTERNAL_CACHE_AVAILABLE
        ldrb    r4, [r4]                            ; get external cache bit
        cbz     r4, %F3                             ; if no external cache, do NEON

;
; Register for integer functions
;

2
        ldr     r4, =__memcpy_forward_large_integer ; select integer functions
        ldr     r5, =__memcpy_forward_large_func    ;
        str     r4, [r5]                            ;
        ldr     r4, =__memcpy_reverse_large_integer ; select integer functions
        ldr     r5, =__memcpy_reverse_large_func    ;
        str     r4, [r5]                            ;
        b       %F4

;
; Register for NEON functions
;

3
        ldr     r4, =__memcpy_forward_large_neon    ; select NEON functions
        ldr     r5, =__memcpy_forward_large_func    ;
        str     r4, [r5]                            ;
        ldr     r4, =__memcpy_reverse_large_neon    ; select NEON functions
        ldr     r5, =__memcpy_reverse_large_func    ;
        str     r4, [r5]                            ;

4
        EPILOG_POP {r4-r5, r11, lr}                 ; restore saved registers
        EPILOG_NOP ldr pc, [r12]                    ; jump to the appropriate target

        NESTED_END __memcpy_decide_wrapper

#endif

;
; void _memcpy_strict_align(void *dst, const void *src, size_t length)
;
; Copy a block of memory in a forward direction, only performing naturally-aligned
; accesses.
;

        ALIGN 32
        LEAF_ENTRY _memcpy_strict_align

;
; Verify alignment between source and destination
;

        sub     r3, r0, r1                          ; get relative alignment of source and destination
        cbz     r2, CopyExit                        ; exit if 0 count
        ands    r3, r3, #3                          ; check DWORD alignment
        bne     CopyMisalignedHalf                  ; misaligned

;
; Source and destination are equally aligned: just align the
; destination and the source will end up aligned as well
;

        tst     r0, #3                              ; dword aligned at the dest?
        beq     WordAligned_0                       ; if so, skip ahead
        tst     r0, #1                              ; halfword aligned at the dest?
        beq     HalfAligned_0                       ; if so, skip ahead

        subs    r2, r2, #1                          ; decrement count
        ldrb    r3, [r1], #1                        ; fetch byte
        strb    r3, [r0], #1                        ; store it
        beq     CopyExit                            ; stop if done
        tst     r0, #3                              ; word aligned now?
        beq     WordAligned_0                       ; if so, skip ahead

HalfAligned_0
        cmp     r2, #2                              ; do we have at least 2 bytes left?
        blo     CopyFinalBytes                      ; if not, copy bytes
        subs    r2, r2, #2                          ; decrement count
        ldrh    r3, [r1], #2                        ; fetch halfword
        strh    r3, [r0], #2                        ; store it
        beq     CopyExit                            ; stop if done

WordAligned_0
        subs    r2, r2, #4                          ; at least 4 bytes remaining?
        blt     WordLoopEnd_0                       ; if not, skip the main loop

WordLoop_0
        subs    r2, r2, #4                          ; decrement count
        ldr     r3, [r1], #4                        ; fetch word
        str     r3, [r0], #4                        ; store it
        bge     WordLoop_0                          ; loop until done

WordLoopEnd_0
        adds    r2, r2, #4                          ; recover the extra 4 we subtracted
        beq     CopyExit                            ; stop if that's everything

CopyFinalHalfwords
        subs    r2, r2, #2                          ; at least 2 bytes remaining?
        blt     CopyFinalHalfwordsEnd               ; if not, skip this

CopyFinalHalfwordsLoop
        subs    r2, r2, #2                          ; decrement count
        ldrh    r3, [r1], #2                        ; fetch halfword
        strh    r3, [r0], #2                        ; store it
        bge     CopyFinalHalfwordsLoop              ; loop until done

CopyFinalHalfwordsEnd
        adds    r2, r2, #2                          ; recover the extra 2 we subtracted
        beq     CopyExit                            ; stop if that's everything

CopyFinalBytes
        subs    r2, r2, #1                          ; decrement count
        ldrb    r3, [r1], #1                        ; fetch byte
        strb    r3, [r0], #1                        ; store it
        bne     CopyFinalBytes                      ; loop until done

CopyExit
        bx      lr                                  ; return
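;
; The misaligned loops below make only aligned word reads: a partial word
; is preloaded to align the source, then each new word is split between
; the current and next destination words with shifts and ORs. For the
; 2-byte case the recurrence is (C sketch, illustrative only; not
; assembled; assumes little-endian, names invented):
;
;   static void copy_off_by_2(uint32_t *d, const uint32_t *s, size_t words,
;                             uint32_t carry)   /* preloaded low halfword */
;   {
;       while (words-- > 0) {
;           uint32_t w = *s++;          /* aligned word fetch */
;           *d++ = carry | (w << 16);   /* low half completes this word */
;           carry = w >> 16;            /* high half starts the next */
;       }
;   }
;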
;
; Source and destination are misaligned by 2 bytes
;

CopyMisalignedHalf
        cmp     r3, #2                              ; misaligned by a halfword?
        bne     CopyMisalignedByte                  ; if not, skip

        tst     r0, #3                              ; dword aligned at the dest?
        beq     WordAligned_2                       ; if so, skip ahead
        tst     r0, #1                              ; halfword aligned at the dest?
        beq     HalfAligned_2                       ; if so, skip ahead

        subs    r2, r2, #1                          ; decrement count
        ldrb    r3, [r1], #1                        ; fetch byte
        strb    r3, [r0], #1                        ; store it
        beq     CopyExit                            ; stop if done
        tst     r0, #3                              ; word aligned now?
        beq     WordAligned_2                       ; if so, skip ahead

HalfAligned_2
        cmp     r2, #2                              ; do we have at least 2 bytes left?
        blo     CopyFinalBytes                      ; if not, copy bytes
        subs    r2, r2, #2                          ; decrement count
        ldrh    r3, [r1], #2                        ; fetch halfword
        strh    r3, [r0], #2                        ; store it
        beq     CopyExit                            ; stop if done

WordAligned_2
        subs    r2, r2, #6                          ; at least 6 bytes remaining?
        blt     WordLoopEnd_2                       ; if not, skip the main loop
        ldrh    r12, [r1], #2                       ; preload a halfword of source
        subs    r2, r2, #2                          ; count these 2 bytes

WordLoop_2
        subs    r2, r2, #4                          ; decrement count
        ldr     r3, [r1], #4                        ; fetch word
        orr     r12, r12, r3, lsl #16               ; copy low 16 bits to upper 16 of r12
        str     r12, [r0], #4                       ; store it
        lsr     r12, r3, #16                        ; copy upper 16 bits to lower 16 of r12
        bge     WordLoop_2                          ; loop until done
        strh    r12, [r0], #2                       ; store the extra halfword to the dest

WordLoopEnd_2
        adds    r2, r2, #6                          ; recover the extra 6 we subtracted
        beq     CopyExit                            ; stop if that's everything
        b       CopyFinalHalfwords                  ; otherwise, copy remainder

;
; Source and destination are misaligned by 1 byte
;

CopyMisalignedByte
        cmp     r3, #1                              ; misaligned by a byte?
        bne     CopyMisalignedByte3                 ; if not, skip

        tst     r0, #3                              ; dword aligned at the dest?
        beq     WordAligned_1                       ; if so, skip ahead

ByteAlign_1
        subs    r2, r2, #1                          ; decrement count
        ldrb    r3, [r1], #1                        ; fetch byte
        strb    r3, [r0], #1                        ; store it
        beq     CopyExit                            ; stop if done
        tst     r0, #3                              ; word aligned now?
        bne     ByteAlign_1                         ; if not, keep copying bytes

WordAligned_1
        subs    r2, r2, #5                          ; at least 5 bytes remaining?
        blt     WordLoopEnd_1                       ; if not, skip the main loop
        ldrb    r12, [r1], #1                       ; preload a byte of source
        subs    r2, r2, #1                          ; count this byte

WordLoop_1
        subs    r2, r2, #4                          ; decrement count
        ldr     r3, [r1], #4                        ; fetch word
        orr     r12, r12, r3, lsl #8                ; copy low 24 bits to upper 24 of r12
        str     r12, [r0], #4                       ; store it
        lsr     r12, r3, #24                        ; copy upper 8 bits to lower 8 of r12
        bge     WordLoop_1                          ; loop until done
        strb    r12, [r0], #1                       ; store the extra byte to the dest

WordLoopEnd_1
        adds    r2, r2, #5                          ; recover the extra 5 we subtracted
        beq     CopyExit                            ; stop if that's everything
        b       CopyFinalBytes                      ; otherwise, copy remainder

;
; Source and destination are misaligned by 3 bytes
;

CopyMisalignedByte3
        tst     r0, #3                              ; dword aligned at the dest?
        beq     WordAligned_3                       ; if so, skip ahead

ByteAlign_3
        subs    r2, r2, #1                          ; decrement count
        ldrb    r3, [r1], #1                        ; fetch byte
        strb    r3, [r0], #1                        ; store it
        beq     CopyExit                            ; stop if done
        tst     r0, #3                              ; word aligned now?
        bne     ByteAlign_3                         ; if not, keep copying bytes

WordAligned_3
        subs    r2, r2, #7                          ; at least 7 bytes remaining?
        blt     WordLoopEnd_3                       ; if not, skip the main loop
        ldrb    r12, [r1], #1                       ; preload a byte of source
        ldrh    r3, [r1], #2                        ; preload a halfword of source
        orr     r12, r12, r3, lsl #8                ; OR in the halfword
        subs    r2, r2, #3                          ; count these 3 bytes

WordLoop_3
        subs    r2, r2, #4                          ; decrement count
        ldr     r3, [r1], #4                        ; fetch word
        orr     r12, r12, r3, lsl #24               ; copy low 8 bits to upper 8 of r12
        str     r12, [r0], #4                       ; store it
        lsr     r12, r3, #8                         ; copy upper 24 bits to lower 24 of r12
        bge     WordLoop_3                          ; loop until done
        strh    r12, [r0], #2                       ; store the extra halfword to the dest
        lsr     r12, r12, #16                       ; down to the final byte
        strb    r12, [r0], #1                       ; store the extra byte to the dest

WordLoopEnd_3
        adds    r2, r2, #7                          ; recover the extra 7 we subtracted
        beq     CopyExit                            ; stop if that's everything
        b       CopyFinalBytes                      ; otherwise, copy remainder

        LEAF_END _memcpy_strict_align

        END