Diffstat (limited to 'src/vm/amd64/CrtHelpers.asm')
-rw-r--r--  src/vm/amd64/CrtHelpers.asm | 734
1 file changed, 268 insertions(+), 466 deletions(-)
diff --git a/src/vm/amd64/CrtHelpers.asm b/src/vm/amd64/CrtHelpers.asm
index 6ec6e4d2a9..9d5b280558 100644
--- a/src/vm/amd64/CrtHelpers.asm
+++ b/src/vm/amd64/CrtHelpers.asm
@@ -13,48 +13,19 @@
; ***********************************************************************
include AsmMacros.inc
-include asmconstants.inc
-; JIT_MemSet/JIT_MemCpy
-;
-; It is IMPORANT that the exception handling code is able to find these guys
-; on the stack, but to keep them from being tailcalled by VC++ we need to turn
-; off optimization and it ends up being a wasteful implementation.
-;
-; Hence these assembly helpers.
-;
-
-
-;***
-;memset.asm - set a section of memory to all one byte
-;
-; Licensed to the .NET Foundation under one or more agreements.
-; The .NET Foundation licenses this file to you under the MIT license.
-; See the LICENSE file in the project root for more information.;
-;
-;*******************************************************************************
-
-;***
;char *memset(dst, value, count) - sets "count" bytes at "dst" to "value"
;
;Purpose:
; Sets the first "count" bytes of the memory starting
; at "dst" to the character value "value".
;
-; Algorithm:
-; char *
-; memset (dst, value, count)
-; char *dst;
-; char value;
-; unsigned int count;
-; {
-; char *start = dst;
-;
-; while (count--)
-; *dst++ = value;
-; return(start);
-; }
-;
+;Algorithm:
+;Set dst based on count as follows:
+; count [0, 16]: use 1/2/4/8-byte wide registers
+; count (16, 128]: use 16-byte wide XMM registers, no loop
+; count (128, 512]: use 16-byte wide XMM registers in a loop, unrolled 8 times
+; count (512, upper): use rep stosb
;Entry:
; char *dst - pointer to memory to fill with value
; char value - value to put in dst bytes
@@ -69,460 +40,291 @@ include asmconstants.inc
;
;*******************************************************************************
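+;
+; A minimal C sketch of the size dispatch described above (illustrative
+; only; choose_path and the enum names are hypothetical):
+;
+;   #include <stddef.h>
+;   typedef enum { FILL_SCALAR, FILL_XMM, FILL_XMM_LOOP, FILL_REP_STOSB } fill_path;
+;   static fill_path choose_path(size_t count)
+;   {
+;       if (count <= 16)  return FILL_SCALAR;   /* 1/2/4/8-byte stores  */
+;       if (count <= 128) return FILL_XMM;      /* straight-line XMM    */
+;       if (count <= 512) return FILL_XMM_LOOP; /* 8x-unrolled XMM loop */
+;       return FILL_REP_STOSB;                  /* rep stosb            */
+;   }
+;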
-CACHE_LIMIT_MEMSET equ 070000h ; limit for nontemporal fill
-
LEAF_ENTRY JIT_MemSet, _TEXT
- mov rax, rcx ; save destination address
- cmp r8, 8 ; check if 8 bytes to fill
- jb short mset40 ; if b, less than 8 bytes to fill
movzx edx, dl ; set fill pattern
- mov r9, 0101010101010101h ; replicate fill over 8 bytes
- imul rdx, r9 ;
- cmp r8, 64 ; check if 64 bytes to fill
- jb short mset20 ; if b, less than 64 bytes
-
-;
-; Large block - fill alignment bytes.
-;
-
-mset00: neg rcx ; compute bytes to alignment
- and ecx, 7 ;
- jz short mset10 ; if z, no alignment required
- sub r8, rcx ; adjust remaining bytes by alignment
- mov [rax], rdx ; fill alignment bytes
-mset10: add rcx, rax ; compute aligned destination address
-
-;
-; Attempt to fill 64-byte blocks
-;
-
- mov r9, r8 ; copy count of bytes remaining
- and r8, 63 ; compute remaining byte count
- shr r9, 6 ; compute number of 64-byte blocks
- test r9, r9 ; remove partial flag stall caused by shr
- jnz short mset70 ; if nz, 64-byte blocks to fill
-
-;
-; Fill 8-byte bytes.
-;
-
-mset20: mov r9, r8 ; copy count of bytes remaining
- and r8, 7 ; compute remaining byte count
- shr r9, 3 ; compute number of 8-byte blocks
- test r9, r9 ; remove partial flag stall caused by shr
- jz short mset40 ; if z, no 8-byte blocks
-
- align ; simpler way to align instrucitons
-
-mset30: mov [rcx], rdx ; fill 8-byte blocks
- add rcx, 8 ; advance to next 8-byte block
- dec r9 ; decrement loop count
- jnz short mset30 ; if nz, more 8-byte blocks
-
-;
-; Fill residual bytes.
-;
-
-mset40: test r8, r8 ; test if any bytes to fill
- jz short mset60 ; if z, no bytes to fill
-mset50: mov [rcx], dl ; fill byte
- inc rcx ; advance to next byte
- dec r8 ; decrement loop count
- jnz short mset50 ; if nz, more bytes to fill
-mset60:
- ; for some reason the assembler doesn't like the REPRET macro on the same line as a label
- REPRET ; return
-
-;
-; Fill 64-byte blocks.
-;
-
- align 16
-
- db 066h, 066h, 066h, 090h
- db 066h, 066h, 090h
-
-mset70: cmp r9, CACHE_LIMIT_MEMSET / 64 ; check if large fill
- jae short mset90 ; if ae, large fill
-mset80: mov [rcx], rdx ; fill 64-byte block
- mov 8[rcx], rdx ;
- mov 16[rcx], rdx ;
- add rcx, 64 ; advance to next block
- mov (24 - 64)[rcx], rdx ;
- mov (32 - 64)[rcx], rdx ;
- dec r9 ; decrement loop count
- mov (40 - 64)[rcx], rdx ;
- mov (48 - 64)[rcx], rdx ;
- mov (56 - 64)[rcx], rdx ;
- jnz short mset80 ; if nz, more 64-byte blocks
- jmp short mset20 ; finish in common code
-
-;
-; Fill 64-byte blocks nontemporal.
-;
-
- align
-
-mset90: movnti [rcx], rdx ; fill 64-byte block
- movnti 8[rcx], rdx ;
- movnti 16[rcx], rdx ;
- add rcx, 64 ; advance to next block
- movnti (24 - 64)[rcx], rdx ;
- movnti (32 - 64)[rcx], rdx ;
- dec r9 ; decrement loop count
- movnti (40 - 64)[rcx], rdx ;
- movnti (48 - 64)[rcx], rdx ;
- movnti (56 - 64)[rcx], rdx ;
- jnz short mset90 ; if nz, move 64-byte blocks
- lock or byte ptr [rsp], 0 ; flush data to memory
- jmp mset20 ; finish in common code
+ mov r9, 0101010101010101h
+ imul rdx, r9 ; rdx is the 8-byte fill pattern
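+;
+; Illustrative note (a sketch, not part of this change): multiplying the
+; zero-extended byte by 0101010101010101h broadcasts it into all eight
+; byte lanes; the same trick in C:
+;
+;   #include <stdint.h>
+;   static uint64_t fill8(uint8_t value)      /* fill8 is a hypothetical name */
+;   {
+;       return 0x0101010101010101ull * value; /* vv vv vv vv vv vv vv vv */
+;   }
+;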
+
+ cmp r8, 16
+ jbe mset04
+
+ cmp r8, 512
+ jbe mset00
+
+ ; count > 512
+ mov r10, rcx ; save dst address
+ mov r11, rdi ; save rdi
+ mov eax, edx ; eax is value
+ mov rdi, rcx ; rdi is dst
+ mov rcx, r8 ; rcx is count
+ rep stosb
+ mov rdi, r11 ; restore rdi
+ mov rax, r10
+ ret
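+;
+; Illustrative note: this sequence is what the MSVC __stosb intrinsic
+; compiles to; a hedged C equivalent (fill_large is a hypothetical name):
+;
+;   #include <intrin.h>
+;   #include <stddef.h>
+;   static void fill_large(unsigned char *dst, unsigned char value, size_t count)
+;   {
+;       __stosb(dst, value, count);   /* rep stosb */
+;   }
+;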
+
+ align 16
+mset00: mov rax, rcx ; save dst address
+ movd xmm0, rdx
+ punpcklbw xmm0, xmm0 ; xmm0 is 16 bytes filler
+
+ cmp r8, 128
+ jbe mset02
+
+ ; count > 128 && count <= 512
+ mov r9, r8
+ shr r9, 7 ; count/128
+
+ align 16
+mset01: movdqu [rcx], xmm0
+ movdqu 16[rcx], xmm0
+ movdqu 32[rcx], xmm0
+ movdqu 48[rcx], xmm0
+ movdqu 64[rcx], xmm0
+ movdqu 80[rcx], xmm0
+ movdqu 96[rcx], xmm0
+ movdqu 112[rcx], xmm0
+ add rcx, 128
+ dec r9
+ jnz mset01
+ and r8, 7fh ; and r8 with 0111 1111
+
+ ; the remainder is from 0 to 127
+ cmp r8, 16
+ jnbe mset02
+
+ ; the remainder <= 16
+ movdqu -16[rcx + r8], xmm0
+ ret
+
+ ; count > 16 && count <= 128 (mset02 also finishes remainders > 16 from mset01)
+ align 16
+mset02: movdqu [rcx], xmm0
+ movdqu -16[rcx + r8], xmm0
+ cmp r8, 32
+ jbe mset03
+
+ ; count > 32 && count <= 64
+ movdqu 16[rcx], xmm0
+ movdqu -32[rcx + r8], xmm0
+ cmp r8, 64
+ jbe mset03
+
+ ; count > 64 && count <= 128
+ movdqu 32[rcx], xmm0
+ movdqu 48[rcx], xmm0
+ movdqu -48[rcx + r8], xmm0
+ movdqu -64[rcx + r8], xmm0
+mset03: ret
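+;
+; A hedged C sketch of the overlapping-store trick above (fill_17_to_32 is
+; a hypothetical helper; assumes SSE2 intrinsics): the head and tail stores
+; overlap in the middle, so no scalar tail loop is needed.
+;
+;   #include <emmintrin.h>
+;   #include <stddef.h>
+;   static void fill_17_to_32(unsigned char *d, unsigned char v, size_t count)
+;   {
+;       __m128i f = _mm_set1_epi8((char)v);
+;       _mm_storeu_si128((__m128i *)d, f);                /* first 16 bytes */
+;       _mm_storeu_si128((__m128i *)(d + count - 16), f); /* last 16 bytes  */
+;   }
+;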
+
+ align 16
+mset04: mov rax, rcx ; save dst address
+ test r8b, 24 ; and r8b with 0001 1000
+ jz mset05
+
+ ; count >= 8 && count <= 16
+ mov [rcx], rdx
+ mov -8[rcx + r8], rdx
+ ret
+
+ align 16
+mset05: test r8b, 4 ; and r8b with 0100
+ jz mset06
+
+ ; count >= 4 && count < 8
+ mov [rcx], edx
+ mov -4[rcx + r8], edx
+ ret
+
+ ; count >= 0 && count < 4
+ align 16
+mset06: test r8b, 1 ; and r8b with 0001
+ jz mset07
+ mov [rcx], dl
+mset07: test r8b, 2 ; and r8b with 0010
+ jz mset08
+ mov -2[rcx + r8], dx
+mset08: ret
LEAF_END_MARKED JIT_MemSet, _TEXT
-;*******************************************************************************
-; This ensures that atomic updates of aligned fields will stay atomic.
-;***
;JIT_MemCpy - Copy source buffer to destination buffer
;
;Purpose:
-;JIT_MemCpy - Copy source buffer to destination buffer
-;
-;Purpose:
-; JIT_MemCpy() copies a source memory buffer to a destination memory
-; buffer. This routine recognize overlapping buffers to avoid propogation.
-; For cases where propogation is not a problem, memcpy() can be used.
+; JIT_MemCpy() copies a source memory buffer to a destination memory
+; buffer. This routine recognizes overlapping buffers to avoid propagation.
+; For cases where propagation is not a problem, memcpy() can be used.
+;
+;Algorithm:
+;Copy to destination based on count as follows:
+; count [0, 64]: overlap check not needed
+;   count [0, 16]: use 1/2/4/8-byte wide registers
+;   count (16, 64]: use 16-byte wide XMM registers, no loop
+; count (64, upper): check overlap
+;   non-overlapping:
+;     count (64, 512]: use 16-byte wide XMM registers in a loop, unrolled 4 times
+;     count (512, upper): use rep movsb
+;   overlapping:
+;     use 16-byte wide XMM registers in a loop, copying from end to beginning
;
;Entry:
-; void *dst = pointer to destination buffer
-; const void *src = pointer to source buffer
-; size_t count = number of bytes to copy
+; void *dst = pointer to destination buffer
+; const void *src = pointer to source buffer
+; size_t count = number of bytes to copy
;
;Exit:
-; Returns a pointer to the destination buffer in AX/DX:AX
+; Returns a pointer to the destination buffer
;
;Uses:
-; CX, DX
;
;Exceptions:
;*******************************************************************************
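+;
+; A hedged C sketch of the overlap test used by mcpy07/mcpy11 below
+; (can_copy_forward is a hypothetical name; addresses are compared as
+; integers to keep the sketch portable):
+;
+;   #include <stdint.h>
+;   #include <stddef.h>
+;   static int can_copy_forward(uintptr_t dst, uintptr_t src, size_t count)
+;   {
+;       /* forward copy is safe if src does not precede dst, or if the
+;          two regions do not overlap at all */
+;       return src >= dst || src + count <= dst;
+;   }
+;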
-; This ensures that atomic updates of aligned fields will stay atomic.
-
-CACHE_LIMIT_MEMMOV equ 040000h ; limit for nontemporal fill
-CACHE_BLOCK equ 01000h ; nontemporal move block size
-
LEAF_ENTRY JIT_MemCpy, _TEXT
- mov r11, rcx ; save destination address
- sub rdx, rcx ; compute offset to source buffer
- jb mmov10 ; if b, destination may overlap
- cmp r8, 8 ; check if 8 bytes to move
- jb short mcpy40 ; if b, less than 8 bytes to move
-
-;
-; Move alignment bytes.
-;
-
- test cl, 7 ; test if destination aligned
- jz short mcpy20 ; if z, destination aligned
- test cl, 1 ; test if byte move needed
- jz short mcpy00 ; if z, byte move not needed
- mov al, [rcx + rdx] ; move byte
- dec r8 ; decrement byte count
- mov [rcx], al ;
- inc rcx ; increment destination address
-mcpy00: test cl, 2 ; test if word move needed
- jz short mcpy10 ; if z, word move not needed
- mov ax, [rcx + rdx] ; move word
- sub r8, 2 ; reduce byte count
- mov [rcx], ax ;
- add rcx, 2 ; advance destination address
-mcpy10: test cl, 4 ; test if dword move needed
- jz short mcpy20 ; if z, dword move not needed
- mov eax, [rcx + rdx] ; move dword
- sub r8, 4 ; reduce byte count
- mov [rcx], eax ;
- add rcx, 4 ; advance destination address
-
-;
-; Attempt to move 32-byte blocks.
-;
-
-mcpy20: mov r9, r8 ; copy count of bytes remaining
- shr r9, 5 ; compute number of 32-byte blocks
- test r9, r9 ; v-liti, remove partial flag stall caused by shr
- jnz short mcpy60 ; if nz, 32-byte blocks to fill
-
- align
-;
-; Move 8-byte blocks.
-;
-
-mcpy25: mov r9, r8 ; copy count of bytes remaining
- shr r9, 3 ; compute number of 8-byte blocks
- test r9, r9 ; v-liti, remove partial flag stall caused by shr
- jz short mcpy40 ; if z, no 8-byte blocks
- align
-
-mcpy30: mov rax, [rcx + rdx] ; move 8-byte blocks
- mov [rcx], rax ;
- add rcx, 8 ; advance destination address
- dec r9 ; decrement loop count
- jnz short mcpy30 ; if nz, more 8-byte blocks
- and r8, 7 ; compute remaining byte count
-
-;
-; Test for residual bytes.
-;
-
-mcpy40: test r8, r8 ; test if any bytes to move
- jnz short mcpy50 ; if nz, residual bytes to move
- mov rax, r11 ; set destination address
- ret ;
-
-;
-; Move residual bytes.
-;
-
- align
-
-mcpy50: mov al, [rcx + rdx] ; move byte
- mov [rcx], al ;
- inc rcx ; increment destiantion address
- dec r8 ; decrement loop count
- jnz short mcpy50 ; if nz, more bytes to fill
- mov rax, r11 ; set destination address
- ret ; return
+ mov rax, rcx ; save dst address
+ cmp r8, 16
+ jbe mcpy02
+
+ cmp r8, 64
+ jnbe mcpy07
+
+ ; count > 16 && count <= 64
+ align 16
+mcpy00: movdqu xmm0, [rdx] ; load first 16 bytes of src
+ movdqu xmm1, -16[rdx + r8] ; load last 16 bytes of src
+ cmp r8, 32
+ jbe mcpy01
+
+ movdqu xmm2, 16[rdx] ; load second 16 bytes of src
+ movdqu xmm3, -32[rdx + r8] ; load second-to-last 16 bytes of src
+
+ ;count > 32 && count <= 64
+ movdqu 16[rcx], xmm2
+ movdqu -32[rcx + r8], xmm3
+
+ ;count > 16 && count <= 32
+mcpy01: movdqu [rcx], xmm0
+ movdqu -16[rcx + r8], xmm1
+ ret
+
+ ; count <= 16
+ align 16
+mcpy02: test r8b, 24 ; test count with 0001 1000
+ jz mcpy03
+ ; count >= 8 && count <= 16
+ mov r9, [rdx]
+ mov r10, -8[rdx + r8]
+ mov [rcx], r9
+ mov -8[rcx + r8], r10
+ ret
+
+ align 16
+mcpy03: test r8b, 4 ; test count with 0100
+ jz mcpy04
+ ; count >= 4 && count < 8
+ mov r9d, [rdx]
+ mov r10d, -4[rdx + r8]
+ mov [rcx], r9d
+ mov -4[rcx + r8], r10d
+ ret
+
+ ; count >= 0 && count < 4
+ align 16
+mcpy04: test r8, r8 ; count == 0?
+ jz mcpy06 ; if z, nothing to copy; else count == 1/2/3
+ mov r9b, [rdx] ; save the first byte
+
+ test r8b, 2 ; test count with 0010
+ jz mcpy05
+ mov r10w, -2[rdx + r8]
+ mov -2[rcx + r8], r10w
+mcpy05: mov [rcx], r9b
+mcpy06: ret
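+;
+; A hedged C sketch of the load-both-then-store pattern above (copy_8_to_16
+; is a hypothetical helper): both loads complete before either store, so
+; the copy is correct even when the buffers overlap.
+;
+;   #include <stdint.h>
+;   #include <string.h>
+;   static void copy_8_to_16(unsigned char *dst, const unsigned char *src, size_t count)
+;   {
+;       uint64_t head, tail;
+;       memcpy(&head, src, 8);              /* first 8 bytes */
+;       memcpy(&tail, src + count - 8, 8);  /* last 8 bytes  */
+;       memcpy(dst, &head, 8);
+;       memcpy(dst + count - 8, &tail, 8);
+;   }
+;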
+
+ align 16
+ ; count > 64, we need to check overlap
+mcpy07: mov r9, rdx ; r9 is src address
+ sub r9, rcx ; r9 = src - dst
+ jb mcpy11 ; if src < dst, the buffers may overlap
+
+mcpy08: cmp r8, 512
+ jnbe mcpy10
+
+ ; count > 64 && count <= 512
+ mov r9, r8
+ shr r9, 6 ; count/64
+
+ align 16
+mcpy09: movdqu xmm0, [rdx]
+ movdqu xmm1, 16[rdx]
+ movdqu xmm2, 32[rdx]
+ movdqu xmm3, 48[rdx]
+ movdqu [rcx], xmm0
+ movdqu 16[rcx], xmm1
+ movdqu 32[rcx], xmm2
+ movdqu 48[rcx], xmm3
+ add rdx, 64
+ add rcx, 64
+ dec r9
+ jnz mcpy09
+
+ ; the remainder is from 0 to 63
+ and r8, 3fh ; and with 0011 1111
+ cmp r8, 16
+ jnbe mcpy00
+
+ ; the remainder <= 16
+ jmp mcpy02
+
+ ; count > 512
+ align 16
+mcpy10: mov r10, rdi ; save rdi
+ mov r11, rsi ; save rsi
+ mov rdi, rcx ; rdi is dst
+ mov rsi, rdx ; rsi is src
+ mov rcx, r8 ; rcx is count
+ rep movsb ; mov from rsi to rdi
+ mov rsi, r11 ; restore rsi
+ mov rdi, r10 ; restore rdi
+ ret
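+;
+; Illustrative note: this sequence matches the MSVC __movsb intrinsic;
+; a hedged C equivalent (copy_large is a hypothetical name):
+;
+;   #include <intrin.h>
+;   #include <stddef.h>
+;   static void copy_large(unsigned char *dst, const unsigned char *src, size_t count)
+;   {
+;       __movsb(dst, src, count);   /* rep movsb */
+;   }
+;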
-;
-; Move 32 byte blocks
-;
-
- align 16
-
- db 066h, 066h, 066h, 090h
- db 066h, 066h, 090h
-
-mcpy60: cmp r9, CACHE_LIMIT_MEMMOV / 32 ; check if large move
- jae short mcpy80 ; if ae, large move
-mcpy70: mov rax, [rcx + rdx] ; move 32-byte block
- mov r10, 8[rcx + rdx] ;
- add rcx, 32 ; advance destination address
- mov (-32)[rcx], rax ;
- mov (-24)[rcx], r10 ;
- mov rax, (-16)[rcx + rdx] ;
- mov r10, (-8)[rcx + rdx] ;
- dec r9 ;
- mov (-16)[rcx], rax ;
- mov (-8)[rcx], r10 ;
- jnz short mcpy70 ; if nz, more 32-byte blocks
- and r8, 31 ; compute remaining byte count
- jmp mcpy25 ;
-
-;
-; Move 64-byte blocks nontemporal.
-;
-
- align
-
- db 066h, 090h
-
-mcpy80: cmp rdx, CACHE_BLOCK ; check if cache block spacing
- jb short mcpy70 ; if b, not cache block spaced
-mcpy81: mov eax, CACHE_BLOCK / 128 ; set loop count
-mcpy85: prefetchnta [rcx + rdx] ; prefetch 128 bytes
- prefetchnta 64[rcx + rdx] ;
- add rcx, 128 ; advance source address
- dec eax ; decrement loop count
- jnz short mcpy85 ; if nz, more to prefetch
- sub rcx, CACHE_BLOCK ; reset source address
- mov eax, CACHE_BLOCK / 64 ; set loop count
-mcpy90: mov r9, [rcx + rdx] ; move 64-byte block
- mov r10, 8[rcx + rdx] ;
- movnti [rcx], r9 ;
- movnti 8[rcx], r10 ;
- mov r9, 16[rcx + rdx] ;
- mov r10, 24[rcx + rdx] ;
- movnti 16[rcx], r9 ;
- movnti 24[rcx], r10 ;
- mov r9, 32[rcx + rdx] ;
- mov r10, 40[rcx + rdx] ;
- add rcx, 64 ; advance destination address
- movnti (32 - 64)[rcx], r9 ;
- movnti (40 - 64)[rcx], r10 ;
- mov r9, (48 - 64)[rcx + rdx] ;
- mov r10, (56 - 64)[rcx + rdx] ;
- dec eax ;
- movnti (48 - 64)[rcx], r9 ;
- movnti (56 - 64)[rcx], r10 ;
- jnz short mcpy90 ; if nz, more 32-byte blocks
- sub r8, CACHE_BLOCK ; reduce remaining length
- cmp r8, CACHE_BLOCK ; check if cache block remains
- jae mcpy81 ; if ae, cache block remains
- lock or byte ptr [rsp], 0 ; flush data to memory
- jmp mcpy20 ;
-
-;
; The source address is less than the destination address.
-;
-
- align
-
- db 066h, 066h, 066h, 090h
- db 066h, 066h, 066h, 090h
- db 066h, 090h
-
-mmov10: add rcx, r8 ; compute ending destination address
- cmp r8, 8 ; check if 8 bytes to move
- jb short mmov60 ; if b, less than 8 bytes to move
-
-;
-; Move alignment bytes.
-;
-
- test cl, 7 ; test if destination aligned
- jz short mmov30 ; if z, destination aligned
- test cl, 1 ; test if byte move needed
- jz short mmov15 ; if z, byte move not needed
- dec rcx ; decrement destination address
- mov al, [rcx + rdx] ; move byte
- dec r8 ; decrement byte count
- mov [rcx], al ;
-mmov15: test cl, 2 ; test if word move needed
- jz short mmov20 ; if z, word move not needed
- sub rcx, 2 ; reduce destination address
- mov ax, [rcx + rdx] ; move word
- sub r8, 2 ; reduce byte count
- mov [rcx], ax ;
-mmov20: test cl, 4 ; test if dword move needed
- jz short mmov30 ; if z, dword move not needed
- sub rcx, 4 ; reduce destination address
- mov eax, [rcx + rdx] ; move dword
- sub r8, 4 ; reduce byte count
- mov [rcx], eax ;
-
-;
-; Attempt to move 32-byte blocks
-;
-
-mmov30: mov r9, r8 ; copy count of bytes remaining
- shr r9, 5 ; compute number of 32-byte blocks
- test r9, r9 ; v-liti, remove partial flag stall caused by shr
- jnz short mmov80 ; if nz, 32-byte blocks to fill
-
-;
-; Move 8-byte blocks.
-;
- align
-
-mmov40: mov r9, r8 ; copy count of bytes remaining
- shr r9, 3 ; compute number of 8-byte blocks
- test r9, r9 ; v-liti, remove partial flag stall caused by shr
- jz short mmov60 ; if z, no 8-byte blocks
-
- align
-
-mmov50: sub rcx, 8 ; reduce destination address
- mov rax, [rcx + rdx] ; move 8-byte blocks
- dec r9 ; decrement loop count
- mov [rcx], rax ;
- jnz short mmov50 ; if nz, more 8-byte blocks
- and r8, 7 ; compute remaining byte count
-
-;
-; Test for residual bytes.
-;
-
-mmov60: test r8, r8 ; test if any bytes to move
- jnz short mmov70 ; if nz, residual bytes to move
- mov rax, r11 ; set destination address
- ret ;
-
-;
-; Move residual bytes.
-;
-
- align
-mmov70: dec rcx ; decrement destination address
- mov al, [rcx + rdx] ; move byte
- dec r8 ; decrement loop count
- mov [rcx], al ;
- jnz short mmov70 ; if nz, more bytes to fill
- mov rax, r11 ; set destination address
- ret ; return
-
-;
-; Move 32 byte blocks
-;
-
- align 16
-
- db 066h, 066h, 066h, 090h
- db 066h, 066h, 090h
-
-mmov80: cmp r9, CACHE_LIMIT_MEMMOV / 32 ; check if large move
- jae short mmov93 ; if ae, large move
-mmov90: mov rax, (-8)[rcx + rdx] ; move 32-byte block
- mov r10, (-16)[rcx + rdx] ;
- sub rcx, 32 ; reduce destination address
- mov 24[rcx], rax ;
- mov 16[rcx], r10 ;
- mov rax, 8[rcx + rdx] ;
- mov r10, [rcx + rdx] ;
- dec r9 ;
- mov 8[rcx], rax ;
- mov [rcx], r10 ;
- jnz short mmov90 ; if nz, more 32-byte blocks
- and r8, 31 ; compute remaining byte count
- jmp mmov40 ;
-
-;
-; Move 64-byte blocks nontemporal.
-;
-
- align
-
- db 066h, 090h
-
-mmov93: cmp rdx, -CACHE_BLOCK ; check if cache block spacing
- ja short mmov90 ; if a, not cache block spaced
-mmov94: mov eax, CACHE_BLOCK / 128 ; set loop count
-mmov95: sub rcx, 128 ; reduce destination address
- prefetchnta [rcx + rdx] ; prefetch 128 bytes
- prefetchnta 64[rcx + rdx] ;
- dec eax ; decrement loop count
- jnz short mmov95 ; if nz, more to prefetch
- add rcx, CACHE_BLOCK ; reset source address
- mov eax, CACHE_BLOCK / 64 ; set loop count
-mmov97: mov r9, (-8)[rcx + rdx] ; move 64-byte block
- mov r10, (-16)[rcx + rdx] ;
- movnti (-8)[rcx], r9 ;
- movnti (-16)[rcx], r10 ;
- mov r9, (-24)[rcx + rdx] ;
- mov r10, (-32)[rcx + rdx] ;
- movnti (-24)[rcx], r9 ;
- movnti (-32)[rcx], r10 ;
- mov r9, (-40)[rcx + rdx] ;
- mov r10, (-48)[rcx + rdx] ;
- sub rcx, 64 ; reduce destination address
- movnti (64 - 40)[rcx], r9 ;
- movnti (64 - 48)[rcx], r10 ;
- mov r9, (64 - 56)[rcx + rdx] ;
- mov r10, (64 - 64)[rcx + rdx] ;
- dec eax ; decrement loop count
- movnti (64 - 56)[rcx], r9 ;
- movnti (64 - 64)[rcx], r10 ;
- jnz short mmov97 ; if nz, more 32-byte blocks
- sub r8, CACHE_BLOCK ; reduce remaining length
- cmp r8, CACHE_BLOCK ; check if cache block remains
- jae mmov94 ; if ae, cache block remains
- lock or byte ptr [rsp], 0 ; flush data to memory
- jmp mmov30 ;
+ align 16
+mcpy11: add r9, r8 ; r9 = src - dst + count
+ cmp r9, 0 ; if src + count <= dst, the buffers do not overlap
+ jle mcpy08 ; use the forward copy path
+
+ lea r9, [rdx + r8] ; r9 is the src + count
+ lea r10, [rcx + r8] ; r10 is the dst + count
+
+ mov r11, r8
+ shr r11, 6 ; count/64
+
+ ; count > 64
+ align 16
+mcpy12: movdqu xmm0, -16[r9]
+ movdqu xmm1, -32[r9]
+ movdqu xmm2, -48[r9]
+ movdqu xmm3, -64[r9]
+ movdqu -16[r10], xmm0
+ movdqu -32[r10], xmm1
+ movdqu -48[r10], xmm2
+ movdqu -64[r10], xmm3
+ sub r9, 64
+ sub r10, 64
+ dec r11
+ jnz mcpy12
+
+ ; the remainder is from 0 to 63
+ and r8, 3fh ; and with 0011 1111
+ cmp r8, 16
+ jnbe mcpy00
+
+ ; the remainder <= 16
+ jmp mcpy02
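+;
+; A hedged C sketch of the backward block copy above (copy_backward_blocks
+; is a hypothetical helper; assumes SSE2): reading each 64-byte block from
+; the end toward the beginning means source bytes are consumed before the
+; copy can overwrite them.
+;
+;   #include <emmintrin.h>
+;   #include <stddef.h>
+;   static void copy_backward_blocks(unsigned char *dst, const unsigned char *src, size_t count)
+;   {
+;       const unsigned char *s = src + count;
+;       unsigned char *d = dst + count;
+;       for (size_t blocks = count >> 6; blocks != 0; blocks--) {
+;           __m128i x0 = _mm_loadu_si128((const __m128i *)(s - 16));
+;           __m128i x1 = _mm_loadu_si128((const __m128i *)(s - 32));
+;           __m128i x2 = _mm_loadu_si128((const __m128i *)(s - 48));
+;           __m128i x3 = _mm_loadu_si128((const __m128i *)(s - 64));
+;           _mm_storeu_si128((__m128i *)(d - 16), x0);
+;           _mm_storeu_si128((__m128i *)(d - 32), x1);
+;           _mm_storeu_si128((__m128i *)(d - 48), x2);
+;           _mm_storeu_si128((__m128i *)(d - 64), x3);
+;           s -= 64; d -= 64;
+;       }
+;       /* the 0..63 byte remainder is finished by the forward paths */
+;   }
+;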
LEAF_END_MARKED JIT_MemCpy, _TEXT
-
-
- end
-
+ end
\ No newline at end of file