From 4b4aad7217d3292650e77eec2cf4c198ea9c3b4b Mon Sep 17 00:00:00 2001 From: Jiyoung Yun Date: Wed, 23 Nov 2016 19:09:09 +0900 Subject: Imported Upstream version 1.1.0 --- src/vm/amd64/CrtHelpers.asm | 528 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 528 insertions(+) create mode 100644 src/vm/amd64/CrtHelpers.asm (limited to 'src/vm/amd64/CrtHelpers.asm') diff --git a/src/vm/amd64/CrtHelpers.asm b/src/vm/amd64/CrtHelpers.asm new file mode 100644 index 0000000000..6ec6e4d2a9 --- /dev/null +++ b/src/vm/amd64/CrtHelpers.asm @@ -0,0 +1,528 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. +; See the LICENSE file in the project root for more information. + +; ==++== +; + +; +; ==--== +; *********************************************************************** +; File: CrtHelpers.asm, see history in asmhelpers.asm +; +; *********************************************************************** + +include AsmMacros.inc +include asmconstants.inc + +; JIT_MemSet/JIT_MemCpy +; +; It is IMPORANT that the exception handling code is able to find these guys +; on the stack, but to keep them from being tailcalled by VC++ we need to turn +; off optimization and it ends up being a wasteful implementation. +; +; Hence these assembly helpers. +; + + +;*** +;memset.asm - set a section of memory to all one byte +; +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. +; See the LICENSE file in the project root for more information.; +; +;******************************************************************************* + +;*** +;char *memset(dst, value, count) - sets "count" bytes at "dst" to "value" +; +;Purpose: +; Sets the first "count" bytes of the memory starting +; at "dst" to the character value "value". +; +; Algorithm: +; char * +; memset (dst, value, count) +; char *dst; +; char value; +; unsigned int count; +; { +; char *start = dst; +; +; while (count--) +; *dst++ = value; +; return(start); +; } +; +;Entry: +; char *dst - pointer to memory to fill with value +; char value - value to put in dst bytes +; int count - number of bytes of dst to fill +; +;Exit: +; returns dst, with filled bytes +; +;Uses: +; +;Exceptions: +; +;******************************************************************************* + +CACHE_LIMIT_MEMSET equ 070000h ; limit for nontemporal fill + +LEAF_ENTRY JIT_MemSet, _TEXT + + mov rax, rcx ; save destination address + cmp r8, 8 ; check if 8 bytes to fill + jb short mset40 ; if b, less than 8 bytes to fill + movzx edx, dl ; set fill pattern + mov r9, 0101010101010101h ; replicate fill over 8 bytes + imul rdx, r9 ; + cmp r8, 64 ; check if 64 bytes to fill + jb short mset20 ; if b, less than 64 bytes + +; +; Large block - fill alignment bytes. +; + +mset00: neg rcx ; compute bytes to alignment + and ecx, 7 ; + jz short mset10 ; if z, no alignment required + sub r8, rcx ; adjust remaining bytes by alignment + mov [rax], rdx ; fill alignment bytes +mset10: add rcx, rax ; compute aligned destination address + +; +; Attempt to fill 64-byte blocks +; + + mov r9, r8 ; copy count of bytes remaining + and r8, 63 ; compute remaining byte count + shr r9, 6 ; compute number of 64-byte blocks + test r9, r9 ; remove partial flag stall caused by shr + jnz short mset70 ; if nz, 64-byte blocks to fill + +; +; Fill 8-byte bytes. +; + +mset20: mov r9, r8 ; copy count of bytes remaining + and r8, 7 ; compute remaining byte count + shr r9, 3 ; compute number of 8-byte blocks + test r9, r9 ; remove partial flag stall caused by shr + jz short mset40 ; if z, no 8-byte blocks + + align ; simpler way to align instrucitons + +mset30: mov [rcx], rdx ; fill 8-byte blocks + add rcx, 8 ; advance to next 8-byte block + dec r9 ; decrement loop count + jnz short mset30 ; if nz, more 8-byte blocks + +; +; Fill residual bytes. +; + +mset40: test r8, r8 ; test if any bytes to fill + jz short mset60 ; if z, no bytes to fill +mset50: mov [rcx], dl ; fill byte + inc rcx ; advance to next byte + dec r8 ; decrement loop count + jnz short mset50 ; if nz, more bytes to fill +mset60: + ; for some reason the assembler doesn't like the REPRET macro on the same line as a label + REPRET ; return + +; +; Fill 64-byte blocks. +; + + align 16 + + db 066h, 066h, 066h, 090h + db 066h, 066h, 090h + +mset70: cmp r9, CACHE_LIMIT_MEMSET / 64 ; check if large fill + jae short mset90 ; if ae, large fill +mset80: mov [rcx], rdx ; fill 64-byte block + mov 8[rcx], rdx ; + mov 16[rcx], rdx ; + add rcx, 64 ; advance to next block + mov (24 - 64)[rcx], rdx ; + mov (32 - 64)[rcx], rdx ; + dec r9 ; decrement loop count + mov (40 - 64)[rcx], rdx ; + mov (48 - 64)[rcx], rdx ; + mov (56 - 64)[rcx], rdx ; + jnz short mset80 ; if nz, more 64-byte blocks + jmp short mset20 ; finish in common code + +; +; Fill 64-byte blocks nontemporal. +; + + align + +mset90: movnti [rcx], rdx ; fill 64-byte block + movnti 8[rcx], rdx ; + movnti 16[rcx], rdx ; + add rcx, 64 ; advance to next block + movnti (24 - 64)[rcx], rdx ; + movnti (32 - 64)[rcx], rdx ; + dec r9 ; decrement loop count + movnti (40 - 64)[rcx], rdx ; + movnti (48 - 64)[rcx], rdx ; + movnti (56 - 64)[rcx], rdx ; + jnz short mset90 ; if nz, move 64-byte blocks + lock or byte ptr [rsp], 0 ; flush data to memory + jmp mset20 ; finish in common code + +LEAF_END_MARKED JIT_MemSet, _TEXT + +;******************************************************************************* +; This ensures that atomic updates of aligned fields will stay atomic. +;*** +;JIT_MemCpy - Copy source buffer to destination buffer +; +;Purpose: +;JIT_MemCpy - Copy source buffer to destination buffer +; +;Purpose: +; JIT_MemCpy() copies a source memory buffer to a destination memory +; buffer. This routine recognize overlapping buffers to avoid propogation. +; For cases where propogation is not a problem, memcpy() can be used. +; +;Entry: +; void *dst = pointer to destination buffer +; const void *src = pointer to source buffer +; size_t count = number of bytes to copy +; +;Exit: +; Returns a pointer to the destination buffer in AX/DX:AX +; +;Uses: +; CX, DX +; +;Exceptions: +;******************************************************************************* +; This ensures that atomic updates of aligned fields will stay atomic. + +CACHE_LIMIT_MEMMOV equ 040000h ; limit for nontemporal fill +CACHE_BLOCK equ 01000h ; nontemporal move block size + + +LEAF_ENTRY JIT_MemCpy, _TEXT + + mov r11, rcx ; save destination address + sub rdx, rcx ; compute offset to source buffer + jb mmov10 ; if b, destination may overlap + cmp r8, 8 ; check if 8 bytes to move + jb short mcpy40 ; if b, less than 8 bytes to move + +; +; Move alignment bytes. +; + + test cl, 7 ; test if destination aligned + jz short mcpy20 ; if z, destination aligned + test cl, 1 ; test if byte move needed + jz short mcpy00 ; if z, byte move not needed + mov al, [rcx + rdx] ; move byte + dec r8 ; decrement byte count + mov [rcx], al ; + inc rcx ; increment destination address +mcpy00: test cl, 2 ; test if word move needed + jz short mcpy10 ; if z, word move not needed + mov ax, [rcx + rdx] ; move word + sub r8, 2 ; reduce byte count + mov [rcx], ax ; + add rcx, 2 ; advance destination address +mcpy10: test cl, 4 ; test if dword move needed + jz short mcpy20 ; if z, dword move not needed + mov eax, [rcx + rdx] ; move dword + sub r8, 4 ; reduce byte count + mov [rcx], eax ; + add rcx, 4 ; advance destination address + +; +; Attempt to move 32-byte blocks. +; + +mcpy20: mov r9, r8 ; copy count of bytes remaining + shr r9, 5 ; compute number of 32-byte blocks + test r9, r9 ; v-liti, remove partial flag stall caused by shr + jnz short mcpy60 ; if nz, 32-byte blocks to fill + + align +; +; Move 8-byte blocks. +; + +mcpy25: mov r9, r8 ; copy count of bytes remaining + shr r9, 3 ; compute number of 8-byte blocks + test r9, r9 ; v-liti, remove partial flag stall caused by shr + jz short mcpy40 ; if z, no 8-byte blocks + align + +mcpy30: mov rax, [rcx + rdx] ; move 8-byte blocks + mov [rcx], rax ; + add rcx, 8 ; advance destination address + dec r9 ; decrement loop count + jnz short mcpy30 ; if nz, more 8-byte blocks + and r8, 7 ; compute remaining byte count + +; +; Test for residual bytes. +; + +mcpy40: test r8, r8 ; test if any bytes to move + jnz short mcpy50 ; if nz, residual bytes to move + mov rax, r11 ; set destination address + ret ; + +; +; Move residual bytes. +; + + align + +mcpy50: mov al, [rcx + rdx] ; move byte + mov [rcx], al ; + inc rcx ; increment destiantion address + dec r8 ; decrement loop count + jnz short mcpy50 ; if nz, more bytes to fill + mov rax, r11 ; set destination address + ret ; return + +; +; Move 32 byte blocks +; + + align 16 + + db 066h, 066h, 066h, 090h + db 066h, 066h, 090h + +mcpy60: cmp r9, CACHE_LIMIT_MEMMOV / 32 ; check if large move + jae short mcpy80 ; if ae, large move +mcpy70: mov rax, [rcx + rdx] ; move 32-byte block + mov r10, 8[rcx + rdx] ; + add rcx, 32 ; advance destination address + mov (-32)[rcx], rax ; + mov (-24)[rcx], r10 ; + mov rax, (-16)[rcx + rdx] ; + mov r10, (-8)[rcx + rdx] ; + dec r9 ; + mov (-16)[rcx], rax ; + mov (-8)[rcx], r10 ; + jnz short mcpy70 ; if nz, more 32-byte blocks + and r8, 31 ; compute remaining byte count + jmp mcpy25 ; + +; +; Move 64-byte blocks nontemporal. +; + + align + + db 066h, 090h + +mcpy80: cmp rdx, CACHE_BLOCK ; check if cache block spacing + jb short mcpy70 ; if b, not cache block spaced +mcpy81: mov eax, CACHE_BLOCK / 128 ; set loop count +mcpy85: prefetchnta [rcx + rdx] ; prefetch 128 bytes + prefetchnta 64[rcx + rdx] ; + add rcx, 128 ; advance source address + dec eax ; decrement loop count + jnz short mcpy85 ; if nz, more to prefetch + sub rcx, CACHE_BLOCK ; reset source address + mov eax, CACHE_BLOCK / 64 ; set loop count +mcpy90: mov r9, [rcx + rdx] ; move 64-byte block + mov r10, 8[rcx + rdx] ; + movnti [rcx], r9 ; + movnti 8[rcx], r10 ; + mov r9, 16[rcx + rdx] ; + mov r10, 24[rcx + rdx] ; + movnti 16[rcx], r9 ; + movnti 24[rcx], r10 ; + mov r9, 32[rcx + rdx] ; + mov r10, 40[rcx + rdx] ; + add rcx, 64 ; advance destination address + movnti (32 - 64)[rcx], r9 ; + movnti (40 - 64)[rcx], r10 ; + mov r9, (48 - 64)[rcx + rdx] ; + mov r10, (56 - 64)[rcx + rdx] ; + dec eax ; + movnti (48 - 64)[rcx], r9 ; + movnti (56 - 64)[rcx], r10 ; + jnz short mcpy90 ; if nz, more 32-byte blocks + sub r8, CACHE_BLOCK ; reduce remaining length + cmp r8, CACHE_BLOCK ; check if cache block remains + jae mcpy81 ; if ae, cache block remains + lock or byte ptr [rsp], 0 ; flush data to memory + jmp mcpy20 ; + +; +; The source address is less than the destination address. +; + + align + + db 066h, 066h, 066h, 090h + db 066h, 066h, 066h, 090h + db 066h, 090h + +mmov10: add rcx, r8 ; compute ending destination address + cmp r8, 8 ; check if 8 bytes to move + jb short mmov60 ; if b, less than 8 bytes to move + +; +; Move alignment bytes. +; + + test cl, 7 ; test if destination aligned + jz short mmov30 ; if z, destination aligned + test cl, 1 ; test if byte move needed + jz short mmov15 ; if z, byte move not needed + dec rcx ; decrement destination address + mov al, [rcx + rdx] ; move byte + dec r8 ; decrement byte count + mov [rcx], al ; +mmov15: test cl, 2 ; test if word move needed + jz short mmov20 ; if z, word move not needed + sub rcx, 2 ; reduce destination address + mov ax, [rcx + rdx] ; move word + sub r8, 2 ; reduce byte count + mov [rcx], ax ; +mmov20: test cl, 4 ; test if dword move needed + jz short mmov30 ; if z, dword move not needed + sub rcx, 4 ; reduce destination address + mov eax, [rcx + rdx] ; move dword + sub r8, 4 ; reduce byte count + mov [rcx], eax ; + +; +; Attempt to move 32-byte blocks +; + +mmov30: mov r9, r8 ; copy count of bytes remaining + shr r9, 5 ; compute number of 32-byte blocks + test r9, r9 ; v-liti, remove partial flag stall caused by shr + jnz short mmov80 ; if nz, 32-byte blocks to fill + +; +; Move 8-byte blocks. +; + align + +mmov40: mov r9, r8 ; copy count of bytes remaining + shr r9, 3 ; compute number of 8-byte blocks + test r9, r9 ; v-liti, remove partial flag stall caused by shr + jz short mmov60 ; if z, no 8-byte blocks + + align + +mmov50: sub rcx, 8 ; reduce destination address + mov rax, [rcx + rdx] ; move 8-byte blocks + dec r9 ; decrement loop count + mov [rcx], rax ; + jnz short mmov50 ; if nz, more 8-byte blocks + and r8, 7 ; compute remaining byte count + +; +; Test for residual bytes. +; + +mmov60: test r8, r8 ; test if any bytes to move + jnz short mmov70 ; if nz, residual bytes to move + mov rax, r11 ; set destination address + ret ; + +; +; Move residual bytes. +; + + align + +mmov70: dec rcx ; decrement destination address + mov al, [rcx + rdx] ; move byte + dec r8 ; decrement loop count + mov [rcx], al ; + jnz short mmov70 ; if nz, more bytes to fill + mov rax, r11 ; set destination address + ret ; return + +; +; Move 32 byte blocks +; + + align 16 + + db 066h, 066h, 066h, 090h + db 066h, 066h, 090h + +mmov80: cmp r9, CACHE_LIMIT_MEMMOV / 32 ; check if large move + jae short mmov93 ; if ae, large move +mmov90: mov rax, (-8)[rcx + rdx] ; move 32-byte block + mov r10, (-16)[rcx + rdx] ; + sub rcx, 32 ; reduce destination address + mov 24[rcx], rax ; + mov 16[rcx], r10 ; + mov rax, 8[rcx + rdx] ; + mov r10, [rcx + rdx] ; + dec r9 ; + mov 8[rcx], rax ; + mov [rcx], r10 ; + jnz short mmov90 ; if nz, more 32-byte blocks + and r8, 31 ; compute remaining byte count + jmp mmov40 ; + +; +; Move 64-byte blocks nontemporal. +; + + align + + db 066h, 090h + +mmov93: cmp rdx, -CACHE_BLOCK ; check if cache block spacing + ja short mmov90 ; if a, not cache block spaced +mmov94: mov eax, CACHE_BLOCK / 128 ; set loop count +mmov95: sub rcx, 128 ; reduce destination address + prefetchnta [rcx + rdx] ; prefetch 128 bytes + prefetchnta 64[rcx + rdx] ; + dec eax ; decrement loop count + jnz short mmov95 ; if nz, more to prefetch + add rcx, CACHE_BLOCK ; reset source address + mov eax, CACHE_BLOCK / 64 ; set loop count +mmov97: mov r9, (-8)[rcx + rdx] ; move 64-byte block + mov r10, (-16)[rcx + rdx] ; + movnti (-8)[rcx], r9 ; + movnti (-16)[rcx], r10 ; + mov r9, (-24)[rcx + rdx] ; + mov r10, (-32)[rcx + rdx] ; + movnti (-24)[rcx], r9 ; + movnti (-32)[rcx], r10 ; + mov r9, (-40)[rcx + rdx] ; + mov r10, (-48)[rcx + rdx] ; + sub rcx, 64 ; reduce destination address + movnti (64 - 40)[rcx], r9 ; + movnti (64 - 48)[rcx], r10 ; + mov r9, (64 - 56)[rcx + rdx] ; + mov r10, (64 - 64)[rcx + rdx] ; + dec eax ; decrement loop count + movnti (64 - 56)[rcx], r9 ; + movnti (64 - 64)[rcx], r10 ; + jnz short mmov97 ; if nz, more 32-byte blocks + sub r8, CACHE_BLOCK ; reduce remaining length + cmp r8, CACHE_BLOCK ; check if cache block remains + jae mmov94 ; if ae, cache block remains + lock or byte ptr [rsp], 0 ; flush data to memory + jmp mmov30 ; + +LEAF_END_MARKED JIT_MemCpy, _TEXT + + + end + -- cgit v1.2.3