summaryrefslogtreecommitdiff
path: root/src/vm/amd64/CrtHelpers.asm
blob: 9d5b280558bcd831a0344292852e28609ec33283 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
; Licensed to the .NET Foundation under one or more agreements.
; The .NET Foundation licenses this file to you under the MIT license.
; See the LICENSE file in the project root for more information.

; ==++==
;

;
; ==--==
; ***********************************************************************
; File: CrtHelpers.asm, see history in asmhelpers.asm
;
; ***********************************************************************

include AsmMacros.inc

;char *JIT_MemSet(dst, value, count) - sets "count" bytes at "dst" to "value"
;
;Purpose:
;   Sets the first "count" bytes of the memory starting
;   at "dst" to the character value "value".
;
;Algorithm:
;   The fill byte is first replicated into all 8 bytes of rdx, then the
;   store strategy is chosen by count:
;   count [0, 16]:      1/2/4/8 bytes width registers
;   count (16, 128]:    16 bytes width registers (XMM) without loop
;   count (128, 512]:   16 bytes width registers (XMM) with a loop unrolled
;                       8 times (128 bytes per iteration), then the tail
;   count (512, upper): rep stosb
;   The short ranges are covered by stores anchored at the start of the
;   buffer paired with stores anchored at its end. The two groups may
;   overlap, which is harmless because every byte written has the same value.
;
;Entry:
;   rcx = char *dst  - pointer to memory to fill with value
;   dl  = char value - value to put in dst bytes (upper bits of rdx ignored)
;   r8  = count      - number of bytes of dst to fill; compared as a full
;                      64-bit unsigned quantity
;
;Exit:
;   rax = dst, with filled bytes
;
;Uses:
;   rax, rcx, rdx, r8, r9, r10, r11, xmm0 and the flags are modified.
;   rdi is used by the rep stosb path but is saved in r11 and restored.
;
;Exceptions:
;   No explicit checks are made; dst and count are not validated.
;
;*******************************************************************************

LEAF_ENTRY JIT_MemSet, _TEXT

        movzx   edx, dl                 ; keep only the fill byte, zero-extended
        mov     r9, 0101010101010101h   ; multiplier that copies a byte into all 8 byte lanes
        imul    rdx, r9                 ; rdx is 8 bytes filler

        cmp     r8, 16                  ; count <= 16: general-purpose register stores
        jbe     mset04

        cmp     r8, 512                 ; 16 < count <= 512: XMM stores
        jbe     mset00

        ; count > 512: fill with rep stosb
        mov     r10, rcx                ; save dst address for the return value
        mov     r11, rdi                ; save rdi (callee-saved in the Microsoft x64 ABI)
        mov     eax, edx                ; al is the fill byte used by stosb
        mov     rdi, rcx                ; rdi is dst
        mov     rcx, r8                 ; rcx is count
        rep     stosb
        mov     rdi, r11                ; restore rdi
        mov     rax, r10                ; return dst
        ret

        align 16
mset00: mov     rax, rcx                ; save dst address (return value)
        movd    xmm0, rdx               ; low 8 bytes of xmm0 = 8 bytes filler
        punpcklbw xmm0, xmm0            ; interleave the low 8 bytes with themselves: xmm0 is 16 bytes filler

        cmp     r8, 128                 ; 16 < count <= 128: no loop needed
        jbe     mset02

        ; count > 128 && count <= 512
        mov     r9, r8
        shr     r9, 7                   ; r9 = count/128 = loop iterations (at least 1 here)

        align 16
mset01: movdqu	[rcx], xmm0             ; fill 128 bytes per iteration
        movdqu	16[rcx], xmm0
        movdqu	32[rcx], xmm0
        movdqu	48[rcx], xmm0
        movdqu	64[rcx], xmm0
        movdqu	80[rcx], xmm0
        movdqu	96[rcx], xmm0
        movdqu	112[rcx], xmm0
        add     rcx, 128                ; advance dst past the bytes just filled
        dec     r9
        jnz     mset01
        and     r8, 7fh                 ; and r8 with 0111 1111: r8 = count mod 128

        ; the remainder is from 0 to 127
        cmp     r8, 16
        jnbe    mset02                  ; remainder > 16: finish with the XMM tail code

        ; the remainder <= 16: one 16-byte store ending exactly at the end of
        ; the buffer. It reaches back into bytes the loop has already filled
        ; (at least 128 filled bytes precede rcx), so it is safe even when the
        ; remainder is 0.
        movdqu  -16[rcx + r8], xmm0
        ret

        ; count > 16 && count <= 128 for mset02
        ; (entered directly, or after the loop with rcx advanced and r8 holding
        ; the remainder)
        align 16
mset02: movdqu	[rcx], xmm0             ; first 16 bytes
        movdqu	-16[rcx + r8], xmm0     ; last 16 bytes
        cmp     r8, 32                  ; count <= 32: the two stores cover everything
        jbe     mset03

        ; count > 32: also fill the second and the second-to-last 16 bytes
        movdqu	16[rcx], xmm0
        movdqu	-32[rcx + r8], xmm0
        cmp     r8, 64                  ; count <= 64: first 32 + last 32 bytes cover everything
        jbe     mset03

        ; count > 64 && count <= 128: first 64 + last 64 bytes cover everything
        movdqu	32[rcx], xmm0
        movdqu	48[rcx], xmm0
        movdqu	-48[rcx + r8], xmm0
        movdqu	-64[rcx + r8], xmm0
mset03: ret

        align 16
mset04: mov     rax, rcx                ; save dst address (return value)
        test    r8b, 24                 ; and r8b with 0001 1000: nonzero <=> count >= 8 (count <= 16 here)
        jz      mset05

        ; count >= 8 && count <= 16: first 8 and last 8 bytes (may overlap)
        mov     [rcx], rdx
        mov     -8[rcx + r8], rdx
        ret

        align 16
mset05: test    r8b, 4                  ; and r8b with 0100: nonzero <=> count >= 4 (count < 8 here)
        jz      mset06

        ; count >= 4 && count < 8: first 4 and last 4 bytes (may overlap)
        mov     [rcx], edx
        mov     -4[rcx + r8], edx
        ret

        ; count >= 0 && count < 4
        align 16
mset06: test    r8b, 1                  ; and r8b with 0001: count is 1 or 3, store the first byte
        jz      mset07
        mov     [rcx],dl
mset07: test    r8b, 2                  ; and r8b with 0010: count is 2 or 3, store the last two bytes
        jz      mset08
        mov     -2[rcx + r8], dx
mset08: ret                             ; count == 0 reaches here without storing anything

LEAF_END_MARKED JIT_MemSet, _TEXT

;JIT_MemCpy - Copy source buffer to destination buffer
;
;Purpose:
;   JIT_MemCpy() copies a source memory buffer to a destination memory
;   buffer. This routine recognizes overlapping buffers to avoid propagation
;   (overwriting source bytes before they have been read).
;   For cases where propagation is not a problem, memcpy() can be used.
;
;Algorithm:
;Copy to destination based on count as follows
;   count [0, 64]: overlap check not needed, because every load is issued
;                  before the first store
;       count [0, 16]:  use 1/2/4/8 bytes width registers
;       count (16, 64]: use 16 bytes width registers (XMM) without loop
;   count (64, upper): check overlap
;       src >= dst, or src + count <= dst (forward copy is safe):
;           count (64, 512]:    use 16 bytes width registers (XMM) with a
;                               loop unrolled 4 times (64 bytes per
;                               iteration), then the tail as for count <= 64
;           count (512, upper): use rep movsb
;       src < dst and src + count > dst (overlap):
;           use 16 bytes width registers (XMM) with a loop to copy from end
;           to beginning, then the first (count mod 64) bytes as for
;           count <= 64
;
;Entry:
;   rcx = void *dst       - pointer to destination buffer
;   rdx = const void *src - pointer to source buffer
;   r8  = size_t count    - number of bytes to copy
;
;Exit:
;   rax = pointer to the destination buffer
;
;Uses:
;   rcx, rdx, r8, r9, r10, r11, xmm0-xmm3 and the flags are modified.
;   rdi and rsi are used by the rep movsb path but are saved in r10/r11
;   and restored.
;
;Exceptions:
;   No explicit checks are made; dst, src and count are not validated.
;*******************************************************************************

LEAF_ENTRY JIT_MemCpy, _TEXT

        mov     rax, rcx                ; save dst address (return value)
        cmp     r8, 16
        jbe     mcpy02                  ; count <= 16

        cmp     r8, 64
        jnbe    mcpy07                  ; count > 64

        ; count > 16 && count <= 64
        ; (also used for the remainder left by the 64-byte loops below)
        align 16
mcpy00: movdqu  xmm0, [rdx]             ; first 16 bytes of src
        movdqu  xmm1, -16[rdx + r8]     ; last 16 bytes of src
        cmp     r8, 32
        jbe     mcpy01

        ; count > 32 && count <= 64
        movdqu  xmm2, 16[rdx]           ; second 16 bytes of src
        movdqu  xmm3, -32[rdx + r8]     ; second-to-last 16 bytes of src

        ; every needed src byte is now in xmm0-xmm3, so the stores below
        ; cannot clobber unread source data even if the buffers overlap
        movdqu  16[rcx], xmm2
        movdqu  -32[rcx + r8], xmm3

        ; count > 16 && count <= 64: first and last 16 bytes
mcpy01: movdqu  [rcx], xmm0
        movdqu  -16[rcx + r8], xmm1
        ret

        ; count <= 16
        align 16
mcpy02: test    r8b, 24                 ; test count with 0001 1000: nonzero <=> count >= 8
        jz      mcpy03
        ; count >= 8 && count <= 16: first 8 and last 8 bytes (may overlap);
        ; both loads precede both stores
        mov     r9, [rdx]
        mov     r10, -8[rdx + r8]
        mov     [rcx], r9
        mov     -8[rcx + r8], r10
        ret

        align 16
mcpy03: test    r8b, 4                  ; test count with 0100: nonzero <=> count >= 4
        jz      mcpy04
        ; count >= 4 && count < 8: first 4 and last 4 bytes (may overlap);
        ; both loads precede both stores
        mov     r9d, [rdx]
        mov     r10d, -4[rdx + r8]
        mov     [rcx], r9d
        mov     -4[rcx + r8], r10d
        ret

        ; count >= 0 && count < 4
        align 16
mcpy04: test    r8, r8
        jz      mcpy06                  ; count == 0: nothing to copy
        ; count == 1/2/3
        mov     r9b, [rdx]              ; save the first byte

        test    r8b, 2                  ; test count with 0010: count is 2 or 3
        jz      mcpy05
        mov     r10w, -2[rdx + r8]      ; last two bytes, loaded before any store
        mov     -2[rcx + r8], r10w
mcpy05: mov     [rcx], r9b              ; first byte
mcpy06: ret

        align 16
        ; count > 64, we need to check overlap
mcpy07: mov     r9, rdx                 ; r9 is src address
        sub     r9, rcx                 ; r9 = src - dst
        jb      mcpy11                  ; src < dst: destination may overlap the unread source

        ; forward copy: src >= dst, or the buffers do not overlap
mcpy08: cmp     r8, 512
        jnbe    mcpy10

        ; count > 64 && count <= 512
        mov     r9, r8
        shr     r9, 6                   ; r9 = count/64 = loop iterations (at least 1 here)

        align 16
mcpy09: movdqu  xmm0, [rdx]             ; load 64 bytes, then store them
        movdqu  xmm1, 16[rdx]
        movdqu  xmm2, 32[rdx]
        movdqu  xmm3, 48[rdx]
        movdqu  [rcx], xmm0
        movdqu  16[rcx], xmm1
        movdqu  32[rcx], xmm2
        movdqu  48[rcx], xmm3
        add     rdx, 64
        add     rcx, 64
        dec     r9
        jnz     mcpy09

        ; the remainder is from 0 to 63; rcx/rdx now point at it
        and     r8, 3fh                 ; and with 0011 1111: r8 = count mod 64
        cmp     r8, 16
        jnbe    mcpy00                  ; remainder > 16

        ; the remainder <= 16 (mcpy02 and its successors end in ret)
        jmp     mcpy02

        ; count > 512
        align 16
mcpy10: mov     r10, rdi                ; save rdi (callee-saved in the Microsoft x64 ABI)
        mov     r11, rsi                ; save rsi (callee-saved in the Microsoft x64 ABI)
        mov     rdi, rcx                ; rdi is dst
        mov     rsi, rdx                ; rsi is src
        mov     rcx, r8                 ; rcx is count
        rep     movsb                   ; mov from rsi to rdi
        mov     rsi, r11                ; restore rsi
        mov     rdi, r10                ; restore rdi
        ret                             ; rax still holds dst

; The source address is less than the destination address.

        align 16
mcpy11: add     r9, r8                  ; r9 = src - dst + count = (src + count) - dst
        ; Signed test on purpose: r9 <= 0 means src + count <= dst, i.e. the
        ; source ends at or before the destination starts. The result is exact
        ; while |count - (dst - src)| < 2^63.
        test    r9, r9
        jle     mcpy08                  ; no overlap: the forward copy is safe

        ; overlap with src < dst: copy from the end towards the beginning
        lea     r9, [rdx + r8]          ; r9 is the src + count
        lea     r10, [rcx + r8]         ; r10 is the dst + count

        mov     r11, r8
        shr     r11, 6                  ; r11 = count/64 = loop iterations (at least 1 here)

        ; count > 64
        align 16
mcpy12: movdqu  xmm0, -16[r9]           ; load the 64 bytes below r9, then store them below r10
        movdqu  xmm1, -32[r9]
        movdqu  xmm2, -48[r9]
        movdqu  xmm3, -64[r9]
        movdqu  -16[r10], xmm0
        movdqu  -32[r10], xmm1
        movdqu  -48[r10], xmm2
        movdqu  -64[r10], xmm3
        sub     r9, 64
        sub     r10, 64
        dec     r11
        jnz     mcpy12

        ; the remainder is from 0 to 63. rcx/rdx were not modified by the
        ; loop, so they still point at the start of dst/src, which is exactly
        ; where the bytes not yet copied are.
        and     r8, 3fh                 ; and with 0011 1111: r8 = count mod 64
        cmp     r8, 16
        jnbe    mcpy00                  ; remainder > 16

        ; the remainder <= 16
        jmp     mcpy02

LEAF_END_MARKED JIT_MemCpy, _TEXT
		end