summaryrefslogtreecommitdiff
path: root/packaging/0019-Add-profiling-support.patch
blob: 1e39d21a8e75c0fc6608b6a30583944a2cb9bca6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
From eef4ae4f508fa9d1f4acb3ee533a3b8a756693e1 Mon Sep 17 00:00:00 2001
From: CHUNSEOK LEE <chunseok.lee@samsung.com>
Date: Fri, 18 Aug 2017 14:05:03 +0900
Subject: [PATCH 19/23] Add profiling support

Signed-off-by: CHUNSEOK LEE <chunseok.lee@samsung.com>
---
 Documentation/botr/clr-abi.md |  36 +++++-
 clrfeatures.cmake             |   2 +-
 src/jit/codegencommon.cpp     | 113 ++++++++++++++++-
 src/jit/codegenxarch.cpp      |  38 +++++-
 src/jit/compiler.cpp          |  23 ++++
 src/jit/compiler.h            |   8 ++
 src/jit/lclvars.cpp           |   2 +-
 src/jit/target.h              |   9 +-
 src/vm/CMakeLists.txt         |   1 +
 src/vm/amd64/asmhelpers.S     | 289 ++++++++++++++++++++++++++++++++++++++++++
 src/vm/amd64/profiler.cpp     |  12 ++
 src/vm/amd64/unixstubs.cpp    |  15 ---
 12 files changed, 523 insertions(+), 25 deletions(-)
 create mode 100644 src/vm/amd64/asmhelpers.S

diff --git a/Documentation/botr/clr-abi.md b/Documentation/botr/clr-abi.md
index a85bfa4..c0ec331 100644
--- a/Documentation/botr/clr-abi.md
+++ b/Documentation/botr/clr-abi.md
@@ -585,9 +585,9 @@ The CLR unwinder assumes any non-leaf frame was unwound as a result of a call. T
 
 If the JIT gets passed `CORJIT_FLG_PROF_ENTERLEAVE`, then the JIT might need to insert native entry/exit/tail call probes. To determine for sure, the JIT must call GetProfilingHandle. This API returns as out parameters, the true dynamic boolean indicating if the JIT should actually insert the probes and a parameter to pass to the callbacks (typed as void*), with an optional indirection (used for NGEN). This parameter is always the first argument to all of the call-outs (thus placed in the usual first argument register `RCX` (AMD64) or `R0` (ARM, ARM64)).
 
-Outside of the prolog (in a GC interruptible location), the JIT injects a call to `CORINFO_HELP_PROF_FCN_ENTER`. For AMD64, all argument registers will be homed into their caller-allocated stack locations (similar to varargs). For ARM and ARM64, all arguments are prespilled (again similar to varargs).
+Outside of the prolog (in a GC interruptible location), the JIT injects a call to `CORINFO_HELP_PROF_FCN_ENTER`. For AMD64 on Windows, all argument registers will be homed into their caller-allocated stack locations (similar to varargs); on Unix, all argument registers will be stored in the inner structure (`PROFILE_PLATFORM_SPECIFIC_DATA`). For ARM and ARM64, all arguments are prespilled (again similar to varargs).
 
-After computing the return value and storing it in the correct register, but before any epilog code (including before a possible GS cookie check), the JIT injects a call to `CORINFO_HELP_PROF_FCN_LEAVE`. For AMD64 this call must preserve the return register: `RAX` or `XMM0`. For ARM, the return value will be moved from `R0` to `R2` (if it was in `R0`), `R1`, `R2`, and `S0/D0` must be preserved by the callee (longs will be `R2`, `R1` - note the unusual ordering of the registers, floats in `S0`, doubles in `D0`, smaller integrals in `R2`).
+After computing the return value and storing it in the correct register, but before any epilog code (including before a possible GS cookie check), the JIT injects a call to `CORINFO_HELP_PROF_FCN_LEAVE`. For AMD64 this call must preserve the return register: `RAX` or `XMM0` on Windows and `RAX` and `RDX` or `XMM0` and `XMM1` on Unix. For ARM, the return value will be moved from `R0` to `R2` (if it was in `R0`), `R1`, `R2`, and `S0/D0` must be preserved by the callee (longs will be `R2`, `R1` - note the unusual ordering of the registers, floats in `S0`, doubles in `D0`, smaller integrals in `R2`).
 
 TODO: describe ARM64 profile leave conventions.
 
@@ -667,3 +667,35 @@ The general rules outlined in the System V x86_64 ABI (described at http://www.x
 3. The JIT proactively generates frame register frames (with `RBP` as a frame register) in order to aid the native OS tooling for stack unwinding and the like.
 4. All the other internal VM contracts for PInvoke, EH, and generic support remains in place. Please see the relevant sections above for more details. Note, however, that the registers used are different on System V due to the different calling convention. For example, the integer argument registers are, in order, RDI, RSI, RDX, RCX, R8, and R9. Thus, where the first argument (typically, the "this" pointer) on Windows AMD64 goes in RCX, on System V it goes in RDI, and so forth.   
 5. Structs with explicit layout are always passed by value on the stack.
+6. The following table describes register usage according to the System V x86_64 ABI
+
+```
+| Register     | Usage                                   | Preserved across  |
+|              |                                         | function calls    |
+|--------------|-----------------------------------------|-------------------|
+| %rax         | temporary register; with variable argu- | No                |
+|              | ments passes information about the      |                   |
+|              | number of SSE registers used;           |                   |
+|              | 1st return register                     |                   |
+| %rbx         | callee-saved register; optionally used  | Yes               |
+|              | as base pointer                         |                   |
+| %rcx         | used to pass 4th integer argument to    | No                |
+|              | functions                               |                   |
+| %rdx         | used to pass 3rd argument to functions  | No                |
+|              | 2nd return register                     |                   |
+| %rsp         | stack pointer                           | Yes               |
+| %rbp         | callee-saved register; optionally used  | Yes               |
+|              | as frame pointer                        |                   |
+| %rsi         | used to pass 2nd argument to functions  | No                |
+| %rdi         | used to pass 1st argument to functions  | No                |
+| %r8          | used to pass 5th argument to functions  | No                |
+| %r9          | used to pass 6th argument to functions  | No                |
+| %r10         | temporary register, used for passing a  | No                |
+|              | function's static chain pointer         |                   |
+| %r11         | temporary register                      | No                |
+| %r12-%r15    | callee-saved registers                  | Yes               |
+| %xmm0-%xmm1  | used to pass and return floating point  | No                |
+|              | arguments                               |                   |
+| %xmm2-%xmm7  | used to pass floating point arguments   | No                |
+| %xmm8-%xmm15 | temporary registers                     | No                |
+```
diff --git a/clrfeatures.cmake b/clrfeatures.cmake
index f047c91..0e2801c 100644
--- a/clrfeatures.cmake
+++ b/clrfeatures.cmake
@@ -1,5 +1,5 @@
 if(CLR_CMAKE_TARGET_TIZEN_LINUX)
-  set(FEATURE_EVENT_TRACE 0)
+  set(FEATURE_EVENT_TRACE 1)
 endif()
 
 if(NOT DEFINED FEATURE_EVENT_TRACE)
diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp
index 719299e..50f43fa 100644
--- a/src/jit/codegencommon.cpp
+++ b/src/jit/codegencommon.cpp
@@ -4431,7 +4431,9 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, bool* pXtraRegClobbere
             if ((regSet.rsMaskPreSpillRegs(false) & genRegMask(regNum)) == 0)
 #endif // _TARGET_ARM_
             {
-                noway_assert(xtraReg != varDsc->lvArgReg + i);
+#if !defined(UNIX_AMD64_ABI)
+                noway_assert(xtraReg != (varDsc->lvArgReg + i));
+#endif
                 noway_assert(regArgMaskLive & genRegMask(regNum));
             }
 
@@ -7437,7 +7439,9 @@ void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed)
         return;
     }
 
-#if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI) // No profiling for System V systems yet.
+#if defined(_TARGET_AMD64_)
+#if !defined(UNIX_AMD64_ABI)
+
     unsigned   varNum;
     LclVarDsc* varDsc;
 
@@ -7566,6 +7570,57 @@ void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed)
         *pInitRegZeroed = false;
     }
 
+#else // !defined(UNIX_AMD64_ABI)
+
+    // Emit profiler EnterCallback(ProfilerMethHnd, caller's SP)
+    // R14 = ProfilerMethHnd
+    if (compiler->compProfilerMethHndIndirected)
+    {
+        // Profiler hooks enabled during Ngen time.
+        // Profiler handle needs to be accessed through an indirection of a pointer.
+        getEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_PROFILER_ENTER_ARG_0,
+                                   (ssize_t)compiler->compProfilerMethHnd);
+    }
+    else
+    {
+        // No need to record relocations, if we are generating ELT hooks under the influence
+        // of COMPlus_JitELTHookEnabled=1
+        if (compiler->opts.compJitELTHookEnabled)
+        {
+            genSetRegToIcon(REG_PROFILER_ENTER_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL);
+        }
+        else
+        {
+            instGen_Set_Reg_To_Imm(EA_8BYTE, REG_PROFILER_ENTER_ARG_0, (ssize_t)compiler->compProfilerMethHnd);
+        }
+    }
+
+    // R15 = caller's SP
+    // Notes
+    //   1) Here we can query caller's SP offset since prolog will be generated after final frame layout.
+    //   2) caller's SP relative offset to FramePointer will be negative.  We need to add absolute value
+    //      of that offset to FramePointer to obtain caller's SP value.
+    assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM);
+    int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed());
+    getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_PROFILER_ENTER_ARG_1, genFramePointerReg(), -callerSPOffset);
+
+    // Can't have a call until we have enough padding for rejit
+    genPrologPadForReJit();
+
+    // We can use any callee trash register (other than RAX, RDI, RSI) for call target.
+    // We use R11 here. This will emit either
+    // "call ip-relative 32-bit offset" or
+    // "mov r11, helper addr; call r11"
+    genEmitHelperCall(CORINFO_HELP_PROF_FCN_ENTER, 0, EA_UNKNOWN, REG_DEFAULT_PROFILER_CALL_TARGET);
+
+    // If initReg is one of RBM_CALLEE_TRASH, then it needs to be zero'ed before using.
+    if ((RBM_CALLEE_TRASH & genRegMask(initReg)) != 0)
+    {
+        *pInitRegZeroed = false;
+    }
+
+#endif // !defined(UNIX_AMD64_ABI)
+
 #elif defined(_TARGET_X86_) || (defined(_TARGET_ARM_) && defined(LEGACY_BACKEND))
 
     unsigned saveStackLvl2 = genStackLevel;
@@ -7673,6 +7728,7 @@ void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed)
 //
 void CodeGen::genProfilingLeaveCallback(unsigned helper /*= CORINFO_HELP_PROF_FCN_LEAVE*/)
 {
+
     assert((helper == CORINFO_HELP_PROF_FCN_LEAVE) || (helper == CORINFO_HELP_PROF_FCN_TAILCALL));
 
     // Only hook if profiler says it's okay.
@@ -7686,7 +7742,8 @@ void CodeGen::genProfilingLeaveCallback(unsigned helper /*= CORINFO_HELP_PROF_FC
     // Need to save on to the stack level, since the helper call will pop the argument
     unsigned saveStackLvl2 = genStackLevel;
 
-#if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI) // No profiling for System V systems yet.
+#if defined(_TARGET_AMD64_)
+#if !defined(UNIX_AMD64_ABI)
 
     // Since the method needs to make a profiler callback, it should have out-going arg space allocated.
     noway_assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM);
@@ -7757,6 +7814,48 @@ void CodeGen::genProfilingLeaveCallback(unsigned helper /*= CORINFO_HELP_PROF_FC
     // "mov r8, helper addr; call r8"
     genEmitHelperCall(helper, 0, EA_UNKNOWN, REG_ARG_2);
 
+#else // !defined(UNIX_AMD64_ABI)
+
+    // RDI = ProfilerMethHnd
+    if (compiler->compProfilerMethHndIndirected)
+    {
+        getEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd);
+    }
+    else
+    {
+        if (compiler->opts.compJitELTHookEnabled)
+        {
+            genSetRegToIcon(REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL);
+        }
+        else
+        {
+            instGen_Set_Reg_To_Imm(EA_8BYTE, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd);
+        }
+    }
+
+    // RSI = caller's SP
+    if (compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT)
+    {
+        int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed());
+        getEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_ARG_1, genFramePointerReg(), -callerSPOffset);
+    }
+    else
+    {
+        LclVarDsc* varDsc = compiler->lvaTable;
+        NYI_IF((varDsc == nullptr) || !varDsc->lvIsParam, "Profiler ELT callback for a method without any params");
+
+        // lea rsi, [FramePointer + Arg0's offset]
+        getEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, REG_ARG_1, 0, 0);
+    }
+
+    // We can use any callee trash register (other than RAX, RDI, RSI) for call target.
+    // We use R11 here. This will emit either
+    // "call ip-relative 32-bit offset" or
+    // "mov r11, helper addr; call r11"
+    genEmitHelperCall(helper, 0, EA_UNKNOWN, REG_DEFAULT_PROFILER_CALL_TARGET);
+
+#endif // !defined(UNIX_AMD64_ABI)
+
 #elif defined(_TARGET_X86_)
 
     //
@@ -8198,6 +8297,14 @@ void CodeGen::genFinalizeFrame()
         regSet.rsSetRegsModified(RBM_INT_CALLEE_SAVED & ~RBM_FPBASE);
     }
 
+#ifdef UNIX_AMD64_ABI
+    // On Unix x64 we also save R14 and R15 for ELT profiler hook generation.
+    if (compiler->compIsProfilerHookNeeded())
+    {
+        regSet.rsSetRegsModified(RBM_PROFILER_ENTER_ARG_0 | RBM_PROFILER_ENTER_ARG_1);
+    }
+#endif
+
     /* Count how many callee-saved registers will actually be saved (pushed) */
 
     // EBP cannot be (directly) modified for EBP frame and double-aligned frames
diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp
index 252f004..fa2c22b 100644
--- a/src/jit/codegenxarch.cpp
+++ b/src/jit/codegenxarch.cpp
@@ -1219,16 +1219,51 @@ void CodeGen::genReturn(GenTreePtr treeNode)
         // Since we are invalidating the assumption that we would slip into the epilog
         // right after the "return", we need to preserve the return reg's GC state
         // across the call until actual method return.
+        ReturnTypeDesc retTypeDesc;
+        unsigned       regCount;
+        if (compiler->compMethodReturnsMultiRegRetType())
+        {
+            if (varTypeIsLong(compiler->info.compRetNativeType))
+            {
+                retTypeDesc.InitializeLongReturnType(compiler);
+            }
+            else // we must have a struct return type
+            {
+                retTypeDesc.InitializeStructReturnType(compiler, compiler->info.compMethodInfo->args.retTypeClass);
+            }
+            regCount = retTypeDesc.GetReturnRegCount();
+        }
+
         if (varTypeIsGC(compiler->info.compRetType))
         {
             gcInfo.gcMarkRegPtrVal(REG_INTRET, compiler->info.compRetType);
         }
+        else if (compiler->compMethodReturnsMultiRegRetType())
+        {
+            for (unsigned i = 0; i < regCount; ++i)
+            {
+                if (varTypeIsGC(retTypeDesc.GetReturnRegType(i)))
+                {
+                    gcInfo.gcMarkRegPtrVal(retTypeDesc.GetABIReturnReg(i), retTypeDesc.GetReturnRegType(i));
+                }
+            }
+        }
 
         genProfilingLeaveCallback();
 
         if (varTypeIsGC(compiler->info.compRetType))
         {
-            gcInfo.gcMarkRegSetNpt(REG_INTRET);
+            gcInfo.gcMarkRegSetNpt(genRegMask(REG_INTRET));
+        }
+        else if (compiler->compMethodReturnsMultiRegRetType())
+        {
+            for (unsigned i = 0; i < regCount; ++i)
+            {
+                if (varTypeIsGC(retTypeDesc.GetReturnRegType(i)))
+                {
+                    gcInfo.gcMarkRegSetNpt(genRegMask(retTypeDesc.GetABIReturnReg(i)));
+                }
+            }
         }
     }
 #endif
@@ -8244,7 +8279,6 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
                 var_types memType = (gcPtrs[i] == TYPE_GC_REF) ? TYP_REF : TYP_BYREF;
                 getEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), REG_RCX, REG_RSI, 0);
                 genStoreRegToStackArg(memType, REG_RCX, i * TARGET_POINTER_SIZE);
-
 #ifdef DEBUG
                 numGCSlotsCopied++;
 #endif // DEBUG
diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp
index 1c24b93..ccda4f5 100644
--- a/src/jit/compiler.cpp
+++ b/src/jit/compiler.cpp
@@ -6839,6 +6839,29 @@ void Compiler::GetStructTypeOffset(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSIN
         *type1 = GetEightByteType(structDesc, 1);
     }
 }
+
+//------------------------------------------------------------------------------------------------------
+// GetStructTypeOffset: Gets the type and offset of the eightbytes of a struct for System V systems.
+//
+// Arguments:
+//    'typeHnd'    -  type handle of the struct; it must be passed in registers (asserted below).
+//    'type0'      -  out param; returns the type of the first eightbyte.
+//    'type1'      -  out param; returns the type of the second eightbyte.
+//    'offset0'    -  out param; returns the offset of the first eightbyte.
+//    'offset1'    -  out param; returns the offset of the second eightbyte.
+//
+void Compiler::GetStructTypeOffset(CORINFO_CLASS_HANDLE typeHnd,
+                                   var_types*           type0,
+                                   var_types*           type1,
+                                   unsigned __int8*     offset0,
+                                   unsigned __int8*     offset1)
+{
+    SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
+    eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc); // fill the descriptor for 'typeHnd'
+    assert(structDesc.passedInRegisters);
+    GetStructTypeOffset(structDesc, type0, type1, offset0, offset1); // forward to the descriptor-based overload
+}
+
 #endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
 
 /*****************************************************************************/
diff --git a/src/jit/compiler.h b/src/jit/compiler.h
index 5bff8dd..bf7aaac 100644
--- a/src/jit/compiler.h
+++ b/src/jit/compiler.h
@@ -9253,11 +9253,19 @@ public:
     static var_types GetTypeFromClassificationAndSizes(SystemVClassificationType classType, int size);
     static var_types GetEightByteType(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDesc,
                                       unsigned                                                   slotNum);
+
     static void GetStructTypeOffset(const SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR& structDesc,
                                     var_types*                                                 type0,
                                     var_types*                                                 type1,
                                     unsigned __int8*                                           offset0,
                                     unsigned __int8*                                           offset1);
+
+    void GetStructTypeOffset(CORINFO_CLASS_HANDLE typeHnd,
+                             var_types*           type0,
+                             var_types*           type1,
+                             unsigned __int8*     offset0,
+                             unsigned __int8*     offset1);
+
     void fgMorphSystemVStructArgs(GenTreeCall* call, bool hasStructArgument);
 #endif // defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
 
diff --git a/src/jit/lclvars.cpp b/src/jit/lclvars.cpp
index 4770a1d..e540c7f 100644
--- a/src/jit/lclvars.cpp
+++ b/src/jit/lclvars.cpp
@@ -6147,7 +6147,7 @@ void Compiler::lvaAlignFrame()
     // On AMD64-Unix, there are no such slots. There is a possibility to have calls in the method with frame size of 0.
     // The frame alignment logic won't kick in. This flags takes care of the AMD64-Unix case by remembering that there
     // are calls and making sure the frame alignment logic is executed.
-    bool stackNeedsAlignment = (compLclFrameSize != 0 || opts.compNeedToAlignFrame);
+    bool stackNeedsAlignment = (compLclFrameSize != 0 || opts.compNeedToAlignFrame || compIsProfilerHookNeeded());
 #else  // !UNIX_AMD64_ABI
     bool stackNeedsAlignment = compLclFrameSize != 0;
 #endif // !UNIX_AMD64_ABI
diff --git a/src/jit/target.h b/src/jit/target.h
index 9fa5e33..8330139 100644
--- a/src/jit/target.h
+++ b/src/jit/target.h
@@ -830,6 +830,13 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
   #define RBM_FLT_CALLEE_SAVED    (0)
   #define RBM_FLT_CALLEE_TRASH    (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM6|RBM_XMM7| \
                                    RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15)
+  #define REG_PROFILER_ENTER_ARG_0 REG_R14
+  #define RBM_PROFILER_ENTER_ARG_0 RBM_R14
+  #define REG_PROFILER_ENTER_ARG_1 REG_R15
+  #define RBM_PROFILER_ENTER_ARG_1 RBM_R15
+
+  #define REG_DEFAULT_PROFILER_CALL_TARGET REG_R11
+
 #else // !UNIX_AMD64_ABI
 #define MIN_ARG_AREA_FOR_CALL     (4 * REGSIZE_BYTES)       // Minimum required outgoing argument space for a call.
 
@@ -976,7 +983,7 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
   //    profiler.
   #define REG_DEFAULT_HELPER_CALL_TARGET    REG_RAX
 
-  // GenericPInvokeCalliHelper VASigCookie Parameter 
+  // GenericPInvokeCalliHelper VASigCookie Parameter
   #define REG_PINVOKE_COOKIE_PARAM          REG_R11
   #define RBM_PINVOKE_COOKIE_PARAM          RBM_R11
   #define PREDICT_REG_PINVOKE_COOKIE_PARAM  PREDICT_REG_R11
diff --git a/src/vm/CMakeLists.txt b/src/vm/CMakeLists.txt
index 835e31c..00c7d04 100644
--- a/src/vm/CMakeLists.txt
+++ b/src/vm/CMakeLists.txt
@@ -368,6 +368,7 @@ else(WIN32)
 
     if(CLR_CMAKE_TARGET_ARCH_AMD64)
         set(VM_SOURCES_WKS_ARCH_ASM
+            ${ARCH_SOURCES_DIR}/asmhelpers.S
             ${ARCH_SOURCES_DIR}/calldescrworkeramd64.S
             ${ARCH_SOURCES_DIR}/crthelpers.S
             ${ARCH_SOURCES_DIR}/externalmethodfixupthunk.S
diff --git a/src/vm/amd64/asmhelpers.S b/src/vm/amd64/asmhelpers.S
new file mode 100644
index 0000000..0f0ca07
--- /dev/null
+++ b/src/vm/amd64/asmhelpers.S
@@ -0,0 +1,289 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+.intel_syntax noprefix
+#include "unixasmmacros.inc"
+#include "asmconstants.h"
+
+#define real4 dword
+#define real8 qword
+
+//
+//    file: profiler.cpp
+//    typedef struct _PROFILE_PLATFORM_SPECIFIC_DATA
+//    {
+//        FunctionID *functionId; // function ID comes in the r11 register
+//        void       *rbp;
+//        void       *probersp;
+//        void       *ip;
+//        void       *profiledRsp;
+//        UINT64      rax;
+//        LPVOID      hiddenArg;
+//        UINT64      flt0;
+//        UINT64      flt1;
+//        UINT64      flt2;
+//        UINT64      flt3;
+// #if defined(UNIX_AMD64_ABI)
+//        UINT64      flt4;
+//        UINT64      flt5;
+//        UINT64      flt6;
+//        UINT64      flt7;
+//        UINT64      rdi;
+//        UINT64      rsi;
+//        UINT64      rdx;
+//        UINT64      rcx;
+//        UINT64      r8;
+//        UINT64      r9;
+// #endif
+//        UINT32      flags;
+//    } PROFILE_PLATFORM_SPECIFIC_DATA, *PPROFILE_PLATFORM_SPECIFIC_DATA;
+//
+.equ SIZEOF_PROFILE_PLATFORM_SPECIFIC_DATA, 0x8*21 + 0x4*2   // includes fudge to make FP_SPILL right
+.equ SIZEOF_FP_ARG_SPILL, 0x10*2
+
+.equ OFFSETOF_FP_ARG_SPILL_0, SIZEOF_PROFILE_PLATFORM_SPECIFIC_DATA
+.equ OFFSETOF_FP_ARG_SPILL_1, OFFSETOF_FP_ARG_SPILL_0 + 0x10
+
+.equ SIZEOF_STACK_FRAME, SIZEOF_PROFILE_PLATFORM_SPECIFIC_DATA + SIZEOF_FP_ARG_SPILL + 0x8
+
+.equ PROFILE_ENTER, 0x1
+.equ PROFILE_LEAVE, 0x2
+.equ PROFILE_TAILCALL, 0x4
+
+// ***********************************************************
+//   NOTE:
+//
+//   Register preservation scheme:
+//
+//       Preserved:
+//           - all non-volatile registers
+//           - rax, rdx
+//           - xmm0, xmm1
+//
+//       Not Preserved:
+//           - integer argument registers (rcx, rdx, r8, r9)
+//           - floating point argument registers (xmm1-3)
+//           - volatile integer registers (r10, r11)
+//           - volatile floating point registers (xmm4-5)
+//           - upper halves of ymm registers on AVX (which are volatile)
+//
+// ***********************************************************
+
+// EXTERN_C void ProfileEnterNaked(FunctionIDOrClientID functionIDOrClientID, size_t profiledRsp);
+// <NOTE>
+// Enter probe stub: fills a PROFILE_PLATFORM_SPECIFIC_DATA on the stack, calls ProfileEnter(clientInfo, &data), then restores the argument registers.
+// </NOTE>
+NESTED_ENTRY ProfileEnterNaked, _TEXT, NoHandler
+  //       Upon entry :
+  //           r14 = clientInfo
+  //           r15 = profiledRsp
+
+  push_nonvol_reg         rax
+
+  lea                     rax, [rsp + 0x10]    // caller rsp
+  mov                     r10, [rax - 0x8]     // return address
+
+  push_argument_register  rdx
+  alloc_stack             SIZEOF_STACK_FRAME
+
+  // correctness of return value in structure doesn't matter for enter probe
+
+  // setup ProfilePlatformSpecificData structure
+  xor                     r11, r11 // nullify r11
+  mov                     [rsp +  0x0], r11    // r11 is null     -- struct functionId field
+  save_reg_postrsp        rbp, 0x8             //                 -- struct rbp field
+  mov                     [rsp + 0x10], rax    // caller rsp      -- struct probeRsp field
+  mov                     [rsp + 0x18], r10    // return address  -- struct ip field
+  mov                     [rsp + 0x20], r15    //                 -- struct profiledRsp field
+  mov                     [rsp + 0x28], r11    // return value    -- struct rax field
+  mov                     [rsp + 0x30], r11    // r11 is null     -- struct hiddenArg field
+  movsd                   real8 ptr [rsp + 0x38], xmm0    //      -- struct flt0 field
+  movsd                   real8 ptr [rsp + 0x40], xmm1    //      -- struct flt1 field
+  movsd                   real8 ptr [rsp + 0x48], xmm2    //      -- struct flt2 field
+  movsd                   real8 ptr [rsp + 0x50], xmm3    //      -- struct flt3 field
+  movsd                   real8 ptr [rsp + 0x58], xmm4    //      -- struct flt4 field
+  movsd                   real8 ptr [rsp + 0x60], xmm5    //      -- struct flt5 field
+  movsd                   real8 ptr [rsp + 0x68], xmm6    //      -- struct flt6 field
+  movsd                   real8 ptr [rsp + 0x70], xmm7    //      -- struct flt7 field
+  mov                     [rsp + 0x78], rdi     //                -- struct rdi field
+  mov                     [rsp + 0x80], rsi     //                -- struct rsi field
+  mov                     [rsp + 0x88], rdx     //                -- struct rdx field
+  mov                     [rsp + 0x90], rcx     //                -- struct rcx field
+  mov                     [rsp + 0x98], r8      //                -- struct r8 field
+  mov                     [rsp + 0xa0], r9      //                -- struct r9 field
+  mov                     r10, 0x1 // PROFILE_ENTER
+  mov                     [rsp + 0xa8], r10d   //                -- struct flags field
+
+  // save the full 128 bits of xmm0/xmm1 so they can be restored after the call
+  save_xmm128_postrsp     xmm0, OFFSETOF_FP_ARG_SPILL_0
+  save_xmm128_postrsp     xmm1, OFFSETOF_FP_ARG_SPILL_1
+  END_PROLOGUE
+
+  // r14 holds the clientInfo; pass it as the first argument (rdi)
+  mov                     rdi, r14
+  lea                     rsi, [rsp + 0x0] // second argument: address of the structure built above
+  call                    C_FUNC(ProfileEnter)
+
+  // restore integer argument registers from the struct fields saved above
+  mov                     rdi, [rsp + 0x78]
+  mov                     rsi, [rsp + 0x80]
+  mov                     rdx, [rsp + 0x88]
+  mov                     rcx, [rsp + 0x90]
+  mov                     r8, [rsp + 0x98]
+  mov                     r9, [rsp + 0xa0]
+
+  // restore xmm0/xmm1 from the 128-bit spill slots
+  movdqa                  xmm0, [rsp + OFFSETOF_FP_ARG_SPILL_0]
+  movdqa                  xmm1, [rsp + OFFSETOF_FP_ARG_SPILL_1]
+
+  // begin epilogue
+  free_stack              SIZEOF_STACK_FRAME
+  pop_argument_register   rdx
+
+  pop_nonvol_reg          rax
+
+  ret
+NESTED_END ProfileEnterNaked, _TEXT
+
+// EXTERN_C void ProfileLeaveNaked(FunctionIDOrClientID functionIDOrClientID, size_t profiledRsp);
+// <NOTE>
+// Leave probe stub: fills a ProfilePlatformSpecificData block on the stack, calls ProfileLeave(clientInfo, &block), then restores rax, rdx, xmm0 and xmm1 before returning.
+// </NOTE>
+NESTED_ENTRY ProfileLeaveNaked, _TEXT, NoHandler
+//       Upon entry :
+//           rdi = clientInfo
+//           rsi = profiledRsp
+
+  push_nonvol_reg         rbx
+
+  lea                     rbx, [rsp + 0x10]    // caller rsp: skips the rbx slot just pushed and the return address
+  mov                     r10, [rbx - 0x8]     // return address (stored right below caller rsp)
+
+  // rdx should be saved here because it can be used for returning struct values
+  push_argument_register  rdx
+  alloc_stack             SIZEOF_STACK_FRAME
+
+  // argument register fields are only zero-filled below: their contents do not matter for the leave probe
+
+  // setup ProfilePlatformSpecificData structure, starting at [rsp + 0x0]
+  xor                     r11, r11  // nullify r11 (source of every zeroed field)
+  mov                     [rsp +  0x0], r11    // r11 is null     -- struct functionId field
+  save_reg_postrsp        rbp, 0x8             //                 -- struct rbp field
+  mov                     [rsp + 0x10], rbx    // caller rsp      -- struct probeRsp field
+  mov                     [rsp + 0x18], r10    // return address  -- struct ip field
+  mov                     [rsp + 0x20], rsi    //                 -- struct profiledRsp field
+  mov                     [rsp + 0x28], rax    // return value    -- struct rax field
+  mov                     [rsp + 0x30], r11    // r11 is null     -- struct hiddenArg field
+  movsd                   real8 ptr [rsp + 0x38], xmm0    //      -- struct flt0 field
+  movsd                   real8 ptr [rsp + 0x40], xmm1    //      -- struct flt1 field
+  movsd                   real8 ptr [rsp + 0x48], xmm2    //      -- struct flt2 field
+  movsd                   real8 ptr [rsp + 0x50], xmm3    //      -- struct flt3 field
+  movsd                   real8 ptr [rsp + 0x58], xmm4    //      -- struct flt4 field
+  movsd                   real8 ptr [rsp + 0x60], xmm5    //      -- struct flt5 field
+  movsd                   real8 ptr [rsp + 0x68], xmm6    //      -- struct flt6 field
+  movsd                   real8 ptr [rsp + 0x70], xmm7    //      -- struct flt7 field
+  mov                     [rsp + 0x78], r11     //                -- struct rdi field
+  mov                     [rsp + 0x80], r11     //                -- struct rsi field
+  mov                     [rsp + 0x88], r11     //                -- struct rdx field
+  mov                     [rsp + 0x90], r11     //                -- struct rcx field
+  mov                     [rsp + 0x98], r11     //                -- struct r8 field
+  mov                     [rsp + 0xa0], r11    //                -- struct r9 field
+  mov                     r10, 0x2  // PROFILE_LEAVE
+  mov                     [rsp + 0xa8], r10d   // flags (32-bit)  -- struct flags field
+
+  // spill xmm0/xmm1 to their own slots so the fp return registers can be restored after the call
+  save_xmm128_postrsp     xmm0, OFFSETOF_FP_ARG_SPILL_0
+  save_xmm128_postrsp     xmm1, OFFSETOF_FP_ARG_SPILL_1
+  END_PROLOGUE
+
+  // rdi still holds the clientInfo from entry; rsi = address of the structure built above
+  lea                     rsi, [rsp + 0x0]
+  call                    C_FUNC(ProfileLeave)
+
+  // restore fp return registers from their spill slots
+  movdqa                  xmm0, [rsp + OFFSETOF_FP_ARG_SPILL_0]
+  movdqa                  xmm1, [rsp + OFFSETOF_FP_ARG_SPILL_1]
+
+  // restore int return register from the struct rax field saved above
+  mov                     rax, [rsp + 0x28]
+
+  // begin epilogue: undo the prologue in reverse order
+  free_stack              SIZEOF_STACK_FRAME
+  pop_argument_register   rdx
+
+  pop_nonvol_reg          rbx
+
+  ret
+NESTED_END ProfileLeaveNaked, _TEXT
+
+// EXTERN_C void ProfileTailcallNaked(FunctionIDOrClientID functionIDOrClientID, size_t profiledRsp);
+// <NOTE>
+// Tailcall probe stub: fills a ProfilePlatformSpecificData block on the stack, calls ProfileTailcall(clientInfo, &block), then restores rax, rdx, xmm0 and xmm1 before returning.
+// </NOTE>
+NESTED_ENTRY ProfileTailcallNaked, _TEXT, NoHandler
+//       Upon entry :
+//           rdi = clientInfo
+//           rsi = profiledRsp
+
+  push_nonvol_reg         rbx
+
+  lea                     rbx, [rsp + 0x10]    // caller rsp: skips the rbx slot just pushed and the return address
+  mov                     r10, [rbx - 0x8]     // return address (stored right below caller rsp)
+
+  // rdx should be saved here because it can be used for returning struct values
+  push_argument_register  rdx
+  alloc_stack             SIZEOF_STACK_FRAME
+
+  // argument register fields are only zero-filled below: their contents do not matter for the tailcall probe
+
+  // setup ProfilePlatformSpecificData structure, starting at [rsp + 0x0]
+  xor                     r11, r11  // nullify r11 (source of every zeroed field)
+  mov                     [rsp +  0x0], r11    // r11 is null     -- struct functionId field
+  save_reg_postrsp        rbp, 0x8             //                 -- struct rbp field
+  mov                     [rsp + 0x10], rbx    // caller rsp      -- struct probeRsp field
+  mov                     [rsp + 0x18], r10    // return address  -- struct ip field
+  mov                     [rsp + 0x20], rsi    //                 -- struct profiledRsp field
+  mov                     [rsp + 0x28], rax    // return value    -- struct rax field
+  mov                     [rsp + 0x30], r11    // r11 is null     -- struct hiddenArg field
+  movsd                   real8 ptr [rsp + 0x38], xmm0    //      -- struct flt0 field
+  movsd                   real8 ptr [rsp + 0x40], xmm1    //      -- struct flt1 field
+  movsd                   real8 ptr [rsp + 0x48], xmm2    //      -- struct flt2 field
+  movsd                   real8 ptr [rsp + 0x50], xmm3    //      -- struct flt3 field
+  movsd                   real8 ptr [rsp + 0x58], xmm4    //      -- struct flt4 field
+  movsd                   real8 ptr [rsp + 0x60], xmm5    //      -- struct flt5 field
+  movsd                   real8 ptr [rsp + 0x68], xmm6    //      -- struct flt6 field
+  movsd                   real8 ptr [rsp + 0x70], xmm7    //      -- struct flt7 field
+  mov                     [rsp + 0x78], r11     //                -- struct rdi field
+  mov                     [rsp + 0x80], r11     //                -- struct rsi field
+  mov                     [rsp + 0x88], r11     //                -- struct rdx field
+  mov                     [rsp + 0x90], r11     //                -- struct rcx field
+  mov                     [rsp + 0x98], r11     //                -- struct r8 field
+  mov                     [rsp + 0xa0], r11     //                -- struct r9 field
+  mov                     r10, 0x4  // PROFILE_TAILCALL (0x2 is PROFILE_LEAVE and must not be reused here)
+  mov                     [rsp + 0xa8], r10d   // flags (32-bit)  -- struct flags field
+
+  // spill xmm0/xmm1 to their own slots so the fp return registers can be restored after the call
+  save_xmm128_postrsp     xmm0, OFFSETOF_FP_ARG_SPILL_0
+  save_xmm128_postrsp     xmm1, OFFSETOF_FP_ARG_SPILL_1
+  END_PROLOGUE
+
+  // rdi still holds the clientInfo from entry; rsi = address of the structure built above
+  lea                     rsi, [rsp + 0x0]
+  call                    C_FUNC(ProfileTailcall)
+
+  // restore fp return registers from their spill slots
+  movdqa                  xmm0, [rsp + OFFSETOF_FP_ARG_SPILL_0]
+  movdqa                  xmm1, [rsp + OFFSETOF_FP_ARG_SPILL_1]
+
+  // restore int return register from the struct rax field saved above
+  mov                     rax, [rsp + 0x28]
+
+  // begin epilogue: undo the prologue in reverse order
+  free_stack              SIZEOF_STACK_FRAME
+  pop_argument_register   rdx
+
+  pop_nonvol_reg          rbx
+
+  ret
+NESTED_END ProfileTailcallNaked, _TEXT
diff --git a/src/vm/amd64/profiler.cpp b/src/vm/amd64/profiler.cpp
index e88cbba..a5563e4 100644
--- a/src/vm/amd64/profiler.cpp
+++ b/src/vm/amd64/profiler.cpp
@@ -36,6 +36,18 @@ typedef struct _PROFILE_PLATFORM_SPECIFIC_DATA
     UINT64      flt1;
     UINT64      flt2;
     UINT64      flt3;
+#if defined(UNIX_AMD64_ABI)
+    UINT64      flt4;
+    UINT64      flt5;
+    UINT64      flt6;
+    UINT64      flt7;
+    UINT64      rdi;
+    UINT64      rsi;
+    UINT64      rdx;
+    UINT64      rcx;
+    UINT64      r8;
+    UINT64      r9;
+#endif
     UINT32      flags;
 } PROFILE_PLATFORM_SPECIFIC_DATA, *PPROFILE_PLATFORM_SPECIFIC_DATA;
 
diff --git a/src/vm/amd64/unixstubs.cpp b/src/vm/amd64/unixstubs.cpp
index 76d3cf1..83764e0 100644
--- a/src/vm/amd64/unixstubs.cpp
+++ b/src/vm/amd64/unixstubs.cpp
@@ -11,21 +11,6 @@ extern "C"
         PORTABILITY_ASSERT("Implement for PAL");
     }
 
-    void ProfileEnterNaked(FunctionIDOrClientID functionIDOrClientID)    
-    {
-        PORTABILITY_ASSERT("Implement for PAL");
-    }
-
-    void ProfileLeaveNaked(FunctionIDOrClientID functionIDOrClientID)
-    {
-        PORTABILITY_ASSERT("Implement for PAL");
-    }
-
-    void ProfileTailcallNaked(FunctionIDOrClientID functionIDOrClientID)
-    {
-        PORTABILITY_ASSERT("Implement for PAL");
-    }
-
     DWORD getcpuid(DWORD arg, unsigned char result[16])
     {
         DWORD eax;
-- 
1.9.1